# Importing

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Load the competition training set from the Kaggle input directory
df = pd.read_csv('/kaggle/input/playground-series-s3e14/train.csv')

In [None]:
# Remove the 'id' column from the training frame
df = df.drop(columns=['id'])

In [None]:
# (rows, columns) of the training frame
df.shape

In [None]:
# Preview the first rows of the training frame
df.head()

# Exploratory Data Analysis

In [None]:
# Visualizing missing values, if any, with a missingno matrix plot of the frame
msno.matrix(df)

In [None]:
# Fraction of missing values per column (null count divided by the row count)
df.isnull().sum() / df.shape[0]

**<span style='color:red'>No missing values!</span>**

In [None]:
# Ten-bin histogram for every column of the frame
df.hist(figsize=(20, 15), bins=10)
plt.show()

In [None]:
# Scatter plot of every feature against the target, three panels per row.
target = 'yield'
feature_cols = [col for col in df.columns if col != target]
num_cols = len(feature_cols)
num_rows = (num_cols - 1) // 3 + 1
fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 15))
axs = axs.flatten()

# Pair each feature with the next free panel instead of indexing the axes by
# the column's position in df.columns; the layout then stays correct wherever
# the target column sits in the frame.
for ax, col in zip(axs, feature_cols):
    ax.scatter(df[col], df[target], s=10)
    ax.set_xlabel(col)
    ax.set_ylabel(target)

# Hide the trailing panels of the grid that have no feature to show.
for ax in axs[num_cols:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


**<span style='color:red'>All features are quantitative and not very skewed. I will quantile-transform these features (except the target, yield), which with the transformer's default uniform output also scales them to the range between 0 and 1, for efficient training of models.</span>**

In [None]:
# Descriptive statistics of the training frame
df.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
plt.figure(figsize=(18, 15))
sns.heatmap(data=df.corr(), cmap='coolwarm', annot=True)
plt.show()

**<span style='color:red'>The features 'clonesize', 'RainingDays', and 'AverageRainingDays' exhibit a high negative correlation with the target variable 'yield', while 'honeybee' has a slightly weaker negative correlation.</span>**

**<span style='color:red'>On the other hand, 'fruitset', 'fruitmass', and 'seeds' are very highly positively correlated with 'yield', while 'bumbles' and 'osmia' have moderate positive correlations.</span>**

# Transforming data

In [None]:
# Creating independent features df
X = df.loc[:, 'clonesize':'seeds']
X.head()

In [None]:
# (rows, columns) of the feature frame
X.shape

In [None]:
# Separating the target column
y = df['yield']
y.head()

In [None]:
# Hold out 20% of the rows for evaluation; the split is seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Creating base models & selecting

**The competition will evaluate submissions based on Mean Absolute Error (MAE). So, I'll use the same for evaluating my models.**

In [None]:
# Linear Regression baseline: quantile-transform the features, then fit a
# linear model.
# The transformer is seeded because it can subsample the training rows when
# estimating quantiles; without a seed the reported MAE may change between runs.
LinearRegression_pipeline = Pipeline([
    ('quantile_transformer', QuantileTransformer(random_state=42)),
    ('linear_regression', LinearRegression())
])

# training the pipeline on the training set
LinearRegression_pipeline.fit(X_train, y_train)

# predicting on the held-out set
y_pred = LinearRegression_pipeline.predict(X_test)

# evaluating the performance using MAE (the competition metric)
mae = mean_absolute_error(y_test, y_pred)

print("LinearRegression MAE:", mae)

In [None]:
# XGBRegressor baseline with default tree hyperparameters.
# NOTE(review): 'gpu_hist' and 'gpu_id' need a GPU runtime, and recent XGBoost
# releases deprecate them in favour of device='cuda' — confirm against the
# installed version.
params = dict(
    objective='reg:squarederror',  # squared-error regression objective
    eval_metric='mae',             # evaluation metric
    tree_method='gpu_hist',        # build trees on the GPU
    gpu_id=0,                      # GPU device to use
)

# build and train the regressor
xgb_model = xgb.XGBRegressor(**params)
xgb_model.fit(X_train, y_train)

# score the held-out set with MAE
y_pred = xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("XGBRegressor MAE:", mae)

In [None]:
# Decision Tree baseline, trained on the untransformed features (seeded)
tree = DecisionTreeRegressor(random_state=42)
tree.fit(X_train, y_train)

# Score the held-out set with mean absolute error
y_pred = tree.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("DecisionTreeRegressor MAE:", mae)

In [None]:
# Random Forest baseline: quantile-transform the features, then fit a forest.
# Both steps draw random numbers (quantile subsampling, bootstrap samples and
# feature selection), so both are seeded to make the reported MAE reproducible.
RandomForestRegressor_pipeline = make_pipeline(
    QuantileTransformer(random_state=42),
    RandomForestRegressor(random_state=42)
)

# Fit the pipeline on the training data
RandomForestRegressor_pipeline.fit(X_train, y_train)

# Predict on the held-out data
y_pred = RandomForestRegressor_pipeline.predict(X_test)

# Evaluate the model using Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print("RandomForestRegressor MAE:", mae)

In [None]:
# Support Vector Regression (SVR) baseline on quantile-transformed features.
# The transformer is seeded because it can subsample the training rows when
# estimating quantiles; without a seed the reported MAE may change between runs.
SVR_pipeline = Pipeline([
    ('quantile_transformer', QuantileTransformer(random_state=42)),
    ('regressor', SVR())
])

# Fit the pipeline on the training data
SVR_pipeline.fit(X_train, y_train)

# Predict on the held-out data
y_pred = SVR_pipeline.predict(X_test)

# Evaluate the model using Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print("SVR MAE:", mae)

In [None]:
# Neural network (MLP) baseline on quantile-transformed features.
# Weight initialisation and batch shuffling in the MLP, and quantile
# subsampling in the transformer, are random; both steps are seeded so the
# reported MAE is reproducible.
NN_pipeline = Pipeline([
    ('transformer', QuantileTransformer(random_state=42)),
    ('estimator', MLPRegressor(learning_rate_init=0.06, random_state=42))
])

# Fitting the pipeline on the training data
NN_pipeline.fit(X_train, y_train)

# Predict on the held-out data
y_pred = NN_pipeline.predict(X_test)

# Evaluate the model using Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print("Neural Network MAE:", mae)

**<span style='color:red'>The XGBRegressor and RandomForestRegressor base models have the best MAE scores of all the models created. So, we'll use these 2 models and perform hyperparameter tuning to find the best model.</span>**

# Hyperparameter tuning XGBRegressor

In [None]:
# NOTE: this randomized-search cell is kept disabled (fully commented out).
# The search space is named `hyperparameter_grid` so that it matches the name
# passed to RandomizedSearchCV below and is not overwritten by the fixed
# XGBoost `params` dict that follows it.

# hyperparameter_grid = {
#     'n_estimators': [150, 175, 200, 225, 250],
#     'max_depth': [3, 4, 5, 6, 7],
#     'learning_rate': [0.01, 0.025, 0.05, 0.1, 0.2],
#     'min_child_weight': [1, 2, 3, 4, 5],
#     'subsample' : [0.6, 0.62, 0.64, 0.66, 0.68]
# }

# # specifying the fixed parameters for XGBoost
# params = {
#     'eval_metric': 'mae',  # specify the evaluation metric
#     'tree_method': 'gpu_hist',  # use GPU to build trees
#     'gpu_id': 0  # specify the GPU device to use
# }

# # creating an XGBoost regressor
# xgb_regressor = xgb.XGBRegressor(**params)

# random_cv = RandomizedSearchCV(estimator=xgb_regressor,
#             param_distributions=hyperparameter_grid,
#             cv=5, n_iter=20,
#             scoring = 'neg_mean_absolute_error',n_jobs = 4,
#             verbose = 2, 
#             return_train_score = True,
#             random_state=42)

# random_cv.fit(X_train,y_train)

# random_cv.best_estimator_

**Performing manual hyperparameter tuning**

In [None]:
# Manually tuned XGBoost configuration.
# NOTE(review): 'gpu_hist' and 'gpu_predictor' need a GPU runtime, and recent
# XGBoost releases deprecate them in favour of device='cuda' — confirm against
# the installed version.
params = dict(
    max_bin=230,
    eval_metric='mae',          # evaluation metric
    tree_method='gpu_hist',     # build trees on the GPU
    predictor='gpu_predictor',
    n_estimators=179,
    max_depth=5,
    learning_rate=0.03811,
    min_child_weight=5,
    subsample=0.623,
    num_parallel_tree=1,
)

# build and train the tuned regressor
xgb_model = xgb.XGBRegressor(**params)
xgb_model.fit(X_train, y_train)

# score the held-out set with MAE
y_pred = xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("XGBRegressor MAE:", mae)

# Predicting test data

In [None]:
# Load the competition test set and remove its 'id' column in one step
test_df = (
    pd.read_csv('/kaggle/input/playground-series-s3e14/test.csv')
    .drop(columns=['id'])
)

test_df.head()

In [None]:
# (rows, columns) of the test frame
test_df.shape

In [None]:
# Predict the target for the test rows with the current `xgb_model`.
# Assumes test_df carries the same feature columns, in the same order, as X — TODO confirm
test_predictions = xgb_model.predict(test_df)
test_predictions.shape

In [None]:
# Display the predicted values
test_predictions

# Submission

In [None]:
# Load the sample submission file, used here as the template for the output
submission = pd.read_csv('/kaggle/input/playground-series-s3e14/sample_submission.csv')
submission.head()

In [None]:
# Overwrite the 'yield' column with the model predictions.
# Assumes the sample submission rows are in the same order as test.csv — TODO confirm
submission['yield'] = test_predictions
submission.head()

In [None]:
# Saving the submission to the working directory, without the index column
submission.to_csv('submission.csv', index=False)