In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

In [2]:
all_data = pd.read_csv("../processing_2/data_all.csv", parse_dates=['Date'])

In [3]:
all_data.columns

Index(['Campagne', 'Region', 'Site', 'Famille', 'Variete', 'Num Parcelle',
       'CodeTracabilite', 'Date', 'Tonnage', 'is_train', 'Recolte',
       'irrigation_cumulee_1', 'irrigation_cumulee_2', 'irrigation_cumulee_3',
       'irrigation_cumulee_4', 'irrigation_cumulee_5', 'irrigation_cumulee_6',
       'irrigation_cumulee_7', 'irrigation_cumulee_8', 'irrigation_cumulee_9',
       'irrigation_cumulee_10', 'irrigation_cumulee_11',
       'irrigation_cumulee_12', 'N_cumulee_1', 'P_cumulee_1', 'K_cumulee_1',
       'N_cumulee_2', 'P_cumulee_2', 'K_cumulee_2', 'N_cumulee_3',
       'P_cumulee_3', 'K_cumulee_3', 'N_cumulee_4', 'P_cumulee_4',
       'K_cumulee_4', 'N_cumulee_5', 'P_cumulee_5', 'K_cumulee_5',
       'N_cumulee_6', 'P_cumulee_6', 'K_cumulee_6', 'N_cumulee_7',
       'P_cumulee_7', 'K_cumulee_7', 'N_cumulee_8', 'P_cumulee_8',
       'K_cumulee_8', 'N_cumulee_9', 'P_cumulee_9', 'K_cumulee_9',
       'N_cumulee_10', 'P_cumulee_10', 'K_cumulee_10', 'N_cumulee_11',
       'P_cum

In [4]:
all_data.shape

(3943, 69)

In [5]:
# Tonnage to float
all_data['Tonnage'] = all_data['Tonnage'].str.replace(',', '.').astype(float)

# Split the date
all_data['Year'] = all_data['Date'].dt.year
all_data['Month'] = all_data['Date'].dt.month
all_data['Day'] = all_data['Date'].dt.day


duplicates = all_data.duplicated(['CodeTracabilite', 'Num Parcelle', 'Campagne', 'Region', 'Site', 'Famille', 'Variete', 'Recolte'], keep=False)

# Then, apply the condition
condition = (all_data['Sup Debut Camp'] + all_data['Sup Plantee'] != all_data['Sup Arrachee'] + all_data['Sup Fin Camp'])

# Combine both conditions
to_drop = duplicates & condition

# Drop the rows that satisfy both condgitions
all_data = all_data[~to_drop]

In [6]:
for month_index in range(12):
    all_data[f'irrigation_cumulee_{month_index + 1}'] = all_data[f'irrigation_cumulee_{month_index + 1}'].replace(0, np.nan)

In [7]:
all_data.shape

(3903, 72)

In [8]:
all_data.columns

Index(['Campagne', 'Region', 'Site', 'Famille', 'Variete', 'Num Parcelle',
       'CodeTracabilite', 'Date', 'Tonnage', 'is_train', 'Recolte',
       'irrigation_cumulee_1', 'irrigation_cumulee_2', 'irrigation_cumulee_3',
       'irrigation_cumulee_4', 'irrigation_cumulee_5', 'irrigation_cumulee_6',
       'irrigation_cumulee_7', 'irrigation_cumulee_8', 'irrigation_cumulee_9',
       'irrigation_cumulee_10', 'irrigation_cumulee_11',
       'irrigation_cumulee_12', 'N_cumulee_1', 'P_cumulee_1', 'K_cumulee_1',
       'N_cumulee_2', 'P_cumulee_2', 'K_cumulee_2', 'N_cumulee_3',
       'P_cumulee_3', 'K_cumulee_3', 'N_cumulee_4', 'P_cumulee_4',
       'K_cumulee_4', 'N_cumulee_5', 'P_cumulee_5', 'K_cumulee_5',
       'N_cumulee_6', 'P_cumulee_6', 'K_cumulee_6', 'N_cumulee_7',
       'P_cumulee_7', 'K_cumulee_7', 'N_cumulee_8', 'P_cumulee_8',
       'K_cumulee_8', 'N_cumulee_9', 'P_cumulee_9', 'K_cumulee_9',
       'N_cumulee_10', 'P_cumulee_10', 'K_cumulee_10', 'N_cumulee_11',
       'P_cum

In [9]:
# all_data['ndmi_missingness'] = all_data['ndmi'].apply(
#     lambda x: 0 if x != 0 else 1)

In [10]:
all_data['day'] = all_data['Date'].apply(lambda x: x.day)
all_data['month'] = all_data['Date'].apply(lambda x: x.month)
all_data['year'] = all_data['Date'].apply(lambda x: x.year)

Processing 

In [11]:
from sklearn.preprocessing import OneHotEncoder


cat_cols = ['Region', 'Site', 'Variete', 'Porte Greffe']
cat_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [12]:
from sklearn.impute import SimpleImputer


cols_to_impute_mean = ['Days to Arrachage', 'Tree Age'] + ['Sup Debut Camp']
mean_imputer = SimpleImputer(strategy='mean', add_indicator=True)

In [13]:
irrigation_columns = [f'irrigation_cumulee_{i}' for i in range(1, 13)]

# Calculate the median for these columns in each row
row_mean = all_data[irrigation_columns].mean(axis=1)

# Replace NaN values in each column with the row median
for col in irrigation_columns:
    all_data[col] = all_data[col].fillna(row_mean)

In [14]:
# Identify the columns to impute with mean
cols_to_impute_mean = ['Days to Arrachage', 'Tree Age', 'Sup Debut Camp'] + irrigation_columns

# Initialize the mean imputer with an option to add indicators for imputed values
mean_imputer = SimpleImputer(strategy='mean', add_indicator=True)

# Apply the imputer to the specified columns
imputed_data = mean_imputer.fit_transform(all_data[cols_to_impute_mean])

# Since the imputer also adds indicators for imputation, 
# update the column names to include indicator columns
imputed_cols = cols_to_impute_mean + [col + '_imputed' for col in cols_to_impute_mean]

# Update the dataframe with the imputed data
all_data[imputed_cols] = imputed_data

# Optionally, you may want to update the original columns with the imputed values
# and keep the indicators as separate columns
for i, col in enumerate(cols_to_impute_mean):
    all_data[col] = imputed_data[:, i]
    all_data[col + '_imputed'] = imputed_data[:, i + len(cols_to_impute_mean)]


In [15]:
all_data.columns

Index(['Campagne', 'Region', 'Site', 'Famille', 'Variete', 'Num Parcelle',
       'CodeTracabilite', 'Date', 'Tonnage', 'is_train', 'Recolte',
       'irrigation_cumulee_1', 'irrigation_cumulee_2', 'irrigation_cumulee_3',
       'irrigation_cumulee_4', 'irrigation_cumulee_5', 'irrigation_cumulee_6',
       'irrigation_cumulee_7', 'irrigation_cumulee_8', 'irrigation_cumulee_9',
       'irrigation_cumulee_10', 'irrigation_cumulee_11',
       'irrigation_cumulee_12', 'N_cumulee_1', 'P_cumulee_1', 'K_cumulee_1',
       'N_cumulee_2', 'P_cumulee_2', 'K_cumulee_2', 'N_cumulee_3',
       'P_cumulee_3', 'K_cumulee_3', 'N_cumulee_4', 'P_cumulee_4',
       'K_cumulee_4', 'N_cumulee_5', 'P_cumulee_5', 'K_cumulee_5',
       'N_cumulee_6', 'P_cumulee_6', 'K_cumulee_6', 'N_cumulee_7',
       'P_cumulee_7', 'K_cumulee_7', 'N_cumulee_8', 'P_cumulee_8',
       'K_cumulee_8', 'N_cumulee_9', 'P_cumulee_9', 'K_cumulee_9',
       'N_cumulee_10', 'P_cumulee_10', 'K_cumulee_10', 'N_cumulee_11',
       'P_cum

In [16]:
# Import necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define the categorical columns to be transformed
# cat_cols = ['Region', 'Site', 'Variete', 'Porte Greffe']
cat_cols = ['Variete', 'Porte Greffe']

# Create a transformer for categorical features
cat_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Create a column transformer to apply the transformation only to the specified categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_cols)
    ],
    remainder='passthrough'  # this will pass through other columns not listed in transformers
)

# Apply the transformations
all_data_transformed = preprocessor.fit_transform(all_data)

# The output will be a NumPy array. Convert it back to a dataframe if necessary
# Get feature names after one-hot encoding
new_cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(cat_cols)
non_cat_cols = all_data.drop(columns=cat_cols).columns
all_columns = list(new_cat_features) + list(non_cat_cols)

# Create a new dataframe with transformed features
all_data = pd.DataFrame(all_data_transformed, columns=all_columns)

# Check the transformed dataframe
all_data.columns



Index(['Variete_AF1', 'Variete_AF2', 'Variete_AF3', 'Variete_CLA1',
       'Variete_CLA10', 'Variete_CLA11', 'Variete_CLA12', 'Variete_CLA13',
       'Variete_CLA14', 'Variete_CLA15',
       ...
       'irrigation_cumulee_3_imputed', 'irrigation_cumulee_4_imputed',
       'irrigation_cumulee_5_imputed', 'irrigation_cumulee_6_imputed',
       'irrigation_cumulee_7_imputed', 'irrigation_cumulee_8_imputed',
       'irrigation_cumulee_9_imputed', 'irrigation_cumulee_10_imputed',
       'irrigation_cumulee_11_imputed', 'irrigation_cumulee_12_imputed'],
      dtype='object', length=153)

In [17]:
num_cols = ['Sup Debut Camp', 'Tree Age', 'Days to Arrachage', 'Recolte']

imputation_indicator = [col + '_imputed' for col in cols_to_impute_mean]

monthly_params = ['N_cumulee', 'P_cumulee', 'K_cumulee', 'irrigation_cumulee']

monthly_variables = [
    f'{param}_{i}' for param in monthly_params for i in range(1, 13)]

time_variables = ['year', 'month']


# CHANGE HERE

predictors_names = monthly_variables + num_cols + imputation_indicator + list(new_cat_features)
target_name = ['Tonnage']

In [18]:
print(predictors_names)

['N_cumulee_1', 'N_cumulee_2', 'N_cumulee_3', 'N_cumulee_4', 'N_cumulee_5', 'N_cumulee_6', 'N_cumulee_7', 'N_cumulee_8', 'N_cumulee_9', 'N_cumulee_10', 'N_cumulee_11', 'N_cumulee_12', 'P_cumulee_1', 'P_cumulee_2', 'P_cumulee_3', 'P_cumulee_4', 'P_cumulee_5', 'P_cumulee_6', 'P_cumulee_7', 'P_cumulee_8', 'P_cumulee_9', 'P_cumulee_10', 'P_cumulee_11', 'P_cumulee_12', 'K_cumulee_1', 'K_cumulee_2', 'K_cumulee_3', 'K_cumulee_4', 'K_cumulee_5', 'K_cumulee_6', 'K_cumulee_7', 'K_cumulee_8', 'K_cumulee_9', 'K_cumulee_10', 'K_cumulee_11', 'K_cumulee_12', 'irrigation_cumulee_1', 'irrigation_cumulee_2', 'irrigation_cumulee_3', 'irrigation_cumulee_4', 'irrigation_cumulee_5', 'irrigation_cumulee_6', 'irrigation_cumulee_7', 'irrigation_cumulee_8', 'irrigation_cumulee_9', 'irrigation_cumulee_10', 'irrigation_cumulee_11', 'irrigation_cumulee_12', 'Sup Debut Camp', 'Tree Age', 'Days to Arrachage', 'Recolte', 'Days to Arrachage_imputed', 'Tree Age_imputed', 'Sup Debut Camp_imputed', 'irrigation_cumulee_1_

### Split and train model

In [19]:
# Import necessary libraries
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold

In [20]:
# non_zero_na_sums = all_data[predictors_names].isna().sum()
# non_zero_na_sums = non_zero_na_sums[non_zero_na_sums != 0]
# non_zero_na_sums

In [21]:
train = all_data[all_data['is_train'] == True]
test = all_data[all_data['is_train'] == False]

In [22]:
train = train.sample(frac=1, random_state=42)  # random_state for reproducibility
X, y = train[predictors_names].values, train[target_name].values

hyperparams tuning

In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a RandomForest Regressor
selected_model = RandomForestRegressor(random_state=42)

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=selected_model, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2, scoring='neg_root_mean_squared_error')

# Fit the grid search to the data
grid_search.fit(X, y)

# Best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Best estimator
best_rf = grid_search.best_estimator_

# Predictions and evaluation (optional)
# predictions = best_rf.predict(X_test)
# Print the best hyperparameters and corresponding R² score
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best RMSE Score: ", grid_search.best_score_)


Fitting 3 folds for each of 324 candidates, totalling 972 fits


324 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
146 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\DataScience\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\DataScience\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\DataScience\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\DataScience\AppData\Local\Pro

Best parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}
Best Hyperparameters:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}
Best RMSE Score:  -36091.1114263124


In [26]:
best_params = {'learning_rate': 0.05, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}
# Create Extra Trees model using the best parameters
best_model = LGBMRegressor(n_estimators=best_params['n_estimators'],
                                             max_depth=best_params['max_depth'],
                                             max_features=best_params['max_features'],
                                             min_samples_leaf=best_params['min_samples_leaf'],
                                             min_samples_split=best_params['min_samples_split'],
                                             learning_rate=best_params['learning_rate'])

In [27]:
k = 4
kf = KFold(n_splits=k)
final_results = {}

train_rmse_scores = []
train_r2_scores = []
test_rmse_scores = []
test_r2_scores = []

for train_index, test_index in kf.split(X):
    x_train_fold, x_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]

    best_model.fit(x_train_fold, y_train_fold)
    y_pred_train = best_model.predict(x_train_fold)
    y_pred_test = best_model.predict(x_test_fold)

    r2_train = r2_score(y_train_fold, y_pred_train)
    rmse_train = mean_squared_error(
        y_train_fold, y_pred_train, squared=False)
    r2_test = r2_score(y_test_fold, y_pred_test)
    rmse_test = mean_squared_error(y_test_fold, y_pred_test, squared=False)

    train_rmse_scores.append(rmse_train)
    train_r2_scores.append(r2_train)
    test_rmse_scores.append(rmse_test)
    test_r2_scores.append(r2_test)

avg_train_rmse = sum(train_rmse_scores) / k
avg_train_r2 = sum(train_r2_scores) / k
avg_test_rmse = sum(test_rmse_scores) / k
avg_test_r2 = sum(test_r2_scores) / k


final_results["metrics"] = {
    "RMSE train": avg_train_rmse,
    "RMSE test": avg_test_rmse,
    "R² train": avg_train_r2,
    "R² test": avg_test_r2
}

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002725 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12658
[LightGBM] [Info] Number of data points in the train set: 2341, number of used features: 126
[LightGBM] [Info] Start training from score 20375.566419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003233 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12635
[LightGBM] [Info] Number of data points in the train set: 2341, number of used features: 126
[LightGBM] [Info] Start training from score 21465.653424


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12622
[LightGBM] [Info] Number of data points in the train set: 2342, number of used features: 126
[LightGBM] [Info] Start training from score 20376.407872


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003982 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12687
[LightGBM] [Info] Number of data points in the train set: 2342, number of used features: 128
[LightGBM] [Info] Start training from score 20929.019016


  y = column_or_1d(y, warn=True)




In [28]:
best_model.fit(X, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002656 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13036
[LightGBM] [Info] Number of data points in the train set: 3122, number of used features: 128
[LightGBM] [Info] Start training from score 20786.633080


  y = column_or_1d(y, warn=True)


In [29]:
X_sub = test[predictors_names].values

y_sub = best_model.predict(X_sub)



export

In [30]:
y_sub.shape

(781,)

In [31]:
y_pred = np.array(y_sub)  # Convert to numpy array if not already
assert len(y_pred) == 781, "y_pred must have exactly 781 rows"

# Create a DataFrame. Adjust column names as per the competition's requirement.
# Usually, you will have an ID column and a prediction column.
submission_df = pd.DataFrame({
    'Id': range(1, 782),  # Example: creating a sequence of IDs from 1 to 781
    'Tonnage': y_pred
})

# Export to CSV
csv_file = "submission.csv"
submission_df.to_csv(csv_file, index=False)

In [32]:
## Linear reg with regularization