In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

In [3]:
all_data = pd.read_csv("../processing_2/data_all.csv", parse_dates=['Date'])

In [4]:
all_data.columns

Index(['Campagne', 'Region', 'Site', 'Famille', 'Variete', 'Num Parcelle',
       'CodeTracabilite', 'Date', 'Tonnage', 'is_train', 'Recolte',
       'irrigation_cumulee_1', 'irrigation_cumulee_2', 'irrigation_cumulee_3',
       'irrigation_cumulee_4', 'irrigation_cumulee_5', 'irrigation_cumulee_6',
       'irrigation_cumulee_7', 'irrigation_cumulee_8', 'irrigation_cumulee_9',
       'irrigation_cumulee_10', 'irrigation_cumulee_11',
       'irrigation_cumulee_12', 'N_cumulee_1', 'P_cumulee_1', 'K_cumulee_1',
       'N_cumulee_2', 'P_cumulee_2', 'K_cumulee_2', 'N_cumulee_3',
       'P_cumulee_3', 'K_cumulee_3', 'N_cumulee_4', 'P_cumulee_4',
       'K_cumulee_4', 'N_cumulee_5', 'P_cumulee_5', 'K_cumulee_5',
       'N_cumulee_6', 'P_cumulee_6', 'K_cumulee_6', 'N_cumulee_7',
       'P_cumulee_7', 'K_cumulee_7', 'N_cumulee_8', 'P_cumulee_8',
       'K_cumulee_8', 'N_cumulee_9', 'P_cumulee_9', 'K_cumulee_9',
       'N_cumulee_10', 'P_cumulee_10', 'K_cumulee_10', 'N_cumulee_11',
       'P_cum

In [5]:
all_data.shape

(3943, 69)

In [6]:
# Tonnage to float
all_data['Tonnage'] = all_data['Tonnage'].str.replace(',', '.').astype(float)

# Split the date
all_data['Year'] = all_data['Date'].dt.year
all_data['Month'] = all_data['Date'].dt.month
all_data['Day'] = all_data['Date'].dt.day


duplicates = all_data.duplicated(['CodeTracabilite', 'Num Parcelle', 'Campagne', 'Region', 'Site', 'Famille', 'Variete', 'Recolte'], keep=False)

# Then, apply the condition
condition = (all_data['Sup Debut Camp'] + all_data['Sup Plantee'] != all_data['Sup Arrachee'] + all_data['Sup Fin Camp'])

# Combine both conditions
to_drop = duplicates & condition

# Drop the rows that satisfy both condgitions
all_data = all_data[~to_drop]

In [7]:
for month_index in range(12):
    all_data[f'irrigation_cumulee_{month_index + 1}'] = all_data[f'irrigation_cumulee_{month_index + 1}'].replace(0, np.nan)

In [8]:
all_data.shape

(3903, 72)

In [9]:
all_data.columns

Index(['Campagne', 'Region', 'Site', 'Famille', 'Variete', 'Num Parcelle',
       'CodeTracabilite', 'Date', 'Tonnage', 'is_train', 'Recolte',
       'irrigation_cumulee_1', 'irrigation_cumulee_2', 'irrigation_cumulee_3',
       'irrigation_cumulee_4', 'irrigation_cumulee_5', 'irrigation_cumulee_6',
       'irrigation_cumulee_7', 'irrigation_cumulee_8', 'irrigation_cumulee_9',
       'irrigation_cumulee_10', 'irrigation_cumulee_11',
       'irrigation_cumulee_12', 'N_cumulee_1', 'P_cumulee_1', 'K_cumulee_1',
       'N_cumulee_2', 'P_cumulee_2', 'K_cumulee_2', 'N_cumulee_3',
       'P_cumulee_3', 'K_cumulee_3', 'N_cumulee_4', 'P_cumulee_4',
       'K_cumulee_4', 'N_cumulee_5', 'P_cumulee_5', 'K_cumulee_5',
       'N_cumulee_6', 'P_cumulee_6', 'K_cumulee_6', 'N_cumulee_7',
       'P_cumulee_7', 'K_cumulee_7', 'N_cumulee_8', 'P_cumulee_8',
       'K_cumulee_8', 'N_cumulee_9', 'P_cumulee_9', 'K_cumulee_9',
       'N_cumulee_10', 'P_cumulee_10', 'K_cumulee_10', 'N_cumulee_11',
       'P_cum

In [10]:
# all_data['ndmi_missingness'] = all_data['ndmi'].apply(
#     lambda x: 0 if x != 0 else 1)

In [11]:
all_data['day'] = all_data['Date'].apply(lambda x: x.day)
all_data['month'] = all_data['Date'].apply(lambda x: x.month)
all_data['year'] = all_data['Date'].apply(lambda x: x.year)

Processing 

In [12]:
from sklearn.preprocessing import OneHotEncoder


cat_cols = ['Region', 'Site', 'Variete', 'Porte Greffe']
cat_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [13]:
from sklearn.impute import SimpleImputer


cols_to_impute_mean = ['Days to Arrachage', 'Tree Age'] + ['Sup Debut Camp']
mean_imputer = SimpleImputer(strategy='mean', add_indicator=True)

In [14]:
irrigation_columns = [f'irrigation_cumulee_{i}' for i in range(1, 13)]

# Calculate the median for these columns in each row
row_mean = all_data[irrigation_columns].mean(axis=1)

# Replace NaN values in each column with the row median
for col in irrigation_columns:
    all_data[col] = all_data[col].fillna(row_mean)

In [15]:
# Identify the columns to impute with mean
cols_to_impute_mean = ['Days to Arrachage', 'Tree Age', 'Sup Debut Camp'] + irrigation_columns

# Initialize the mean imputer with an option to add indicators for imputed values
mean_imputer = SimpleImputer(strategy='mean', add_indicator=True)

# Apply the imputer to the specified columns
imputed_data = mean_imputer.fit_transform(all_data[cols_to_impute_mean])

# Since the imputer also adds indicators for imputation, 
# update the column names to include indicator columns
imputed_cols = cols_to_impute_mean + [col + '_imputed' for col in cols_to_impute_mean]

# Update the dataframe with the imputed data
all_data[imputed_cols] = imputed_data

# Optionally, you may want to update the original columns with the imputed values
# and keep the indicators as separate columns
for i, col in enumerate(cols_to_impute_mean):
    all_data[col] = imputed_data[:, i]
    all_data[col + '_imputed'] = imputed_data[:, i + len(cols_to_impute_mean)]


In [16]:
all_data.columns

Index(['Campagne', 'Region', 'Site', 'Famille', 'Variete', 'Num Parcelle',
       'CodeTracabilite', 'Date', 'Tonnage', 'is_train', 'Recolte',
       'irrigation_cumulee_1', 'irrigation_cumulee_2', 'irrigation_cumulee_3',
       'irrigation_cumulee_4', 'irrigation_cumulee_5', 'irrigation_cumulee_6',
       'irrigation_cumulee_7', 'irrigation_cumulee_8', 'irrigation_cumulee_9',
       'irrigation_cumulee_10', 'irrigation_cumulee_11',
       'irrigation_cumulee_12', 'N_cumulee_1', 'P_cumulee_1', 'K_cumulee_1',
       'N_cumulee_2', 'P_cumulee_2', 'K_cumulee_2', 'N_cumulee_3',
       'P_cumulee_3', 'K_cumulee_3', 'N_cumulee_4', 'P_cumulee_4',
       'K_cumulee_4', 'N_cumulee_5', 'P_cumulee_5', 'K_cumulee_5',
       'N_cumulee_6', 'P_cumulee_6', 'K_cumulee_6', 'N_cumulee_7',
       'P_cumulee_7', 'K_cumulee_7', 'N_cumulee_8', 'P_cumulee_8',
       'K_cumulee_8', 'N_cumulee_9', 'P_cumulee_9', 'K_cumulee_9',
       'N_cumulee_10', 'P_cumulee_10', 'K_cumulee_10', 'N_cumulee_11',
       'P_cum

In [17]:
# Import necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming all_data is your original dataframe
# all_data = ...

# Define the categorical columns to be transformed
cat_cols = ['Region', 'Site', 'Variete', 'Porte Greffe']

# Create a transformer for categorical features
cat_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Create a column transformer to apply the transformation only to the specified categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_cols)
    ],
    remainder='passthrough'  # this will pass through other columns not listed in transformers
)

# Apply the transformations
all_data_transformed = preprocessor.fit_transform(all_data)

# The output will be a NumPy array. Convert it back to a dataframe if necessary
# Get feature names after one-hot encoding
new_cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(cat_cols)
non_cat_cols = all_data.drop(columns=cat_cols).columns
all_columns = list(new_cat_features) + list(non_cat_cols)

# Create a new dataframe with transformed features
all_data = pd.DataFrame(all_data_transformed, columns=all_columns)

# Check the transformed dataframe
all_data.columns



Index(['Region_GHARB', 'Region_HAOUZ', 'Region_ORIENTAL', 'Region_SOUSS',
       'Region_TADLA', 'Site_Ait Ourir', 'Site_Ben Mansour', 'Site_Benguérir',
       'Site_Beni Mellal', 'Site_Bni Ayat',
       ...
       'irrigation_cumulee_3_imputed', 'irrigation_cumulee_4_imputed',
       'irrigation_cumulee_5_imputed', 'irrigation_cumulee_6_imputed',
       'irrigation_cumulee_7_imputed', 'irrigation_cumulee_8_imputed',
       'irrigation_cumulee_9_imputed', 'irrigation_cumulee_10_imputed',
       'irrigation_cumulee_11_imputed', 'irrigation_cumulee_12_imputed'],
      dtype='object', length=174)

In [18]:
cat_cols = ['Region', 'Site', 'Variete', 'Porte Greffe']

new_cat_features

num_cols = ['Sup Debut Camp', 'Tree Age', 'Days to Arrachage', 'Recolte']

imputation_indicator = [col + '_imputed' for col in cols_to_impute_mean]

monthly_params = ['N_cumulee', 'P_cumulee', 'K_cumulee', 'irrigation_cumulee']

monthly_variables = [
    f'{param}_{i}' for param in monthly_params for i in range(1, 13)]

time_variables = ['year', 'month']


# CHANGE HERE

predictors_names = monthly_variables + num_cols + imputation_indicator + list(new_cat_features)
target_name = ['Tonnage']

### Split and train model

In [19]:
# Import necessary libraries
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold

In [19]:
# non_zero_na_sums = all_data[predictors_names].isna().sum()
# non_zero_na_sums = non_zero_na_sums[non_zero_na_sums != 0]
# non_zero_na_sums

In [20]:
train = all_data[all_data['is_train'] == True]
test = all_data[all_data['is_train'] == False]

In [21]:
train = train.sample(frac=1, random_state=42)  # random_state for reproducibility
X, y = train[predictors_names].values, train[target_name].values

Model Selection

In [22]:
# Define the models
models = {
    "Linear Regression": LinearRegression(),
    "XGBoost": XGBRegressor(),
    "Random Forest": RandomForestRegressor(),
    "ExtraTrees Regressor": ExtraTreesRegressor(),
    "LGBM Regressor": LGBMRegressor()
}
k = 4
kf = KFold(n_splits=k)
results = {}
for model_name, model in models.items():
    train_rmse_scores = []
    train_r2_scores = []
    test_rmse_scores = []
    test_r2_scores = []

    for train_index, test_index in kf.split(X):
        x_train_fold, x_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]

        model.fit(x_train_fold, y_train_fold)
        y_pred_train = model.predict(x_train_fold)
        y_pred_test = model.predict(x_test_fold)

        r2_train = r2_score(y_train_fold, y_pred_train)
        rmse_train = mean_squared_error(
            y_train_fold, y_pred_train, squared=False)
        r2_test = r2_score(y_test_fold, y_pred_test)
        rmse_test = mean_squared_error(y_test_fold, y_pred_test, squared=False)

        train_rmse_scores.append(rmse_train)
        train_r2_scores.append(r2_train)
        test_rmse_scores.append(rmse_test)
        test_r2_scores.append(r2_test)

    avg_train_rmse = sum(train_rmse_scores) / k
    avg_train_r2 = sum(train_r2_scores) / k
    avg_test_rmse = sum(test_rmse_scores) / k
    avg_test_r2 = sum(test_r2_scores) / k
    # print("☢️", model_name, " avg_test_r2  : ", avg_test_r2)

    results[model_name] = {
        "train_rmse": avg_train_rmse,
        "train_r2": avg_train_r2,
        "test_rmse": avg_test_rmse,
        "test_r2": avg_test_r2
    }

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002290 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12634
[LightGBM] [Info] Number of data points in the train set: 2341, number of used features: 114
[LightGBM] [Info] Start training from score 20375.566419


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002899 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12609
[LightGBM] [Info] Number of data points in the train set: 2341, number of used features: 113
[LightGBM] [Info] Start training from score 21465.653424


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003778 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12596
[LightGBM] [Info] Number of data points in the train set: 2342, number of used features: 113
[LightGBM] [Info] Start training from score 20376.407872


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004897 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12661
[LightGBM] [Info] Number of data points in the train set: 2342, number of used features: 115
[LightGBM] [Info] Start training from score 20929.019016


plot results using plotly

In [23]:
import plotly.graph_objects as go
import numpy as np
import plotly.io as pio

data = results
models = list(data.keys())
train_rmse = [data[model]['train_rmse'] for model in models]
test_rmse = [data[model]['test_rmse'] for model in models]
train_r2 = [data[model]['train_r2'] for model in models]
test_r2 = [data[model]['test_r2'] for model in models]

train_rmse = [round(num, 2) for num in train_rmse]
test_rmse = [round(num, 2) for num in test_rmse]
train_r2 = [round(num, 2) for num in train_r2]
test_r2 = [round(num, 2) for num in test_r2]


standard_deviation = np.std(y)  # Calculate standard deviation using numpy
sample_size = len(y)  # Calculate sample size

standard_error = standard_deviation / np.sqrt(sample_size)

In [24]:
fig = go.Figure()

# Bar chart for RMSE
fig.add_trace(go.Bar(
    x=models,
    y=train_rmse,
    name='Train RMSE',
    marker_color='blue',
    text=train_rmse,  # Add this line to specify the text for each bar
    # 'auto' places the text inside the bars; you can also use 'outside' or 'inside'
    textposition='auto'
))

fig.add_trace(go.Bar(
    x=models,
    y=test_rmse,
    name='Test RMSE',
    marker_color='red',
    text=test_rmse,  # Add this line to specify the text for each bar
    # 'auto' places the text inside the bars; you can also use 'outside' or 'inside'
    textposition='auto'
))
# Update the layout
fig.update_layout(
    barmode='group',
    title='RMSE',
    xaxis_title='Models',
    yaxis_title='Value',
    legend_title='Data',
    width=600,
    # plot_bgcolor='rgba(0,0,0,0)',  # Set plot background color to transparent
    # paper_bgcolor='rgba(0,0,0,0)'
)

# # Line chart for std
# fig.add_trace(go.Scatter(
#     x=models,
#     y=[stdev for model in models],
#     mode='lines+markers',
#     name='Std',
#     line=dict(color='green', width=2)
# ))

# Line chart for std
fig.add_trace(go.Scatter(
    x=models,
    y=[standard_deviation for i in range(len(models))],
    mode='lines+markers',
    name='Std',
    line=dict(color='orange', width=2)
))

# # Line chart for std
# fig.add_trace(go.Scatter(
#     x=models,
#     y=[standard_error for i in range(len(models))],
#     mode='lines+markers',
#     name='SE',
#     line=dict(color='green', width=2)
# ))

# Show the plot
fig.show()

In [None]:
fig = go.Figure()

# Bar chart for R²
fig.add_trace(go.Bar(
    x=models,
    y=train_r2,
    name='Train R²',
    marker_color='blue',
    text=train_r2,  # Add this line to specify the text for each bar
    # 'auto' places the text inside the bars; you can also use 'outside' or 'inside'
    textposition='auto'
))

fig.add_trace(go.Bar(
    x=models,
    y=test_r2,
    name='Test R²',
    marker_color='red',
    text=test_r2,  # Add this line to specify the text for each bar
    # 'auto' places the text inside the bars; you can also use 'outside' or 'inside'
    textposition='auto'
))

# Update the layout
fig.update_layout(
    barmode='group',
    title='R²',
    xaxis_title='Models',
    yaxis_title='Value',
    legend_title='Data',
    width=600,
    # plot_bgcolor='rgba(0,0,0,0)',  # Set plot background color to transparent
    # paper_bgcolor='rgba(0,0,0,0)'
)

# Show the plot
fig.show()

hyperparams tuning

In [35]:
train.shape

(3122, 174)

In [25]:
from sklearn.model_selection import GridSearchCV

# Define the ExtraTrees Regressor model
selected_model = LGBMRegressor()

# Define the hyperparameter grid for tuning
complex_param_grid = {
    'num_leaves': [31, 63, 127],
    'max_depth': [-1, 5, 10, 15],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 500, 1000],
    'min_data_in_leaf': [20, 50, 100, 200],
    'bagging_fraction': [0.5, 0.7, 0.9, 1.0],
    'feature_fraction': [0.5, 0.7, 0.9, 1.0],
    'lambda_l1': [0, 0.01, 0.1, 1],
    'lambda_l2': [0, 0.01, 0.1, 1],
    'min_gain_to_split': [0, 0.1, 0.5, 1]
}

param_grid = {
    'num_leaves': [31, 63, 127],  # Controls complexity of the model
    'max_depth': [-1, 10, 15],    # Limits the depth of tree, -1 for no limit
    'learning_rate': [0.01, 0.05, 0.1],  # Impacts how quickly the model learns
    'n_estimators': [100, 200, 500],     # Number of boosting rounds
    'min_data_in_leaf': [20, 50, 100]    # Minimum samples in a leaf
}



# Perform randomized search cross-validation
grid_search = GridSearchCV(
    selected_model, param_grid, cv=4, scoring='r2', n_jobs=4)
grid_search.fit(X, y)

# Print the best hyperparameters and corresponding R² score
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best R² Score: ", grid_search.best_score_)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003490 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13018
[LightGBM] [Info] Number of data points in the train set: 3122, number of used features: 119
[LightGBM] [Info] Start training from score 20786.633080
Best Hyperparameters:  {'learning_rate': 0.01, 'max_depth': -1, 'min_data_in_leaf': 20, 'n_estimators': 200, 'num_leaves': 63}
Best R² Score:  0.13636013080151144


In [28]:
best_params = {'learning_rate': 0.01, 'max_depth': -1, 'min_data_in_leaf': 20, 'n_estimators': 200, 'num_leaves': 63}
# Create Extra Trees model using the best parameters
best_model = LGBMRegressor(n_estimators=best_params['n_estimators'],
                                             max_depth=best_params['max_depth'],
                                             min_data_in_leaf=best_params['min_data_in_leaf'],
                                             num_leaves=best_params['num_leaves'],
                                             learning_rate=best_params['learning_rate'])

In [29]:
k = 4
kf = KFold(n_splits=k)
final_results = {}

train_rmse_scores = []
train_r2_scores = []
test_rmse_scores = []
test_r2_scores = []

for train_index, test_index in kf.split(X):
    x_train_fold, x_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]

    best_model.fit(x_train_fold, y_train_fold)
    y_pred_train = best_model.predict(x_train_fold)
    y_pred_test = best_model.predict(x_test_fold)

    r2_train = r2_score(y_train_fold, y_pred_train)
    rmse_train = mean_squared_error(
        y_train_fold, y_pred_train, squared=False)
    r2_test = r2_score(y_test_fold, y_pred_test)
    rmse_test = mean_squared_error(y_test_fold, y_pred_test, squared=False)

    train_rmse_scores.append(rmse_train)
    train_r2_scores.append(r2_train)
    test_rmse_scores.append(rmse_test)
    test_r2_scores.append(r2_test)

avg_train_rmse = sum(train_rmse_scores) / k
avg_train_r2 = sum(train_r2_scores) / k
avg_test_rmse = sum(test_rmse_scores) / k
avg_test_r2 = sum(test_r2_scores) / k


final_results["metrics"] = {
    "RMSE train": avg_train_rmse,
    "RMSE test": avg_test_rmse,
    "R² train": avg_train_r2,
    "R² test": avg_test_r2
}

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003866 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12634
[LightGBM] [Info] Number of data points in the train set: 2341, number of used features: 114
[LightGBM] [Info] Start training from score 20375.566419


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003309 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12609
[LightGBM] [Info] Number of data points in the train set: 2341, number of used features: 113
[LightGBM] [Info] Start training from score 21465.653424


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12596
[LightGBM] [Info] Number of data points in the train set: 2342, number of used features: 113
[LightGBM] [Info] Start training from score 20376.407872


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026699 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12661


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Number of data points in the train set: 2342, number of used features: 115
[LightGBM] [Info] Start training from score 20929.019016


In [30]:
best_model.fit(X, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003660 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13018
[LightGBM] [Info] Number of data points in the train set: 3122, number of used features: 119
[LightGBM] [Info] Start training from score 20786.633080


  y = column_or_1d(y, warn=True)


In [31]:
X_sub = test[predictors_names].values

y_sub = best_model.predict(X_sub)



export

In [32]:
y_sub.shape

(781,)

In [33]:
y_pred = np.array(y_sub)  # Convert to numpy array if not already
assert len(y_pred) == 781, "y_pred must have exactly 781 rows"

# Create a DataFrame. Adjust column names as per the competition's requirement.
# Usually, you will have an ID column and a prediction column.
submission_df = pd.DataFrame({
    'Id': range(1, 782),  # Example: creating a sequence of IDs from 1 to 781
    'Tonnage': y_pred
})

# Export to CSV
csv_file = "submission.csv"
submission_df.to_csv(csv_file, index=False)