In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Relative path to the training CSV (resolved against the notebook's working directory).
train_path = 'train_new.csv'
# Load the whole training table into a DataFrame; later cells build features from it.
train_data = pd.read_csv(train_path)

In [2]:
from autosklearn.regression import AutoSklearnRegressor

In [3]:
# Preview the first rows. A bare last expression uses the notebook's rich
# table rendering, which is easier to read than the wrapped plain-text
# output that print() produces for a wide frame.
train_data.head()

                ID District      Block  CultLand  CropCultLand  \
0  ID_GTFAC7PEVWQ9  Nalanda  Noorsarai        45            40   
1  ID_TK40ARLSPOKS  Nalanda     Rajgir        26            26   
2  ID_1FJY2CRIMLZZ     Gaya      Gurua        10            10   
3  ID_I3IPXS4DB7NE     Gaya      Gurua        15            15   
4  ID_4T8YQWXWHB4A  Nalanda  Noorsarai        60            60   

                               LandPreparationMethod CropTillageDate  \
0               TractorPlough FourWheelTracRotavator      2022-07-20   
1  WetTillagePuddling TractorPlough FourWheelTrac...      2022-07-18   
2               TractorPlough FourWheelTracRotavator      2022-06-30   
3               TractorPlough FourWheelTracRotavator      2022-06-16   
4                   TractorPlough WetTillagePuddling      2022-07-19   

   CropTillageDepth         CropEstMethod RcNursEstDate  ... Harv_method  \
0                 5  Manual_PuddledRandom    2022-06-27  ...     machine   
1                 

We start with a deliberately simple preprocessing step to test whether AutoML tools can still figure out the best model when the preprocessing is non-optimal.

In [4]:
# Simple baseline preprocessing: expand date columns into year / month / day
# parts and label-encode every text column.
#
# All work happens on a copy, so `train_data` keeps the raw CSV contents and
# this cell can be re-run on a live kernel. Mutating `train_data` in place
# would drop the date columns on the first run and raise KeyError on the next.
datetime_cols = ['CropTillageDate', 'RcNursEstDate', 'Harv_date', 'Threshing_date']
train_encoded = train_data.copy()

# Parse each date column and keep only its numeric components
for col in datetime_cols:
    parsed_dates = pd.to_datetime(train_encoded[col])
    train_encoded[col + '_year'] = parsed_dates.dt.year
    train_encoded[col + '_month'] = parsed_dates.dt.month
    train_encoded[col + '_day'] = parsed_dates.dt.day

# Drop original datetime columns, only use new columns
train_encoded = train_encoded.drop(columns=datetime_cols)

# Encode categorical columns: one LabelEncoder per remaining object column.
# The fitted encoders are kept in `label_encoders`, keyed by column name.
# NOTE(review): 'ID' is a text column too, so it is encoded here even though
# it is dropped from the features below.
categorical_cols = train_encoded.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_encoded[col] = le.fit_transform(train_encoded[col])
    label_encoders[col] = le


# Preparing the data for modeling
# Training Features: everything except the row identifier and the target
X = train_encoded.drop(['ID', 'Yield'], axis=1)
# Target Feature
y = train_encoded['Yield']

# Splitting the dataset into training and validation sets
# (80 / 20 split; fixed random_state keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Fit the AutoML pipeline on the training set.

In [5]:
# Configure the auto-sklearn search with a short overall budget.
# NOTE(review): the run statistics printed further down report 20 of 42 runs
# exceeding the time limit, so these limits may be too tight for this data.
automl = AutoSklearnRegressor(
    time_left_for_this_task=60,  # overall search budget, in seconds
    per_run_time_limit=29,       # cap for a single candidate run, in seconds
    n_jobs=-1,
    memory_limit=None,
)
# fit() is left as the last expression so the fitted estimator's repr is shown.
automl.fit(X_train, y_train)

AutoSklearnRegressor(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                     memory_limit=None, n_jobs=-1, per_run_time_limit=29,
                     time_left_for_this_task=60)

Predict on the test dataset.

In [6]:
from sklearn.metrics import r2_score

# Predict on the hold-out split created by train_test_split
y_pred = automl.predict(X_test)

# Calculate R2 score as a simple performance metric.
# It is printed explicitly: a notebook only echoes the last expression of a
# cell, so a bare `r2` in the middle of the cell would never be displayed.
r2 = r2_score(y_test, y_pred)
print(f"R2 score: {r2}")

# Describe the models that make up the final ensemble
show_modes_str = automl.show_models()
print(show_modes_str)

{2: {'model_id': 2, 'rank': 1, 'cost': 0.0964945622365625, 'ensemble_weight': 0.48000000000000004, 'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f3a92b04b50>, 'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f3a4acaefd0>, 'regressor': <autosklearn.pipeline.components.regression.RegressorChoice object at 0x7f3a9055e640>, 'sklearn_regressor': RandomForestRegressor(max_features=1.0, n_estimators=512, n_jobs=1,
                      random_state=1, warm_start=True)}, 5: {'model_id': 5, 'rank': 2, 'cost': 0.1458316841761078, 'ensemble_weight': 0.08000000000000002, 'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f3a92d4abe0>, 'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f3976cfb040>, 'regressor': <autosklearn.pipeline.components.r

Show the summary statistics of the auto-sklearn run.

In [7]:
# Text summary of the auto-sklearn search: metric, best validation score and
# the number of runs by outcome (successful, crashed, timed out, out of memory).
sprint_statistics_str = automl.sprint_statistics()
print(sprint_statistics_str)

auto-sklearn results:
  Dataset name: 92b9f316-d8a7-11ee-8e4b-df51cd2d4e54
  Metric: r2
  Best validation score: 0.903505
  Number of target algorithm runs: 42
  Number of successful target algorithm runs: 18
  Number of crashed target algorithm runs: 4
  Number of target algorithms that exceeded the time limit: 20
  Number of target algorithms that exceeded the memory limit: 0



However, the mean_squared_error still shows a huge gap compared with the normal methods that do not use AutoML.

In [12]:
# Imported in this cell rather than with the other imports at the top;
# the cell also needs `y_test` and `y_pred` from the earlier cells.
from sklearn.metrics import mean_squared_error
# Mean squared error on the hold-out split. MSE is expressed in squared target
# units, so it is not on the same scale as Yield itself or as the R2 score.
mse = mean_squared_error(y_test, y_pred)
print(mse)

16096.906203949096
