In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m


In [3]:
from sklearn.metrics import r2_score, accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

In [4]:
import lightgbm as lgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor

In [5]:
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, RMSprop

In [6]:
# Read the preprocessed dataset from its hard-coded location into a DataFrame.
data = pd.read_csv(filepath_or_buffer='/content/preprocessed_700.csv')

In [7]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Name,Fertility,Photoperiod,N-P-K Ratio,Temperature,Rainfall,pH,Light_Hours,Light_Intensity,Rh,Nitrogen,Phosphorus,Potassium,Yield,Category_pH,Soil_Type,Season
0,Apple,High,Day Neutral,10:10:10,21.063204,1932.402709,6.567764,12.716549,860.189066,92.677579,89.266502,40.330099,180.63574,12.847482,low_acidic,Sandy Loam,Fall
1,Apple,High,Day Neutral,10:10:10,19.511305,1589.295994,6.784538,13.54456,797.66076,92.293923,92.80815,37.131922,179.042979,13.894292,neutral,Sandy Loam,Fall
2,Apple,High,Day Neutral,10:10:10,23.045662,1269.789133,6.619155,12.330668,910.861369,91.798926,84.24859,38.693498,163.604138,13.372203,low_acidic,Sandy Loam,Fall
3,Apple,High,Short Day Period,10:10:10,17.986016,1944.180144,6.638623,12.96534,922.725203,92.74271,84.780429,43.950592,173.881606,11.801568,neutral,Sandy Loam,Spring
4,Apple,High,Day Neutral,10:10:10,23.775354,1790.352815,6.654898,12.895817,821.411003,90.98153,91.197126,45.56447,174.324935,10.660521,neutral,Sandy Loam,Fall


In [8]:
# Summarise the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23800 entries, 0 to 23799
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             23800 non-null  object 
 1   Fertility        23800 non-null  object 
 2   Photoperiod      23800 non-null  object 
 3   N-P-K Ratio      23800 non-null  object 
 4   Temperature      23800 non-null  float64
 5   Rainfall         23800 non-null  float64
 6   pH               23800 non-null  float64
 7   Light_Hours      23800 non-null  float64
 8   Light_Intensity  23800 non-null  float64
 9   Rh               23800 non-null  float64
 10  Nitrogen         23800 non-null  float64
 11  Phosphorus       23800 non-null  float64
 12  Potassium        23800 non-null  float64
 13  Yield            23800 non-null  float64
 14  Category_pH      23800 non-null  object 
 15  Soil_Type        23800 non-null  object 
 16  Season           23800 non-null  object 
dtypes: float64(1

In [10]:
# Partition the column names by dtype: float64 columns count as numeric,
# object columns as categorical. Column order is preserved.
num_cols = [name for name, kind in data.dtypes.items() if kind == 'float64']
cat_cols = [name for name, kind in data.dtypes.items() if kind == 'object']

# Report how many columns fell into each group.
print('Num Cols: ', len(num_cols))
print('Cat Cols: ', len(cat_cols))

Num Cols:  10
Cat Cols:  7


In [15]:
# Print the number of columns (second entry of the frame's shape).
print(data.shape[1])

17


In [16]:
# Split the 'N-P-K Ratio' strings (e.g. '10:10:10') into three numeric columns
# and then remove the original string column.
#
# The guard makes this cell safe to re-run: once 'N-P-K Ratio' has been dropped,
# a second execution is a no-op instead of failing with a KeyError.
if 'N-P-K Ratio' in data.columns:
    # One column per ':'-separated component, converted from text to numbers.
    n_p_k_split = data['N-P-K Ratio'].str.split(':', expand=True)
    n_p_k_split = n_p_k_split.apply(pd.to_numeric)
    data[['Nitrogen_npk', 'Phosphorus_npk', 'Potassium_npk']] = n_p_k_split

    # Rebind rather than drop in place; `data` still ends up without the column.
    data = data.drop(columns=['N-P-K Ratio'])

In [17]:
# Preview the first rows again to inspect the frame's current columns.
data.head()

Unnamed: 0,Name,Fertility,Photoperiod,Temperature,Rainfall,pH,Light_Hours,Light_Intensity,Rh,Nitrogen,Phosphorus,Potassium,Yield,Category_pH,Soil_Type,Season,Nitrogen_npk,Phosphorus_npk,Potassium_npk
0,Apple,High,Day Neutral,21.063204,1932.402709,6.567764,12.716549,860.189066,92.677579,89.266502,40.330099,180.63574,12.847482,low_acidic,Sandy Loam,Fall,10,10.0,10.0
1,Apple,High,Day Neutral,19.511305,1589.295994,6.784538,13.54456,797.66076,92.293923,92.80815,37.131922,179.042979,13.894292,neutral,Sandy Loam,Fall,10,10.0,10.0
2,Apple,High,Day Neutral,23.045662,1269.789133,6.619155,12.330668,910.861369,91.798926,84.24859,38.693498,163.604138,13.372203,low_acidic,Sandy Loam,Fall,10,10.0,10.0
3,Apple,High,Short Day Period,17.986016,1944.180144,6.638623,12.96534,922.725203,92.74271,84.780429,43.950592,173.881606,11.801568,neutral,Sandy Loam,Spring,10,10.0,10.0
4,Apple,High,Day Neutral,23.775354,1790.352815,6.654898,12.895817,821.411003,90.98153,91.197126,45.56447,174.324935,10.660521,neutral,Sandy Loam,Fall,10,10.0,10.0


In [18]:
# Every column except 'Yield' becomes a feature; 'Yield' itself is the target.
data_replaced = data.drop(columns=['Yield'])
label = data['Yield']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23800 entries, 0 to 23799
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             23800 non-null  object 
 1   Fertility        23800 non-null  object 
 2   Photoperiod      23800 non-null  object 
 3   Temperature      23800 non-null  float64
 4   Rainfall         23800 non-null  float64
 5   pH               23800 non-null  float64
 6   Light_Hours      23800 non-null  float64
 7   Light_Intensity  23800 non-null  float64
 8   Rh               23800 non-null  float64
 9   Nitrogen         23800 non-null  float64
 10  Phosphorus       23800 non-null  float64
 11  Potassium        23800 non-null  float64
 12  Yield            23800 non-null  float64
 13  Category_pH      23800 non-null  object 
 14  Soil_Type        23800 non-null  object 
 15  Season           23800 non-null  object 
 16  Nitrogen_npk     23800 non-null  int64  
 17  Phosphorus_n

In [21]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [22]:
# Re-derive the column groups from the feature frame (target already removed):
# object-dtype columns are categorical, every other column is numeric.
cat_cols, num_cols = [], []

for col in data_replaced.columns:
    # Append the name to whichever list matches the column's dtype.
    (cat_cols if data_replaced[col].dtype == 'object' else num_cols).append(col)

In [23]:
# Show the column names collected as categorical.
cat_cols

['Name', 'Fertility', 'Photoperiod', 'Category_pH', 'Soil_Type', 'Season']

In [24]:
# Show the column names collected as numeric.
num_cols

['Temperature',
 'Rainfall',
 'pH',
 'Light_Hours',
 'Light_Intensity',
 'Rh',
 'Nitrogen',
 'Phosphorus',
 'Potassium',
 'Nitrogen_npk',
 'Phosphorus_npk',
 'Potassium_npk']

In [25]:
# Replace every categorical column with integer codes, one LabelEncoder per column.
# Each fitted encoder is kept, keyed by column name, so the codes can later be
# mapped back to the original categories (inverse_transform) or the same mapping
# applied to new rows (transform). Previously each encoder was overwritten by
# the next loop iteration and lost.
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    data_replaced[col] = le.fit_transform(data_replaced[col])
    label_encoders[col] = le

In [27]:

def transform(df):
    """Return a copy of ``df`` extended with six row-wise summary features.

    For every row, the statistics are computed across the columns that ``df``
    has on entry and appended as new columns, in this order:
    ``mean_features``, ``std_features``, ``max_features``, ``min_features``,
    ``median_features``, ``sum_features``. The mean, median and sum are scaled
    by 0.1, exactly as before.

    The input frame is not modified. The previous version added the columns to
    ``df`` in place, so calling it twice on the same frame folded the earlier
    statistics into the new ones. It also built a row-sorted copy of the data
    that was never returned; that dead work has been removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns all support row-wise mean/std/max/min/median/sum.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by the six
        summary-statistic columns.
    """
    features = df.columns.tolist()

    # Snapshot of the original columns: every statistic is computed from this,
    # so the newly added columns never feed into one another.
    base = df[features]

    # Work on a copy so the caller's frame stays untouched and the call is
    # repeatable.
    out = df.copy()
    out['mean_features'] = 0.1 * base.mean(axis=1)
    out['std_features'] = base.std(axis=1)
    out['max_features'] = base.max(axis=1)
    out['min_features'] = base.min(axis=1)
    out['median_features'] = 0.1 * base.median(axis=1)
    out['sum_features'] = 0.1 * base.sum(axis=1)
    return out

# Build the modelling frame: the encoded features plus row-wise summary statistics.
data_new = transform(df=data_replaced)

In [28]:
# Feature matrix and target vector.
X, y = data_new, label

# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardise the features. The scaler is fitted on the training split only and
# then applied to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_val = sc.transform(X_val)

In [29]:
# Meta-learner: a linear regression fitted on the base learners' predictions.
meta_model = LinearRegression()

# Base learners with library-default hyperparameters; only the verbosity
# settings for CatBoost and LightGBM are overridden.
xgb_model = XGBRegressor()
cat_model = CatBoostRegressor(verbose=False)
lgb_model = lgb.LGBMRegressor(verbose=-1)

stacked = StackingRegressor(
    estimators=[('xgb', xgb_model), ('cat', cat_model), ('lgb', lgb_model)],
    final_estimator=meta_model,
)

# Fit on the training split, then score the held-out validation split.
preds = stacked.fit(X_train, y_train).predict(X_val)

print("Stacked R2:", r2_score(y_true=y_val, y_pred=preds))

Stacked R2: 0.9945137918123756


In [30]:
# Three boosted-tree regressors with hand-set hyperparameters and fixed seeds.
# These assignments rebind xgb_model / cat_model / lgb_model.
xgb_model = XGBRegressor(n_estimators = 8000, max_depth = 9,
                   learning_rate =  0.05371502553155743,
                   subsample = 0.85715838272758116,
                   colsample_bytree = 0.892390046436166,
                   gamma = 0.012984290742285246,
                   min_child_weight = 3,
                   random_state = 0)

# NOTE(review): `devices` presumably only matters for GPU training and no
# task_type is set here — confirm whether GPU training was intended.
cat_model = CatBoostRegressor(n_estimators = 8000, learning_rate = 0.011277016304363601,
                       depth = 8, min_data_in_leaf = 98, random_state = 0, devices='0', verbose=False)

# NOTE(review): LightGBM appears to apply `subsample` only when `subsample_freq`
# is > 0, which is not set here — confirm whether row subsampling was intended.
lgb_model = lgb.LGBMRegressor(boosting_type = 'gbdt', n_estimators = 2000,
                    learning_rate =  0.012, num_leaves = 250,
                    subsample_for_bin = 165700, min_child_samples = 114,
                    reg_alpha = 2.075e-06, reg_lambda = 3.839e-07,
                    colsample_bytree = 0.9634, subsample = 0.9592,
                    max_depth = 10, random_state = 0, verbose = 0)

# Fit each model independently on the training split.
xgb_model.fit(X_train, y_train)
cat_model.fit(X_train, y_train)
lgb_model.fit(X_train, y_train)

# Predict the validation split with each model.
xgb_pred = xgb_model.predict(X_val)
cat_pred = cat_model.predict(X_val)
lgb_pred = lgb_model.predict(X_val)

# Fixed-weight average of the three predictions (weights sum to 1). This is a
# simple blend, not a stacking model, so it is labelled as such to keep this
# score distinguishable from the StackingRegressor result.
y_pred = 0.33*xgb_pred + 0.33*cat_pred + 0.34*lgb_pred
print("Weighted-average ensemble R2:", r2_score(y_val, y_pred))

Stacked R2: 0.9944866136289877
