In [9]:
import pandas as pd  # type: ignore
import sklearn # type: ignore

# Model Training

In [10]:
# Load the plant-growth dataset from the project's data directory and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer="data/plant_growth_data.csv")
df.head(n=5)

Unnamed: 0,Soil_Type,Sunlight_Hours,Water_Frequency,Fertilizer_Type,Temperature,Humidity,Growth_Milestone
0,loam,5.192294,bi-weekly,chemical,31.719602,61.591861,0
1,sandy,4.033133,weekly,organic,28.919484,52.422276,1
2,loam,8.892769,bi-weekly,none,23.179059,44.660539,0
3,loam,8.241144,bi-weekly,none,18.465886,46.433227,0
4,sandy,8.374043,bi-weekly,organic,18.128741,63.625923,0


In [11]:
# Separate the target (dependent variable) from the predictors (independent variables)
y = df["Growth_Milestone"]
X = df.drop(columns=["Growth_Milestone"])

In [12]:
# Display the target Series (Growth_Milestone) to check its length and dtype
y

0      0
1      1
2      0
3      0
4      0
      ..
188    0
189    1
190    0
191    1
192    0
Name: Growth_Milestone, Length: 193, dtype: int64

In [13]:
# Display the feature frame: every column of df except Growth_Milestone
X

Unnamed: 0,Soil_Type,Sunlight_Hours,Water_Frequency,Fertilizer_Type,Temperature,Humidity
0,loam,5.192294,bi-weekly,chemical,31.719602,61.591861
1,sandy,4.033133,weekly,organic,28.919484,52.422276
2,loam,8.892769,bi-weekly,none,23.179059,44.660539
3,loam,8.241144,bi-weekly,none,18.465886,46.433227
4,sandy,8.374043,bi-weekly,organic,18.128741,63.625923
...,...,...,...,...,...,...
188,sandy,5.652000,daily,none,28.000000,70.200000
189,clay,7.528000,weekly,chemical,30.500000,60.100000
190,loam,4.934000,bi-weekly,none,24.500000,61.700000
191,sandy,8.273000,daily,organic,27.900000,69.500000


In [14]:
# Partition the feature names by dtype: object-typed columns are routed to the
# categorical pipeline, every other column to the numerical pipeline.
numerical_cols = X.select_dtypes(exclude=['object']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

In [15]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder # handling feature scaling and nominal encoding
from sklearn.impute import SimpleImputer # handling missing value

from sklearn.pipeline import Pipeline  # type: ignore
from sklearn.compose import ColumnTransformer

In [16]:
# Numerical features: fill missing values with the column median, then
# standardise (zero mean / unit variance as learned from the data passed to fit).
num_pipeline = Pipeline(
    steps = [
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

# Categorical features: fill missing values with the most frequent level, then
# one-hot encode.
# handle_unknown='ignore' makes transform() emit an all-zero row for a level
# that was not present when the encoder was fitted; with the default ('error')
# transforming the test split raises as soon as it contains a level that the
# training split happened not to include.
cat_pipeline = Pipeline(
    steps = [
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('NominalEncoding',OneHotEncoder(handle_unknown='ignore'))
    ]
)


# Route each column group through its own pipeline. Columns listed in neither
# group are dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)
])


In [17]:
## Train test split

from sklearn.model_selection import train_test_split # type: ignore
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=30,
)

In [18]:
# Fit the preprocessor on the training rows only, then apply the already-fitted
# transform to the held-out rows, so imputation/scaling statistics never come
# from the test split.
#
# The raw rows are looked up in X through the index labels that y_train / y_test
# still carry from the split, rather than being read from X_train / X_test.
# This cell overwrites X_train / X_test with the encoded frames, so reading them
# back as input would make a second execution of the cell fail (the raw columns
# would no longer exist). Sourcing from X keeps the cell safely re-runnable.
train_features = preprocessor.fit_transform(X.loc[y_train.index])
test_features = preprocessor.transform(X.loc[y_test.index])

# Output column names are identical for both splits, so ask for them once.
feature_names = preprocessor.get_feature_names_out()

# NOTE: the resulting frames get a fresh 0..n-1 index while y_train / y_test keep
# their original labels; rows correspond by position, not by index label.
X_train = pd.DataFrame(train_features, columns=feature_names)
X_test = pd.DataFrame(test_features, columns=feature_names)

In [19]:
# Preview the preprocessed training features (scaled numeric + one-hot columns)
X_train.head()

Unnamed: 0,num_pipeline__Sunlight_Hours,num_pipeline__Temperature,num_pipeline__Humidity,cat_pipeline__Soil_Type_clay,cat_pipeline__Soil_Type_loam,cat_pipeline__Soil_Type_sandy,cat_pipeline__Water_Frequency_bi-weekly,cat_pipeline__Water_Frequency_daily,cat_pipeline__Water_Frequency_weekly,cat_pipeline__Fertilizer_Type_chemical,cat_pipeline__Fertilizer_Type_none,cat_pipeline__Fertilizer_Type_organic
0,-0.016077,0.176913,0.271632,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,-1.600133,-0.405307,0.264233,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1.384112,1.208524,0.263707,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1.267096,-1.880536,-2.148042,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.184131,1.074797,-0.40199,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier


from sklearn.metrics import accuracy_score,confusion_matrix


In [21]:
# Baseline: logistic regression with default hyper-parameters, fitted on the
# preprocessed training split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)


In [22]:
# Score of the fitted model on the same data it was trained on; this is a
# training-set figure, not a held-out estimate.
model.score(X_train,y_train)

0.6666666666666666

In [23]:
import numpy as np
def evaluate_model(true,predicted):
    """Score a set of predictions against the ground-truth labels.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    predicted : array-like
        Labels predicted by a model, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix)`` as returned by
        ``accuracy_score(true, predicted)`` and
        ``confusion_matrix(true, predicted)`` respectively.
    """
    return accuracy_score(true, predicted), confusion_matrix(true, predicted)


In [36]:
# Train multiple models
# Fixed seed for the estimators that draw random numbers while fitting, so the
# reported scores are reproducible across runs (same value as the train/test split).
MODEL_SEED = 30

# Display label -> unfitted estimator. The labels are reused as dictionary keys
# and printed verbatim, so they are kept as-is; note that 'Decision Tree Regressor'
# actually holds a DecisionTreeClassifier and "Naib Bias" a Gaussian naive Bayes model.
models = {
    'Logistic Regression' : LogisticRegression(),
    'Decision Tree Regressor' : DecisionTreeClassifier(random_state=MODEL_SEED),
    "Naib Bias" : GaussianNB(),
    'SVC' : SVC(),
    'KNN': KNeighborsClassifier(),
    'Random Forest Classifier' : RandomForestClassifier(random_state=MODEL_SEED),
    'Adaboosting' : AdaBoostClassifier(random_state=MODEL_SEED),
    'GradientBoosting' : GradientBoostingClassifier(random_state=MODEL_SEED),
    "Xgboosting" : XGBClassifier()

}

# Despite its name, trained_model_dict maps each label to its accuracy on the
# test split, not to the fitted estimator.
trained_model_dict = {}
model_list = []
accuracy_list = []

# Iterate label/estimator pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for name, model in models.items():
    model.fit(X_train,y_train)
    # make predictions on the held-out split
    y_pred = model.predict(X_test)

    accuracy , matrix=evaluate_model(y_test,y_pred)

    print(name)
    model_list.append(name)
    trained_model_dict[name] = accuracy

    # The figures printed below are computed on X_test / y_test, even though
    # the heading says "Training".
    print('Model Training Performance')
    print('accuracy score',accuracy)
    print('confussion matrix',matrix)

    accuracy_list.append(accuracy)

    print('='*35)
    print('\n')
    


Logistic Regression
Model Training Performance
accuracy score 0.603448275862069
confussion matrix [[18 16]
 [ 7 17]]


Decision Tree Regressor
Model Training Performance
accuracy score 0.5172413793103449
confussion matrix [[21 13]
 [15  9]]


Naib Bias
Model Training Performance
accuracy score 0.7068965517241379
confussion matrix [[21 13]
 [ 4 20]]


SVC
Model Training Performance
accuracy score 0.5517241379310345
confussion matrix [[17 17]
 [ 9 15]]


KNN
Model Training Performance
accuracy score 0.5517241379310345
confussion matrix [[16 18]
 [ 8 16]]


Random Forest Classifier
Model Training Performance
accuracy score 0.5344827586206896
confussion matrix [[17 17]
 [10 14]]


Adaboosting
Model Training Performance
accuracy score 0.41379310344827586
confussion matrix [[14 20]
 [14 10]]


GradientBoosting
Model Training Performance
accuracy score 0.4827586206896552
confussion matrix [[13 21]
 [ 9 15]]


Xgboosting
Model Training Performance
accuracy score 0.5172413793103449
confussion m

In [37]:
# Summary of the comparison: model label -> accuracy on the test split
# (the dict stores scores, not the fitted estimators).
trained_model_dict

{'Logistic Regression': 0.603448275862069,
 'Decision Tree Regressor': 0.5172413793103449,
 'Naib Bias': 0.7068965517241379,
 'SVC': 0.5517241379310345,
 'KNN': 0.5517241379310345,
 'Random Forest Classifier': 0.5344827586206896,
 'Adaboosting': 0.41379310344827586,
 'GradientBoosting': 0.4827586206896552,
 'Xgboosting': 0.5172413793103449}