In [1]:
import os

# Get the current working directory
cwd = os.getcwd()

print(cwd)

import pandas as pd
from pathlib import Path

from sklearn.feature_extraction import DictVectorizer

#I have downloaded the zip from kaggle and used the docker desktop to insert the file into tmp folder in a folder called archive
path2csv = Path("/tmp/archive/")
csvlist = path2csv.glob("*.csv")
csvs = [pd.read_csv(g) for g in csvlist ]

#
print(csvs)

/home/jovyan
[    Soil_Type  Sunlight_Hours Water_Frequency Fertilizer_Type  Temperature  \
0        loam        5.192294       bi-weekly        chemical    31.719602   
1       sandy        4.033133          weekly         organic    28.919484   
2        loam        8.892769       bi-weekly            none    23.179059   
3        loam        8.241144       bi-weekly            none    18.465886   
4       sandy        8.374043       bi-weekly         organic    18.128741   
..        ...             ...             ...             ...          ...   
188     sandy        5.652000           daily            none    28.000000   
189      clay        7.528000          weekly        chemical    30.500000   
190      loam        4.934000       bi-weekly            none    24.500000   
191     sandy        8.273000           daily         organic    27.900000   
192      clay        6.732000          weekly            none    21.700000   

      Humidity  Growth_Milestone  
0    61.591861

In [2]:
#refactor to pandas object to follow example code 
df = pd.read_csv('/tmp/archive/plant_growth_data.csv');
df.head()

Unnamed: 0,Soil_Type,Sunlight_Hours,Water_Frequency,Fertilizer_Type,Temperature,Humidity,Growth_Milestone
0,loam,5.192294,bi-weekly,chemical,31.719602,61.591861,0
1,sandy,4.033133,weekly,organic,28.919484,52.422276,1
2,loam,8.892769,bi-weekly,none,23.179059,44.660539,0
3,loam,8.241144,bi-weekly,none,18.465886,46.433227,0
4,sandy,8.374043,bi-weekly,organic,18.128741,63.625923,0


In [3]:
# Preprocces columns to scikit learn structure 
df.isnull().sum()

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder , StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report , confusion_matrix, precision_score,recall_score, f1_score

num_cols = df.select_dtypes(include = ['int64','float64']).columns
cat_cols = df.select_dtypes(include = ['object']).columns



label_encoder = LabelEncoder()

for col in cat_cols:
    df[col] = label_encoder.fit_transform(df[col])

x= df.drop(['Growth_Milestone'],axis=1)
y= df['Growth_Milestone']

# Convert DataFrame to a list of dictionaries
df_dict = df.to_dict(orient='records')



# Initialize DictVectorizer
vec = DictVectorizer(sparse=False)

# Transform the data into a matrix
X = vec.fit_transform(df_dict)

# Define the target variable
y = df['Growth_Milestone']

print(X)

[[ 0.          0.         61.59186061 ...  5.19229409 31.71960241
   0.        ]
 [ 2.          1.         52.4222761  ...  4.0331327  28.91948412
   2.        ]
 [ 1.          0.         44.66053858 ...  8.89276857 23.17905889
   0.        ]
 ...
 [ 1.          0.         61.7        ...  4.934      24.5
   0.        ]
 [ 2.          1.         69.5        ...  8.273      27.9
   1.        ]
 [ 1.          0.         56.9        ...  6.732      21.7
   2.        ]]


In [4]:
#


X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
rfc = RandomForestClassifier(n_estimators = 300 , random_state=42)
rfc.fit(X_train.values,y_train)

rfc_pred = rfc.predict(X_test)
print(f'Accuracy Score: {accuracy_score(rfc_pred,y_test)}')
precision = precision_score(rfc_pred, y_test)
recall = recall_score(rfc_pred,y_test)
f1 = f1_score(y_test,rfc_pred)
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')
print(X_test.iloc[0])
print(rfc)

Accuracy Score: 0.5897435897435898
Precision: 0.6363636363636364
Recall: 0.6363636363636364
F1-score: 0.6363636363636364
Soil_Type           1.000000
Sunlight_Hours      8.822032
Water_Frequency     0.000000
Fertilizer_Type     2.000000
Temperature        26.509484
Humidity           35.940896
Name: 45, dtype: float64
RandomForestClassifier(n_estimators=300, random_state=42)




In [6]:
# Define function to predict growth milestone for a given record index
def predict_growth_milestone(record_index: int):
    # Ensure the index is within the range of the dataset
    if record_index < 0 or record_index >= len(df):
        return "Index out of range"
    
    # Extract the record (including preprocessing steps)
    record = df.drop(['Growth_Milestone'], axis=1).iloc[record_index].values.reshape(1, -1)

    # Standardize the input record
    record_scaled = scaler.transform(record)

    print(record_scaled)

    # Predict using the trained Random Forest model
    prediction = rfc.predict(record_scaled)

    # Return the prediction (0 or 1) based on the growth milestone
    if prediction == 1:
        return "Milestone Reached"
    else:
        return "Milestone Not Reached"


In [7]:
record_index = 20  # You can change this to any index within the dataset
result = predict_growth_milestone(record_index)
print(f"Record {record_index}: {result}")

[[ 1.24286687  0.94650076 -1.32287566  0.07624655 -1.76587469 -1.85786206]]
Record 20: Milestone Reached




In [8]:
#save model
import pickle
output_file = f'model_GrowthPredict.bin'

In [9]:
output_file

'model_GrowthPredict.bin'

In [10]:
with open(output_file, 'wb') as f_out:
    pickle.dump((vec, rfc),f_out)