In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score

In [11]:

# Reading data (the first CSV column becomes the row index) and
# dropping the columns that are not used as features.
df = pd.read_csv("assets/train.csv", index_col=0).drop(
    columns=['Find Distance from Main Vulcano (km)',
             'Atmospheric Pressure at Harvest (Pa)',
             'Magnetic orientation (degree)',
             'Soil pH where Grown'])

# Handling missing values using KNN imputation.
# The target is set aside so that only the feature columns are imputed.
df_edible = df['Edible']
df_rest = df.drop('Edible', axis=1)

# NOTE(review): the imputer is fitted on every training row, i.e. before the
# train/test split done later in the notebook, so the held-out rows influence
# the imputation. Consider fitting it on the training part only -- confirm.
imputer = KNNImputer(n_neighbors=5)

# fit_transform returns a bare array, so the original row index has to be
# passed back explicitly. Without it the frame gets a fresh 0..n-1 index and
# the label assignment below (which aligns on index labels, not on position)
# can attach labels to the wrong rows or fill them with NaN.
df_imputed = pd.DataFrame(
    imputer.fit_transform(df_rest),
    columns=df_rest.columns,
    index=df_rest.index,
)

# Adding back the target variable (index-aligned with the imputed features)
df_imputed['Edible'] = df_edible

# Removing outliers using z-score: keep a row only if every column lies
# within 5 standard deviations of that column's mean.
z_scores = (df_imputed - df_imputed.mean()) / df_imputed.std()
df_clean = df_imputed[(np.abs(z_scores) < 5).all(axis=1)]


In [12]:
# Separate the target column from the feature columns
y = df_clean["Edible"]
X = df_clean.drop("Edible", axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Feature scaling: the scaler is fitted on the training part only and the
# same fitted scaler is then applied to both parts
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [13]:
# Hyperparameter search space for the random forest (3 values per parameter)
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive grid search with 5-fold cross-validation, using all CPU cores
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Estimator selected by the grid search
best_rf = grid_search.best_estimator_

# Accuracy on the rows used for fitting
y_pred_train = best_rf.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_pred_train)

# Accuracy on the held-out rows
y_pred_test = best_rf.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


Train Accuracy: 1.0
Test Accuracy: 0.9020408163265307


In [14]:
# Kaggle submission
# Load the test set and drop the same columns that were dropped for training.
df_test = pd.read_csv("assets/test.csv", index_col=0).drop(
    columns=['Find Distance from Main Vulcano (km)',
             'Atmospheric Pressure at Harvest (Pa)',
             'Magnetic orientation (degree)',
             'Soil pH where Grown'])

# Fill missing values with the imputer that was fitted on the training
# features. The original row index is kept so predictions stay tied to
# their test rows.
df_test_imputed = pd.DataFrame(
    imputer.transform(df_test),
    columns=df_test.columns,
    index=df_test.index,
)

# No outlier filtering here: the results frame below is indexed by every
# test row, so no row may be dropped.

# Scale and predict on the IMPUTED frame (previously the raw frame was
# scaled, so the imputation had no effect on the predictions).
df_test_scaled = scaler.transform(df_test_imputed)
df_test_pred = best_rf.predict(df_test_scaled)

# One integer prediction per test row, written with an index column named "index"
df_results = pd.DataFrame(data=df_test_pred.astype(int), columns=["Edible"], index=df_test.index)
df_results.index.names = ["index"]
df_results.to_csv('assets/results.csv')