In [None]:
# Initialization code that runs before all other cells
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_curve,auc,confusion_matrix,precision_score,recall_score, f1_score
from preprocessing import preprocess,get_labels
from extraction import feature_extraction
import pandas as pd
import numpy as np
import marimo as mo
import seaborn as sns
import matplotlib.pyplot as plt

# Load and Extract Features from Dataset

In [None]:
# Read the training split; the first CSV column becomes the row index.
# NOTE(review): "../../..data/" has no separator after the third "..", and the
# test split is read from "data/test_data.csv" — confirm the intended directory.
df = pd.read_csv("../../..data/train_data.csv", index_col=0)
# Project preprocessing step, called with an empty `attr` list.
X = preprocess(df, attr=[])
X

In [None]:
# Extract the target labels from the raw training frame.
y = df.pipe(get_labels)
y

# Create Model

In [None]:
# probability=True is needed for predict_proba later on. scikit-learn calibrates
# those probabilities with an internal, randomly shuffled cross-validation, so
# the seed is pinned to make repeated runs give the same scores.
svm = SVC(probability=True, random_state=42)
svm

# Hyper-Parameter Optimization

In [None]:
# Candidate hyper-parameters tried by the grid search:
# regularisation strength, kernel coefficient and kernel type.
grid = dict(
    C=[0.1, 1, 10],
    gamma=['scale', 'auto', 0.01, 0.1],
    kernel=['rbf', 'poly', 'sigmoid'],
)

In [None]:
# Exhaustive search over `grid` with cv=5, ranked by accuracy,
# evaluated with three parallel jobs.
search = GridSearchCV(svm, grid, scoring='accuracy', cv=5, n_jobs=3)

In [None]:
# Fit every grid candidate; the label frame is flattened to a 1-D array first.
search.fit(X, y.to_numpy().ravel())

In [None]:
# Show the winning candidate's score to two decimals as italic markdown.
# NOTE(review): per the scikit-learn docs, GridSearchCV.best_score_ is the mean
# cross-validated score of the best candidate, not a score on the training set.
mo.md(f"*Best training score for SVM is {search.best_score_:.2f}*")

In [None]:
# Held-out split, run through the same preprocessing and label extraction as
# the training data.
# NOTE(review): this path is relative to "data/" while the training split is
# read from "../../..data/" — confirm both point at the same data directory.
test = pd.read_csv("data/test_data.csv", index_col=0)
X_test = preprocess(test, attr=[])
y_test = get_labels(test)

In [None]:
# Best estimator found by the grid search (refit on the full training data).
optimal = search.best_estimator_
# Class-membership probabilities; columns follow optimal.classes_.
y_score = optimal.predict_proba(X_test)
# Predicted class expressed as a column index into optimal.classes_.
y_pred = y_score.argmax(axis = 1)
# Encode the true labels against the model's own class order, not the test
# set's categories, so the codes still line up with y_pred if the test split
# happens to lack one of the classes.
_true_codes = pd.Categorical(
    y_test['Plant_Health_Status'],
    categories=optimal.classes_
).codes
# Passing `labels` explicitly keeps the matrix square with one row/column per
# trained class, in classes_ order.
cm = confusion_matrix(
    _true_codes,
    y_pred,
    labels=np.arange(len(optimal.classes_))
)

# SVM Performance
The SVM struggled to differentiate between the levels of plant stress, whereas it handled the binary question of whether a plant was stressed or not much better.

In [None]:
# Rows/columns of `cm` are in sorted class order (the order of
# optimal.classes_). Series.unique() returns values in order of first
# appearance, which would attach the wrong names to the axes.
_class_names = list(optimal.classes_)
sns.heatmap(
    cm,annot=True,cmap='mako',
    fmt='d',  # integer counts; the default format switches to scientific notation
    xticklabels=_class_names,
    yticklabels=_class_names
)
plt.title("SVM Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")

In [None]:
# One-hot encode the true test labels (dense output) so each class gets its own
# binary column for the one-vs-rest ROC curves.
o = OneHotEncoder(sparse_output=False)
o.fit(y_test)
y_classes = o.transform(y_test)

# ROC Curves Per-Class
Both the Healthy and High Stress classes are predicted well, while the
Moderate Stress class shows significantly lower performance overall.

In [None]:
# Per-class true/false positive rates, kept for later inspection.
tprs = []
fprs = []
# Encoder feature names end in the class name after the last underscore.
names = [f.split("_")[-1] for f in o.get_feature_names_out()]
plt.figure(figsize=(8,6))
for i in range(y_score.shape[1]):
    # roc_curve returns (fpr, tpr, thresholds) in that order.
    fpr, tpr, _ = roc_curve(y_classes[:,i],y_score[:,i])
    # Area under the curve with FPR on the x axis and TPR on the y axis.
    a = auc(fpr,tpr)
    tprs.append(tpr)
    fprs.append(fpr)
    plt.plot(fpr,tpr,label = f"{names[i]} (AUC={a:.2f})")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(bbox_to_anchor = (1.08,1.1), ncol = 3,frameon = False)
plt.grid(True)
plt.show()

# Precision, Recall and F1

In [None]:
# Integer codes of the true test labels (categories are in sorted order).
_y_true = y_test['Plant_Health_Status'].astype('category').cat.codes
# scikit-learn metrics take (y_true, y_pred) in that order.
# NOTE(review): with average='micro' on single-label multiclass data, precision,
# recall and F1 all equal accuracy; consider 'macro' if per-class balance matters.
precision = precision_score(_y_true,y_pred,average='micro')
recall = recall_score(_y_true,y_pred,average='micro')
f1 = f1_score(_y_true,y_pred,average='micro')

In [None]:
# Render the three summary metrics, two decimals each, as markdown.
_report = f"""
Average Precision: {precision:.2f}\n
Average Recall: {recall:.2f}\n
Average F1: {f1:.2f}
"""
mo.md(_report)

In [None]:
def svm_train(X,y):
    """
        Trains an SVM on the dataset using a cross-validated grid search.

        Parameters:
            X: feature matrix passed to GridSearchCV.fit
            y: target labels; flattened to 1-D before fitting

        Returns a tuple (y_pred, y_score, optimal):
            y_pred:  for each row of X, the column index of the most probable
                     class (an index into optimal.classes_)
            y_score: the class probabilities from predict_proba on X
            optimal: the best estimator found by the search
    """
    # Seeded because probability=True relies on a randomized internal
    # cross-validation for its probability estimates.
    svm = SVC(probability=True, random_state=42)
    grid = {
        'C': [0.1, 1, 10],
        'gamma': ['scale','auto',0.01,0.1],
        'kernel': ['rbf','poly','sigmoid']
    }
    search = GridSearchCV(
        estimator=svm,
        param_grid=grid,
        cv = 5,
        scoring='accuracy',
        n_jobs=1
    )
    # The search must be fitted before best_estimator_ exists; without this
    # call the attribute access below raises AttributeError.
    search.fit(X, np.ravel(y))
    optimal = search.best_estimator_
    y_score = optimal.predict_proba(X)
    y_pred = y_score.argmax(axis = 1)
    return y_pred, y_score, optimal