In [3]:
import seaborn as sns
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, auc, balanced_accuracy_score, accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [4]:
model_input = pd.read_csv("data/sp_training_data.csv")
model_input.rename(columns={'nmin_90' : 'leach'}, inplace = True)

# Separate columns
crop_column = model_input[['crop']]
doy_column = model_input[['doy']]

# Nmin conversion to binary
threshold = model_input['leach'].mean()
nmin_column = (model_input['leach'] > threshold).astype(int)

# Transform DOY into two dimensions using sine and cosine
doy_column['doy_sin'] = np.sin(2 * np.pi * doy_column['doy'] / 365)
doy_column['doy_cos'] = np.cos(2 * np.pi * doy_column['doy'] / 365)

# One-hot encode the 'crop' column
encoder = OneHotEncoder(sparse=False)
crop_encoded = encoder.fit_transform(crop_column)
crop_df = pd.DataFrame(crop_encoded, columns=encoder.get_feature_names_out(['crop']))

# Drop 'crop' and 'doy' columns and the specified 'field' from the original DataFrame
fields = model_input.drop(['crop', 'doy', 'leach'], axis=1)

# Scale the remaining numerical features
scaler = StandardScaler(with_mean=False)
scaled_fields = scaler.fit_transform(fields)
scaled_fields_df = pd.DataFrame(scaled_fields, columns=fields.columns)

# Concatenate the transformed 'crop', 'doy', and scaled numerical features
final_df = pd.concat([doy_column[['doy_sin', 'doy_cos']], crop_df, scaled_fields_df, nmin_column], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  doy_column['doy_sin'] = np.sin(2 * np.pi * doy_column['doy'] / 365)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  doy_column['doy_cos'] = np.cos(2 * np.pi * doy_column['doy'] / 365)


In [5]:
# drops bands not in 4band spectrum and indices from them
fourband = final_df.drop(['band_1', 'band_3', 'band_5', 'band_7', 'NDRE', 'MCARI', 'LCCI'], axis = 1)

# drops rows with NA present (only occurs in 4band rows for extra bands)
NAmask  = final_df.apply(lambda x: x.notna().all(), axis=1)
eightband = final_df[NAmask]

# drops crop columns and bands not in 4band spectrum and indices from them 
nocrop = pd.concat([doy_column[['doy_sin', 'doy_cos']], scaled_fields_df, nmin_column], axis=1)
nocrop_four = nocrop.drop(['band_1', 'band_3', 'band_5', 'band_7', 'NDRE', 'MCARI', 'LCCI'], axis = 1)

# drops crop columns and rows with NA present (only occurs in 4band rows for extra bands))
nocrop_eight = nocrop[NAmask]

dfs = [
    [fourband, 'fourband'], 
    [eightband, 'eightband']
    # [nocrop_four, 'nocrop four'],
    # [nocrop_eight, 'nocrop eight']
    ]

yvar = 'leach'

In [7]:
models = [
    ("Logistic Regression", LogisticRegression(max_iter=10000)), # increased max_iter for convergence
    ("SVM", SVC(probability=True)), # set probability=True to ensure you can use methods like predict_proba if needed
    ("KNN", KNeighborsClassifier()),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("AdaBoost", AdaBoostClassifier(random_state=42))
]

for model_name, model_instance in models:
    print(f"\nModel: {model_name}\n{'-'*40}")
    for data, name in dfs:
        x = data.drop(yvar, axis=1)
        y = data[[yvar]]
        cn = data.columns #column names
        n = data.shape[0] #nrows
        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(x, y.values.ravel(), test_size=0.2, random_state=42)
        # Train the model
        model_instance.fit(X_train, y_train)
        # Predict on the test set
        y_pred = model_instance.predict(X_test)
        # Compute the confusion matrix
        # conf_mat = confusion_matrix(y_test, y_pred)

        # Generate the classification report
        # class_report = classification_report(y_test, y_pred)
        balacc = balanced_accuracy_score(y_test, y_pred)

        print("Balanced Accuracy:", name)
        print(balacc)

        # # Visualize the confusion matrix (Optional)
        # plt.figure(figsize=(8, 6))
        # sns.heatmap(conf_mat, annot=True, fmt="d", cmap="Blues")
        # plt.xlabel('Predicted')
        # plt.ylabel('Actual')
        # plt.title('Confusion Matrix')
        # plt.show()


Model: Logistic Regression
----------------------------------------
Balanced Accuracy: fourband
0.625
Balanced Accuracy: eightband
0.6136363636363636

Model: SVM
----------------------------------------
Balanced Accuracy: fourband
0.5
Balanced Accuracy: eightband
0.5

Model: KNN
----------------------------------------
Balanced Accuracy: fourband
0.6875
Balanced Accuracy: eightband
0.75

Model: Decision Tree
----------------------------------------
Balanced Accuracy: fourband
0.7083333333333333
Balanced Accuracy: eightband
0.3181818181818182

Model: Random Forest
----------------------------------------
Balanced Accuracy: fourband
0.6458333333333334
Balanced Accuracy: eightband
0.5

Model: Gradient Boosting
----------------------------------------
Balanced Accuracy: fourband
0.6458333333333334
Balanced Accuracy: eightband
0.36363636363636365

Model: AdaBoost
----------------------------------------
Balanced Accuracy: fourband
0.6875
Balanced Accuracy: eightband
0.45454545454545453
