In [8]:
import seaborn as sns
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, classification_report, f1_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [2]:
model_input = pd.read_csv("data/sp_training_data.csv")
model_input.rename(columns={'nmin_90' : 'leach'}, inplace = True)

# Separate columns
crop_column = model_input[['crop']]
doy_column = model_input[['doy']]

# Nmin conversion to binary
threshold = model_input['leach'].mean()
nmin_column = (model_input['leach'] > threshold).astype(int)

# Transform DOY into two dimensions using sine and cosine
doy_column['doy_sin'] = np.sin(2 * np.pi * doy_column['doy'] / 365)
doy_column['doy_cos'] = np.cos(2 * np.pi * doy_column['doy'] / 365)

# One-hot encode the 'crop' column
encoder = OneHotEncoder(sparse=False)
crop_encoded = encoder.fit_transform(crop_column)
crop_df = pd.DataFrame(crop_encoded, columns=encoder.get_feature_names_out(['crop']))

# Drop 'crop' and 'doy' columns and the specified 'field' from the original DataFrame
fields = model_input.drop(['crop', 'doy', 'leach'], axis=1)

# Scale the remaining numerical features
scaler = StandardScaler(with_mean=False)
scaled_fields = scaler.fit_transform(fields)
scaled_fields_df = pd.DataFrame(scaled_fields, columns=fields.columns)

# Concatenate the transformed 'crop', 'doy', and scaled numerical features
final_df = pd.concat([doy_column[['doy_sin', 'doy_cos']], crop_df, scaled_fields_df, nmin_column], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  doy_column['doy_sin'] = np.sin(2 * np.pi * doy_column['doy'] / 365)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  doy_column['doy_cos'] = np.cos(2 * np.pi * doy_column['doy'] / 365)
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_

In [13]:
# drops bands not in 4band spectrum and indices from them
fourband = final_df.drop(['band_1', 'band_3', 'band_5', 'band_7', 'NDRE', 'MCARI', 'LCCI'], axis = 1)

# drops rows with NA present (only occurs in 4band rows for extra bands)
NAmask  = final_df.apply(lambda x: x.notna().all(), axis=1)
eightband = final_df[NAmask]

# drops crop columns and bands not in 4band spectrum and indices from them 
nocrop = pd.concat([doy_column[['doy_sin', 'doy_cos']], scaled_fields_df, nmin_column], axis=1)
nocrop_four = nocrop.drop(['band_1', 'band_3', 'band_5', 'band_7', 'NDRE', 'MCARI', 'LCCI'], axis = 1)

# drops crop columns and rows with NA present (only occurs in 4band rows for extra bands))
nocrop_eight = nocrop[NAmask]

dfs = [
    [fourband, 'fourband'], 
    [eightband, 'eightband']
    # [nocrop_four, 'nocrop four'],
    # [nocrop_eight, 'nocrop eight']
    ]

yvar = 'leach'

In [14]:
models = [
    ("Logistic Regression", LogisticRegression(max_iter=10000)), # increased max_iter for convergence
    ("SVM", SVC(probability=True)), # set probability=True to ensure you can use methods like predict_proba if needed
    ("KNN", KNeighborsClassifier()),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("AdaBoost", AdaBoostClassifier(random_state=42))
]

for model_name, model_instance in models:
    print(f"\nModel: {model_name}\n{'-'*40}")
    for data, name in dfs:
        x = data.drop(yvar, axis=1)
        y = data[[yvar]]
        cn = data.columns #column names
        n = data.shape[0] #nrows

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(x, y.values.ravel(), test_size=0.3, random_state=42)

        # Train the model
        model_instance.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model_instance.predict(X_test)

        # Compute the confusion matrix
        conf_mat = confusion_matrix(y_test, y_pred)

        # Generate the classification report
        class_report = classification_report(y_test, y_pred)
        print("Classification Report:", name)
        print(class_report)

        # # Visualize the confusion matrix (Optional)
        # plt.figure(figsize=(8, 6))
        # sns.heatmap(conf_mat, annot=True, fmt="d", cmap="Blues")
        # plt.xlabel('Predicted')
        # plt.ylabel('Actual')
        # plt.title('Confusion Matrix')
        # plt.show()


Model: Logistic Regression
----------------------------------------
Classification Report: fourband
              precision    recall  f1-score   support

           0       0.76      0.86      0.81        36
           1       0.67      0.50      0.57        20

    accuracy                           0.73        56
   macro avg       0.71      0.68      0.69        56
weighted avg       0.72      0.73      0.72        56

Classification Report: eightband
              precision    recall  f1-score   support

           0       0.87      0.81      0.84        16
           1       0.40      0.50      0.44         4

    accuracy                           0.75        20
   macro avg       0.63      0.66      0.64        20
weighted avg       0.77      0.75      0.76        20


Model: SVM
----------------------------------------
Classification Report: fourband
              precision    recall  f1-score   support

           0       0.64      1.00      0.78        36
           1      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report: eightband
              precision    recall  f1-score   support

           0       0.84      1.00      0.91        16
           1       1.00      0.25      0.40         4

    accuracy                           0.85        20
   macro avg       0.92      0.62      0.66        20
weighted avg       0.87      0.85      0.81        20


Model: Decision Tree
----------------------------------------
Classification Report: fourband
              precision    recall  f1-score   support

           0       0.71      0.75      0.73        36
           1       0.50      0.45      0.47        20

    accuracy                           0.64        56
   macro avg       0.61      0.60      0.60        56
weighted avg       0.64      0.64      0.64        56

Classification Report: eightband
              precision    recall  f1-score   support

           0       0.79      0.69      0.73        16
           1       0.17      0.25      0.20         4

    accuracy         