## Init + Transform

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

import os
import sys
import joblib
import numpy as np
import pandas as pd
from joblib import dump
import subprocess as sp
# import dask.array as da
from pprint import pprint
import matplotlib.pyplot as plt
# from odc.io.cgroups import get_cpu_quota
from sklearn.model_selection import GridSearchCV, GroupKFold, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_curve, auc, balanced_accuracy_score, accuracy_score, f1_score, roc_auc_score
from sklearn.exceptions import FitFailedWarning

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=FitFailedWarning)
warnings.filterwarnings("ignore", message="One or more of the test scores are non-finite:")
warnings.filterwarnings("ignore", message="Liblinear failed to converge, increase the number of iterations.")


In [2]:
model_input = pd.read_csv("data/sp_training_data.csv")
model_input.rename(columns={'nmin_90' : 'leach'}, inplace = True)

# Separate columns
crop_column = model_input[['crop']]
doy_column = model_input[['doy']]
cluster_column = model_input[['cluster']]

# Nmin conversion to binary
threshold = model_input['leach'].mean()
nmin_column = (model_input['leach'] > threshold).astype(int)

# Transform DOY into two dimensions using sine and cosine
doy_column['doy_sin'] = np.sin(2 * np.pi * doy_column['doy'] / 365)
doy_column['doy_cos'] = np.cos(2 * np.pi * doy_column['doy'] / 365)

# One-hot encode the 'crop' column
encoder = OneHotEncoder(sparse=False)
crop_encoded = encoder.fit_transform(crop_column)
crop_df = pd.DataFrame(crop_encoded, columns=encoder.get_feature_names_out(['crop']))

# Drop 'crop' and 'doy' columns and the specified 'field' from the original DataFrame
fields = model_input.drop(['crop', 'doy', 'leach', 'cluster'], axis=1)

# Scale the remaining numerical features
scaler = StandardScaler(with_mean=False)
scaled_fields = scaler.fit_transform(fields)
scaled_fields_df = pd.DataFrame(scaled_fields, columns=fields.columns)

# Concatenate the transformed 'crop', 'doy', and scaled numerical features
final_df = pd.concat([doy_column[['doy_sin', 'doy_cos']], crop_df, scaled_fields_df, cluster_column, nmin_column], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  doy_column['doy_sin'] = np.sin(2 * np.pi * doy_column['doy'] / 365)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  doy_column['doy_cos'] = np.cos(2 * np.pi * doy_column['doy'] / 365)


In [3]:
final_df['leach'].value_counts()

leach
0    121
1     64
Name: count, dtype: int64

## Group

In [4]:
# drops bands not in 4band spectrum and indices from them
fourband = final_df.drop(['band_1', 'band_3', 'band_5', 'band_7', 'NDRE', 'MCARI', 'LCCI'], axis = 1)

# drops rows with NA present (only occurs in 4band rows for extra bands)
NAmask  = final_df.apply(lambda x: x.notna().all(), axis=1)
eightband = final_df[NAmask]

# drops crop columns and bands not in 4band spectrum and indices from them 
nocrop = pd.concat([doy_column[['doy_sin', 'doy_cos']], scaled_fields_df, cluster_column, nmin_column], axis=1)
nocrop_four = nocrop.drop(['band_1', 'band_3', 'band_5', 'band_7', 'NDRE', 'MCARI', 'LCCI'], axis = 1)

# drops crop columns and rows with NA present (only occurs in 4band rows for extra bands))
nocrop_eight = nocrop[NAmask]


## GridSearch CV for Models

### Function

In [12]:
def perform_grid_search_cv(data, yvar, param_grid, Classifier, metric, cln):
    gkf = GroupKFold(n_splits=5)

    # Vars for data subset
    X = data.drop(yvar, axis=1)
    y = data[yvar]  # Changed to 1D array
    cluster = data['cluster'].values

    # Lists to store results of CV testing
    acc = []
    balacc = []
    f1 = []
    roc_auc = []

    # Determine best model for data and classifier using GridSearchCV
    clf = GridSearchCV(Classifier,
                        param_grid,
                        scoring=metric,
                        verbose=1,
                        cv=gkf.split(X, y, cluster),
                        n_jobs=-1)

    # Fit using all data to find the best parameters
    clf.fit(X, y)

    for train_index, test_index in gkf.split(X, y, cluster):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Use the best estimator from GridSearchCV
        best_model = clf.best_estimator_

        # Fit and predict
        best_model.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)

        # Calculate metrics and append to lists
        acc.append(accuracy_score(y_test, y_pred))
        balacc.append(balanced_accuracy_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))
        roc_auc.append(roc_auc_score(y_test, y_pred))

    # Output mean of the metrics
    print(clf.best_estimator_)
    print(f"Mean Balanced Accuracy: {np.mean(balacc)}")
    print(f"Std Balanced Accuracy: {np.std(acc)}")
    print(f"N rows: {X.shape[0]}")
    # print(f"Mean Accuracy: {np.mean(acc)}")
    # print(f"Std Accuracy: {np.std(acc)}")
    # print(f"Mean F1 Score: {np.mean(f1)}")
    # print(f"Std F1 Score: {np.std(f1)}")
    # print(f"Mean ROC AUC: {np.mean(roc_auc)}")
    # print(f"Std ROC AUC: {np.std(roc_auc)}")


    # Store model
    final_model = clf.best_estimator_
    final_model.fit(X,y)

    model_name = "".join(["models/sp_s/", cln, "_model.joblib"])
    dump(final_model, model_name)

### Random Forest

In [13]:
param_grid = {
    'class_weight': ['balanced', None],
    'max_features': ['sqrt', 'log2', None],
    'n_estimators': [10, 25, 50, 500],
    'criterion': ['gini', 'entropy']
}

Classifier = RandomForestClassifier(random_state=0)
cl_name = "RF"
metric = 'balanced_accuracy' #see options at https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
yvar = 'leach'

perform_grid_search_cv(fourband, yvar, param_grid, Classifier, metric, cl_name)
print("-"*100)
perform_grid_search_cv(eightband, yvar, param_grid, Classifier, metric, cl_name)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
RandomForestClassifier(class_weight='balanced', max_features=None,
                       n_estimators=25, random_state=0)
Mean Balanced Accuracy: 0.7715859701735064
Std Balanced Accuracy: 0.10263909539092704
N rows: 185
----------------------------------------------------------------------------------------------------
Fitting 5 folds for each of 48 candidates, totalling 240 fits
RandomForestClassifier(max_features=None, n_estimators=25, random_state=0)
Mean Balanced Accuracy: 0.6716583416583416
Std Balanced Accuracy: 0.11629619301331778
N rows: 64


### Logistic Regression

In [14]:
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', None],
    'C': [0.1, 0.5, 1, 5, 10],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'class_weight': [None, 'balanced']
}

Classifier = LogisticRegression(random_state=0)
cl_name = "LogReg"
metric = 'balanced_accuracy' #see options at https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
yvar = 'leach'

perform_grid_search_cv(fourband, yvar, param_grid, Classifier, metric, cl_name)
print("-"*100)
perform_grid_search_cv(eightband, yvar, param_grid, Classifier, metric, cl_name)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
LogisticRegression(C=5, class_weight='balanced', penalty='l1', random_state=0,
                   solver='liblinear')
Mean Balanced Accuracy: 0.7824777778398572
Std Balanced Accuracy: 0.06014579832242601
N rows: 185
----------------------------------------------------------------------------------------------------
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
LogisticRegression(C=5, penalty='l1', random_state=0, solver='liblinear')
Mean Balanced Accuracy: 0.7418181818181818
Std Balanced Accuracy: 0.07332554246959005
N rows: 64


### KNN

In [15]:
param_grid = {
    'n_neighbors': [2, 3, 4, 5, 10, 15],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'metric': ['euclidean', 'manhattan']
}

Classifier = KNeighborsClassifier()
cl_name = "KNN"
metric = 'balanced_accuracy' #see options at https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
yvar = 'leach'

perform_grid_search_cv(fourband, yvar, param_grid, Classifier, metric, cl_name)
print("-"*100)
perform_grid_search_cv(eightband, yvar, param_grid, Classifier, metric, cl_name)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
KNeighborsClassifier(metric='manhattan', n_neighbors=4, weights='distance')
Mean Balanced Accuracy: 0.7585943186445034
Std Balanced Accuracy: 0.0647674642250535
N rows: 185
----------------------------------------------------------------------------------------------------
Fitting 5 folds for each of 96 candidates, totalling 480 fits
KNeighborsClassifier(metric='manhattan')
Mean Balanced Accuracy: 0.6573076923076923
Std Balanced Accuracy: 0.07031029292898729
N rows: 64
