## Init + Transform

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

import os
import sys
import joblib
import numpy as np
import pandas as pd
from joblib import dump
import subprocess as sp
# import dask.array as da
from pprint import pprint
import matplotlib.pyplot as plt
# from odc.io.cgroups import get_cpu_quota
from sklearn.model_selection import GridSearchCV, GroupKFold, KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_curve, auc, balanced_accuracy_score, accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.exceptions import FitFailedWarning

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=FitFailedWarning)
warnings.filterwarnings("ignore", message="One or more of the test scores are non-finite:")
warnings.filterwarnings("ignore", message="Liblinear failed to converge, increase the number of iterations.")


In [4]:
model_input = pd.read_csv("data/dataset_1.csv")
model_input.rename(columns={'nmin_90' : 'leach'}, inplace = True)

# Separate columns
crop_column = model_input[['crop']]
doy_column = model_input[['doy']]
cluster_column = model_input[['cluster']]

# Nmin conversion to binary
threshold = model_input['leach'].mean()
nmin_column = (model_input['leach'] > threshold).astype(int)

# Transform DOY into two dimensions using sine and cosine
doy_column['doy_sin'] = np.sin(2 * np.pi * doy_column['doy'] / 365)
doy_column['doy_cos'] = np.cos(2 * np.pi * doy_column['doy'] / 365)

# One-hot encode the 'crop' column
encoder = OneHotEncoder(sparse=False)
crop_encoded = encoder.fit_transform(crop_column)
crop_df = pd.DataFrame(crop_encoded, columns=encoder.get_feature_names_out(['crop']))

# Drop columns not to be scaled
fields = model_input.drop(['crop', 'doy', 'leach', 'cluster', 'patch'], axis=1)

# Scale the remaining numerical features
scaler = StandardScaler(with_mean=False)
scaled_fields = scaler.fit_transform(fields)
scaled_fields_df = pd.DataFrame(scaled_fields, columns=fields.columns)

# Concatenate the transformed 'crop', 'doy', and scaled numerical features
final_df = pd.concat([doy_column[['doy_sin', 'doy_cos']], crop_df, scaled_fields_df, cluster_column, nmin_column], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  doy_column['doy_sin'] = np.sin(2 * np.pi * doy_column['doy'] / 365)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  doy_column['doy_cos'] = np.cos(2 * np.pi * doy_column['doy'] / 365)


## Group

In [5]:
# drops bands not in 4band spectrum and indices from them
fourband = final_df.drop(['band_1', 'band_3', 'band_5', 'band_7', 'NDRE', 'MCARI', 'LCCI'], axis = 1)

# drops rows with NA present (only occurs in 4band rows for extra bands)
NAmask  = final_df.apply(lambda x: x.notna().all(), axis=1)
eightband = final_df[NAmask]

# drops crop columns and bands not in 4band spectrum and indices from them 
nocrop = pd.concat([doy_column[['doy_sin', 'doy_cos']], scaled_fields_df, cluster_column, nmin_column], axis=1)
nocrop_four = nocrop.drop(['band_1', 'band_3', 'band_5', 'band_7', 'NDRE', 'MCARI', 'LCCI'], axis = 1)

# drops crop columns and rows with NA present (only occurs in 4band rows for extra bands))
nocrop_eight = nocrop[NAmask]


## GridSearch CV for Models

### Function

In [6]:
def perform_grid_search_cv(data, yvar, param_grid, Classifier, metric, cln):
    gkf = GroupKFold(n_splits=6)

    # Vars for data subset
    X = data.drop(yvar, axis=1)
    y = data[yvar]  # Changed to 1D array
    cluster = data['cluster'].values

    # Determine best model for data and classifier using GridSearchCV
    clf = GridSearchCV(Classifier,
                       param_grid,
                       scoring=metric,
                       verbose=1,
                       cv=gkf.split(X, y, cluster),
                       n_jobs=-1)

    # Fit using all data to find the best parameters
    clf.fit(X, y)
    y_pred = clf.best_estimator_.predict(X)
    cm = confusion_matrix(y, y_pred, labels= [1,0])
    cm = cm / X.shape[0]

    # Output mean of the metrics
    print(clf.best_estimator_)
    print(f"N rows: {X.shape[0]}")
    print(f"Balanced Accuracy: {clf.best_score_}")
    print(f"Total No Leach Points: {final_df['leach'].value_counts()[0]}")
    print(f"Total Leach Points: {final_df['leach'].value_counts()[1]}")
    print("Confusion Matrix:")
    print(cm)

    # Store model
    final_model = clf.best_estimator_
    final_model.fit(X,y)

    model_name = "".join(["models/m1/", cln, "_m1.joblib"])
    dump(final_model, model_name)

### Random Forest

In [8]:
param_grid = {
    'class_weight': ['balanced', None],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [None,10,50,100],
    'n_estimators': [100,200,500],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

Classifier = RandomForestClassifier(random_state=0)
cl_name = "RF"
metric = 'balanced_accuracy' #see options at https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
yvar = 'leach'

perform_grid_search_cv(fourband, yvar, param_grid, Classifier, metric, cl_name)
print("-"*100)
# perform_grid_search_cv(eightband, yvar, param_grid, Classifier, metric, cl_name)

Fitting 6 folds for each of 1296 candidates, totalling 7776 fits
RandomForestClassifier(criterion='entropy', n_estimators=200, random_state=0)
N rows: 177
Balanced Accuracy: 0.7586093402741
Total No Leach Points: 116
Total Leach Points: 61
Confusion Matrix:
[[0.34463277 0.        ]
 [0.         0.65536723]]
----------------------------------------------------------------------------------------------------


### Logistic Regression

In [9]:
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', None],
    'C': [0.1, 0.5, 1, 5, 10],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'class_weight': [None, 'balanced']
}

Classifier = LogisticRegression(max_iter = 1000, random_state=0)
cl_name = "LogReg"
metric = 'balanced_accuracy' #see options at https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
yvar = 'leach'

perform_grid_search_cv(fourband, yvar, param_grid, Classifier, metric, cl_name)
print("-"*100)
# perform_grid_search_cv(eightband, yvar, param_grid, Classifier, metric, cl_name)

Fitting 6 folds for each of 200 candidates, totalling 1200 fits
LogisticRegression(C=5, max_iter=1000, random_state=0, solver='newton-cg')
N rows: 177
Balanced Accuracy: 0.772743005151472
Total No Leach Points: 116
Total Leach Points: 61
Confusion Matrix:
[[0.25423729 0.09039548]
 [0.03954802 0.61581921]]
----------------------------------------------------------------------------------------------------




### KNN

In [10]:
param_grid = {
    'n_neighbors': [2, 3, 4, 5, 10, 15],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'metric': ['euclidean', 'manhattan']
}

Classifier = KNeighborsClassifier()
cl_name = "KNN"
metric = 'balanced_accuracy' #see options at https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
yvar = 'leach'

perform_grid_search_cv(fourband, yvar, param_grid, Classifier, metric, cl_name)
print("-"*100)
# perform_grid_search_cv(eightband, yvar, param_grid, Classifier, metric, cl_name)

Fitting 6 folds for each of 96 candidates, totalling 576 fits
KNeighborsClassifier(metric='manhattan', n_neighbors=4, weights='distance')
N rows: 177
Balanced Accuracy: 0.7735230645699752
Total No Leach Points: 116
Total Leach Points: 61
Confusion Matrix:
[[0.34463277 0.        ]
 [0.         0.65536723]]
----------------------------------------------------------------------------------------------------
