<a href="https://colab.research.google.com/github/anniebritton/Eco-Drought-South-Dakota/blob/main/NDVI_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Workbook Setup**

In [None]:
# installs and import libraries
!pip install matplotlib
!pip install scikit-learn
!pip install lazypredict
!pip install shap

import pandas as pd
import numpy as np

# Scikit Learn
from sklearn.model_selection import KFold, cross_val_score
from sklearn.utils import shuffle

# Other Models
from xgboost import XGBRegressor
import lightgbm as lgb

# Tools
import lazypredict
import shap

In [3]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Import Data**

In [4]:
normalized_df = pd.read_csv('/content/drive/MyDrive/School/M.S./Courses/Capstone/Colab/Data/CSVs/NDVI_clean_preprocessed.csv')

In [None]:
normalized_df['date'] = pd.to_datetime(normalized_df['date'])
normalized_df = normalized_df.set_index('date')
normalized_df

#**Prelim ML Tests**

In [8]:
# try a random forest classifier
from sklearn.ensemble import RandomForestClassifier

# add in a boolean drought column (1 for drought, 0 for no drought)
normalized_df['drought_bool'] = np.where(normalized_df["pdsi_anomaly_roll"] >= 0, 0, 1)

X = normalized_df.iloc[:,0:15].values
Y = normalized_df.iloc[:,15:16].values.ravel()

clf = RandomForestClassifier(n_estimators=100)

# SUBSET YOUR X AND Y INTO TRAIN/TEST SETS; for instance, take first 80% of the rows as training, last 20% as test
X_train = normalized_df.iloc[0:1272, 0:15].values
Y_train = normalized_df.iloc[0:1272, 15:16].values.ravel()
X_test = normalized_df.iloc[1272:1590, 0:15].values
Y_test = normalized_df.iloc[1272:1590, 15:16].values

clf.fit(X_train, Y_train)
Y_predicted = clf.predict(X_test)

# compare Y_predicted with Y_test
clf.score(X_test, Y_test)

1.0

In [9]:
# try a decision tree regressor
from sklearn import tree

X = normalized_df.iloc[:,1:15].values
Y = normalized_df.iloc[:,0:1].values.ravel()

clf = tree.DecisionTreeRegressor()

X_train = normalized_df.iloc[0:1272, 1:15].values
Y_train = normalized_df.iloc[0:1272, 0:1].values.ravel()
X_test = normalized_df.iloc[1272:1590, 1:15].values
Y_test = normalized_df.iloc[1272:1590, 0:1].values

clf.fit(X_train, Y_train)
Y_predicted = clf.predict(X_test)

# compare Y_predicted with Y_test
clf.score(X_test, Y_test)

-0.20936093435004133

In [10]:
# try support vector regression
from sklearn import svm

X = normalized_df.iloc[:,1:15].values
Y = normalized_df.iloc[:,0:1].values.ravel()

regr = svm.SVR()

X_train = normalized_df.iloc[0:1272, 1:15].values
Y_train = normalized_df.iloc[0:1272, 0:1].values.ravel()
X_test = normalized_df.iloc[1272:1590, 1:15].values
Y_test = normalized_df.iloc[1272:1590, 0:1].values

regr.fit(X_train, Y_train)
Y_predicted = regr.predict(X_test)

# compare Y_predicted with Y_test
regr.score(X_test, Y_test)

0.26569359098678746

#**Trying Out LazyPredict**

In [13]:
# install and import
from lazypredict.Supervised import LazyRegressor

# Regression
reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)

X_train = normalized_df.iloc[0:1272, 0:15].values
y_train = normalized_df.iloc[0:1272, 15:16].values.ravel()

X_test = normalized_df.iloc[1272:1590, 0:15].values
y_test = normalized_df.iloc[1272:1590, 15:16].values

rmodel, rprediction = reg.fit(X_train, X_test, y_train, y_test)

rmodel

 19%|█▉        | 8/42 [00:00<00:01, 23.44it/s]

GammaRegressor model failed to execute
Some value(s) of y are out of the valid range of the loss 'HalfGammaLoss'.


100%|██████████| 42/42 [00:47<00:00,  1.13s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoostRegressor,1.0,1.0,0.0,0.02
DecisionTreeRegressor,1.0,1.0,0.0,0.03
BaggingRegressor,1.0,1.0,0.0,0.04
ExtraTreeRegressor,1.0,1.0,0.0,0.01
GradientBoostingRegressor,1.0,1.0,0.0,0.36
XGBRegressor,1.0,1.0,0.0,0.33
RandomForestRegressor,1.0,1.0,0.0,0.45
HistGradientBoostingRegressor,1.0,1.0,0.02,3.15
LGBMRegressor,0.99,0.99,0.05,0.34
ExtraTreesRegressor,0.99,0.99,0.05,0.21


#**K-fold Cross Validation for LazyPredict Regression**

In [14]:
from sklearn.model_selection import KFold

def k_fold_lp(data, target, k=5):
    # Create a KFold object with k folds
    kf = KFold(n_splits=k, shuffle = True, random_state=42)
    # Create an empty df to store scores for each fold
    scores = pd.DataFrame()
    # Loop over each fold
    for train_idx, test_idx in kf.split(data):
        # Split the data into train and test sets for this fold
        X_train, X_test = data[train_idx], data[test_idx]
        y_train, y_test = target[train_idx], target[test_idx]
        # Create a LazyRegressor model
        reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
        # Fit the model on the train data and make predictions on the test data
        models, predictions = reg.fit(X_train, X_test, y_train, y_test)
        # Append the score to the list of scores for this fold
        scores = scores.append(predictions)
    # Calculate the mean of all scores across all folds
    return scores

In [None]:
X = normalized_df.iloc[:, 1:15].values
y = normalized_df.iloc[:, 0:1].values.ravel()

mean_score = k_fold_lp(X, y)
mean_score = mean_score.sort_values(by = "Adjusted R-Squared", ascending = False)

In [16]:
mean_score.groupby("Model").mean().sort_values(by = "Adjusted R-Squared", ascending = False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GaussianProcessRegressor,0.93,0.93,0.26,0.3
ExtraTreesRegressor,0.93,0.93,0.26,0.94
XGBRegressor,0.89,0.89,0.32,0.61
HistGradientBoostingRegressor,0.89,0.89,0.32,0.8
LGBMRegressor,0.89,0.89,0.32,0.24
RandomForestRegressor,0.87,0.88,0.34,2.25
BaggingRegressor,0.86,0.86,0.37,0.22
KNeighborsRegressor,0.82,0.82,0.42,0.03
GradientBoostingRegressor,0.78,0.79,0.46,0.93
MLPRegressor,0.77,0.78,0.46,2.28


#**K-Fold on Individual Models**

In [17]:
from sklearn.gaussian_process import GaussianProcessRegressor

X = normalized_df.iloc[:, 1:15].values
y = normalized_df.iloc[:, 0:1].values.ravel()
#X, y = shuffle(X, y, random_state=42)

# Define your model
gpr_model = GaussianProcessRegressor()

# Define the number of folds for cross validation
num_folds = 5

# Define the k-fold cross validation object
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Run k-fold cross validation on the model
results = cross_val_score(gpr_model, X, y, cv=kfold, scoring='r2')

# Print the mean and standard deviation of the results
print("Results:", results)
print("Mean Result:", results.mean())

Results: [0.90848108 0.93312081 0.95463627 0.95008162 0.91378848]
Mean Result: 0.9320216516154772


In [18]:
from sklearn.ensemble import ExtraTreesRegressor

X = normalized_df.iloc[:, 1:15].values
y = normalized_df.iloc[:, 0:1].values.ravel()
#X, y = shuffle(X, y, random_state=42)

# Define your model
etr_model = ExtraTreesRegressor()

# Define the number of folds for cross validation
num_folds = 5

# Define the k-fold cross validation object
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Run k-fold cross validation on the model
results = cross_val_score(etr_model, X, y, cv=kfold, scoring='r2')

# Print the mean and standard deviation of the results
print("Results:", results)
print("Mean Result:", results.mean())

Results: [0.92793287 0.9425536  0.92334996 0.94433627 0.91302428]
Mean Result: 0.9302393977252572


In [19]:
import xgboost as xgb

X = normalized_df.iloc[:, 1:15].values
y = normalized_df.iloc[:, 0:1].values.ravel()
#X, y = shuffle(X, y, random_state=42)

# Define your model
xgb_model = xgb.XGBRegressor()

# Define the number of folds for cross validation
num_folds = 5

# Define the k-fold cross validation object
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Run k-fold cross validation on the model
results = cross_val_score(xgb_model, X, y, cv=kfold, scoring='r2')

# Print the mean and standard deviation of the results
print("Results:", results)
print("Mean Result:", results.mean())

Results: [0.87942551 0.91035714 0.85577327 0.92265321 0.8955487 ]
Mean Result: 0.8927515641699184


In [20]:
import lightgbm as lgb

X = normalized_df.iloc[:, 1:15].values
y = normalized_df.iloc[:, 0:1].values.ravel()
#X, y = shuffle(X, y, random_state=42)

# Define your model
lgb_model = lgb.LGBMRegressor()

# Define the number of folds for cross validation
num_folds = 5

# Define the k-fold cross validation object
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Run k-fold cross validation on the model
results = cross_val_score(lgb_model, X, y, cv=kfold, scoring='r2')

# Print the mean and standard deviation of the results
print("Results:", results)
print("Mean Result:", results.mean())

Results: [0.87286995 0.91336448 0.84863566 0.92111921 0.88985692]
Mean Result: 0.8891692448349631


#**Trying Different Cross Val Strategies**

In [21]:
from sklearn.model_selection import ShuffleSplit

X = normalized_df.iloc[:, 1:15].values
y = normalized_df.iloc[:, 0:1].values.ravel()

# Define your model
gpr_model = GaussianProcessRegressor()

# Define the cross validation object
ss = ShuffleSplit(n_splits=5, test_size=0.10, random_state=42)

# Run k-fold cross validation on the model
results = cross_val_score(gpr_model, X, y, cv=ss, scoring='r2')

# Print the mean and standard deviation of the results
print("Results:", results)
print("Mean Result:", results.mean())

Results: [0.93139087 0.9318324  0.94931258 0.92927714 0.94937186]
Mean Result: 0.93823696818901


In [22]:
from sklearn.model_selection import ShuffleSplit

X = normalized_df.iloc[:, 1:15].values
y = normalized_df.iloc[:, 0:1].values.ravel()

# Define your model
etr_model = ExtraTreesRegressor()

# Define the cross validation object
ss = ShuffleSplit(n_splits=5, test_size=0.10, random_state=42)

# Run k-fold cross validation on the model
results = cross_val_score(etr_model, X, y, cv=ss, scoring='r2')

# Print the mean and standard deviation of the results
print("Results:", results)
print("Mean Result:", results.mean())

Results: [0.94060216 0.94445671 0.94055004 0.93703941 0.95050994]
Mean Result: 0.9426316514581551


In [23]:
import xgboost as xgb

X = normalized_df.iloc[:, 1:15].values
y = normalized_df.iloc[:, 0:1].values.ravel()

# Define your model
xgb_model = xgb.XGBRegressor()

# Define the number of folds for cross validation
num_folds = 5

# Define the cross validation object
ss = ShuffleSplit(n_splits=5, test_size=0.20, random_state=42)

# Run k-fold cross validation on the model
results = cross_val_score(xgb_model, X, y, cv=ss, scoring='r2')

# Print the mean and standard deviation of the results
print("Results:", results)
print("Mean Result:", results.mean())

Results: [0.87942551 0.89157883 0.90930349 0.89146379 0.88331694]
Mean Result: 0.891017712220558


#**Hyperparameter Optimization** - Random Search

In [24]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

In [28]:
from xgboost import XGBRegressor

# Load your data
X = normalized_df.iloc[:, 1:15].values
y = normalized_df.iloc[:, 0:1].values.ravel()

# Define the distribution of hyperparameters to search over
param_dist = {
    'learning_rate': uniform(0.03, 0.09),
    'max_depth': randint(4, 10),
    'min_child_weight': randint(1, 8),
    'subsample': uniform(0.5, 0.5), # fix this
    'colsample_bytree': uniform(0.45, 0.7),
    'n_estimators': randint(300, 600)
}

# Define the regression model
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

# Define the random search object
random_search = RandomizedSearchCV(xgb, param_distributions=param_dist, cv=5, n_iter=50, random_state=42, verbose=2)

# Fit the random search to the training data
random_search.fit(X, y)

# Print the best hyperparameters and the corresponding mean squared error
print("Best hyperparameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END colsample_bytree=0.7121780831931537, learning_rate=0.11556428757689245, max_depth=6, min_child_weight=5, n_estimators=320, subsample=0.5780093202212182; total time=   1.9s
[CV] END colsample_bytree=0.7121780831931537, learning_rate=0.11556428757689245, max_depth=6, min_child_weight=5, n_estimators=320, subsample=0.5780093202212182; total time=   4.0s
[CV] END colsample_bytree=0.7121780831931537, learning_rate=0.11556428757689245, max_depth=6, min_child_weight=5, n_estimators=320, subsample=0.5780093202212182; total time=   3.9s
[CV] END colsample_bytree=0.7121780831931537, learning_rate=0.11556428757689245, max_depth=6, min_child_weight=5, n_estimators=320, subsample=0.5780093202212182; total time=   1.6s
[CV] END colsample_bytree=0.7121780831931537, learning_rate=0.11556428757689245, max_depth=6, min_child_weight=5, n_estimators=320, subsample=0.5780093202212182; total time=   0.8s
[CV] END colsample_bytree=0.55919

In [None]:
# Best hyperparameters:  {'colsample_bytree': 0.5367663185416978, 'learning_rate': 0.09938704619591049, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 306, 'subsample': 0.7137705091792748}
# Best score:  0.4665033791008263



In [29]:
from sklearn.ensemble import ExtraTreesRegressor

# Load your data
X = normalized_df.iloc[:, 1:15].values
y = normalized_df.iloc[:, 0:1].values.ravel()

# Define the distribution of hyperparameters to search over
param_dist = {
    'n_estimators': randint(10, 1000),
    'max_depth': [None] + list(range(1, 30)),
    'min_samples_split': randint(2, 50),
    'min_samples_leaf': randint(1, 50),
    'max_features': [1.0, 'sqrt', 'log2', None] + list(range(1, 10)),
    'bootstrap': [True, False],
}

# Define the regression model
etr = ExtraTreesRegressor(random_state=42)

# Define the random search object
random_search = RandomizedSearchCV(etr, param_distributions=param_dist, cv=5, n_iter=50, random_state=42, verbose=2)

# Fit the random search to the training data
random_search.fit(X, y)

# Print the best hyperparameters and the corresponding mean squared error
print("Best hyperparameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END bootstrap=True, max_depth=19, max_features=9, min_samples_leaf=15, min_samples_split=44, n_estimators=81; total time=   0.1s
[CV] END bootstrap=True, max_depth=19, max_features=9, min_samples_leaf=15, min_samples_split=44, n_estimators=81; total time=   0.2s
[CV] END bootstrap=True, max_depth=19, max_features=9, min_samples_leaf=15, min_samples_split=44, n_estimators=81; total time=   0.1s
[CV] END bootstrap=True, max_depth=19, max_features=9, min_samples_leaf=15, min_samples_split=44, n_estimators=81; total time=   0.1s
[CV] END bootstrap=True, max_depth=19, max_features=9, min_samples_leaf=15, min_samples_split=44, n_estimators=81; total time=   0.2s
[CV] END bootstrap=True, max_depth=20, max_features=3, min_samples_leaf=19, min_samples_split=24, n_estimators=340; total time=   0.6s
[CV] END bootstrap=True, max_depth=20, max_features=3, min_samples_leaf=19, min_samples_split=24, n_estimators=340; total time=   0.5

In [None]:
# update script so that it puts out a log file with a list of all the params across searches and best hyperparameters
# .fit has an option for a log file

#**Hyperparameter Optimization** - Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Load your data
X = normalized_df.iloc[:, 1:15].values
y = normalized_df.iloc[:, 0:1].values.ravel()

# Best hyperparameters from Random Search:  {'colsample_bytree': 0.5367663185416978, 'learning_rate': 0.09938704619591049, 
#                                             'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 306, 'subsample': 0.7137705091792748}

# Define the distribution of hyperparameters to search over
param_dist = {
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [6],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.5, 0.6, 0.7],
    'n_estimators': [300, 600]
}

# Define the regression model
xgb = XGBRegressor(objective='reg:squarederror')

# Define the search object
grid_search = GridSearchCV(xgb, param_dist, cv=5)

# Fit the random search to the training data
grid_search.fit(X, y)

# Print the best hyperparameters and the corresponding mean squared error
print("Best hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

In [None]:
# Load your data
X = normalized_df.iloc[:, 1:15].values
y = normalized_df.iloc[:, 0:1].values.ravel()

# Best hyperparameters from Random Search:  {'bootstrap': False, 'max_depth': 22, 'max_features': None, 
#                                             'min_samples_leaf': 11, 'min_samples_split': 18, 'n_estimators': 776}

# Define the distribution of hyperparameters to search over
param_dist = {
    'n_estimators': [700, 800, 900],
    'max_depth': [20, 25, 30],
    'min_samples_split': [16, 20, 24],
    'min_samples_leaf': [10, 12, 14],
    'max_features': [1.0, 'sqrt', 'log2', None],
    'bootstrap': [True, False],
}

# Define the regression model
etr = ExtraTreesRegressor()

# Define the search object
grid_search = GridSearchCV(xgb, param_dist, cv=5)

# Fit the random search to the training data
grid_search.fit(X, y)

# Print the best hyperparameters and the corresponding mean squared error
print("Best hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)