# About

This is a notebook to train a random forest and calculate accuracy metrics using the iceplant data. The parameters are set up to create the final model for the project.


**NOTEBOOK PARAMETERS**
- `train_file_path` (str): file path to the training dataset
- `test_file_path` (str): file path to the test dataset
- `label_name` (str): name of the column containing the class data (iceplant/other vegetation)
- `cols` (array of str): name of the columns that will be used as features
- `save_model` (bool): whether to save the model or not
- `model_name` (str): name of the output model


In [1]:
import os
import time
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix

from joblib import dump

In [2]:
def print_accuracy_info(y_true,y_pred):
    """Print binary-classification accuracies derived from the confusion matrix.

    Row/column 0 of the confusion matrix is reported as the negative class (N)
    and row/column 1 as the positive class (P). The function prints the four
    confusion-matrix cells, the producer's and user's accuracy for each class,
    and the overall accuracy, all as percentages rounded to two decimals.

    Parameters
    ----------
    y_true : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, same length as `y_true`.

    Returns
    -------
    None
        Results are only printed.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2, i.e. the labels found in `y_true`
        and `y_pred` together do not form exactly two classes.
    """
    y_true = np.asarray(y_true)
    N = y_true.shape[0]

    confmtx = confusion_matrix(y_true,y_pred)
    if confmtx.shape != (2, 2):
        raise ValueError(
            'print_accuracy_info expects exactly two classes, '
            'got a confusion matrix of shape ' + str(confmtx.shape))

    tn, fp = confmtx[0,0], confmtx[0,1]
    fn, tp = confmtx[1,0], confmtx[1,1]

    # Class totals are taken from the matrix rows rather than from
    # np.unique(y_true): the rows always line up with the matrix labels, even
    # when one of the two classes never occurs in y_true.
    negatives = tn + fp
    positives = fn + tp

    def pct(num, den):
        """Return num/den as a percentage rounded to 2 decimals, nan if den is 0."""
        if den == 0:
            return float('nan')
        return np.round(num/den*100, 2)

    print('true negatives:', tn,
          '    false positives:', fp)
    print('false negatives:', fn,
          '    true positives:', tp)
    print()

    print("P producer's accuracy (TP/P):", pct(tp, positives), '%')
    print("P user's accuracy (TP/(TP+FP)):", pct(tp, tp + fp), '%')
    print("N producer's accuracy (TN/N):", pct(tn, negatives), '%')
    print("N user's accuracy (TN/(TN+FN)):", pct(tn, tn + fn), '%')
    print()
    print('overall accuracy:', pct(tp + tn, N), '%') # (TP + TN)/(P + N)
    print()

    return

In [3]:
# The repository is assumed to sit directly under the user's home directory;
# make it the working directory so the paths below are built from it.
home = os.path.expanduser("~")
os.chdir(os.path.join(home, 'iceplant-detection-santa-barbara'))

# ******************************************************
# ******************* PARAMETERS ***********************

# Folder containing the train and test CSV files.
root = os.path.join(os.getcwd(), 'data', 'iceplant_data', 'extended_dataset_final_model')
train_file_path = os.path.join(root, 'extended_dataset_final_model_train.csv')
test_file_path = os.path.join(root, 'extended_dataset_final_model_test.csv')

# Column holding the class label.
label_name = 'iceplant'

# Set save_model to True to write the fitted model to disk under model_name.
save_model = False
model_name = 'final_model'

# Feature columns, one row per band/index, followed by the date features.
cols = [
    'r', 'r_avg13', 'r_entr13',
    'g', 'g_avg13', 'g_entr13',
    'b', 'b_avg13', 'b_entr13',
    'nir', 'nir_avg13', 'nir_entr13',
    'ndvi', 'ndvi_avg13', 'ndvi_entr13',
    'month', 'day_in_year',
]

In [4]:
# ------------------------------
# Import data
# Each CSV is parsed once; features and labels are both taken from that frame.
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

# Class labels
y_train = train_df.loc[:, label_name]
y_test = test_df.loc[:, label_name]

# Keep only the feature columns listed in `cols`, in that order
X_train = train_df[cols]
X_test = test_df[cols]

# ------------------------------
X_train.head()

Unnamed: 0,r,r_avg13,r_entr13,g,g_avg13,g_entr13,b,b_avg13,b_entr13,nir,nir_avg13,nir_entr13,ndvi,ndvi_avg13,ndvi_entr13,month,day_in_year
0,99,125.31953,4.960809,111,120.04734,4.493296,86,109.18343,4.588089,151,130.84616,5.143071,0.208,0.017751,4.332828,5,143
1,112,112.84024,5.189259,115,115.54438,4.184464,92,92.01183,4.712946,165,159.50296,4.906877,0.191336,0.171598,4.743557,5,143
2,115,135.13017,5.531252,114,128.21301,4.939129,95,113.74556,5.047765,134,138.84024,5.087237,0.076305,0.011834,3.926158,5,143
3,126,123.15385,4.325254,131,126.15385,4.342409,100,93.59763,4.493142,156,153.05917,4.693632,0.106383,0.106509,3.950789,5,143
4,123,114.51479,4.893226,115,112.21893,4.484502,102,95.40237,4.368136,146,144.80473,5.283693,0.085502,0.112426,4.524214,5,143


In [5]:
# check columns from test and train match
# The model is fit and evaluated on `.to_numpy()` arrays, which carry no column
# names, so a mismatch must stop the run rather than only be displayed.
assert list(X_test.columns) == list(X_train.columns), 'train and test feature columns differ'
X_test.columns == X_train.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

In [6]:
# Fit the random forest on the training features.
# random_state is pinned so repeated runs give the same model.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train.to_numpy(), y_train.to_numpy())

# Optionally write the fitted model to code/B_model_training/<model_name>.joblib
if save_model:
    out_file_path = os.path.join(
        os.getcwd(), 'code', 'B_model_training', model_name + '.joblib'
    )
    dump(rfc, out_file_path)

# Predict on the test set and print the accuracy report.
preds = rfc.predict(X_test.to_numpy())
print_accuracy_info(y_test.to_numpy(), preds)

true negatives: 924     false positives: 76
false negatives: 205     true positives: 420

P producer's accuracy (TP/P): 67.2 %
P user's accuracy (TP/(TP+FP)): 84.68 %
N producer's accuracy (TN/N): 92.4 %
N user's accuracy (TN/(TN+FN)): 81.84 %

overall accuracy: 82.71 %

