# About

This is a notebook to train a random forest and calculate accuracy metrics using the iceplant data. 


**NOTEBOOK PARAMETERS**
- `train_file_path` (str): file path to the training dataset
- `test_file_path` (str): file path to the test dataset
- `label_name` (str): name of the column containing the class data (iceplant/other vegetation)
- `cols` (array of str): name of the columns that will be used as features
- `save_model` (bool): whether to save the model or not
- `model_name` (str): name of the output model


In [1]:
import os
import time
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix

from joblib import dump

In [2]:
def _percent(numerator, denominator):
    """Return numerator/denominator as a percentage rounded to 2 decimals.

    Returns nan when the denominator is zero, so an undefined metric is
    printed as 'nan' instead of raising a division-by-zero warning.
    """
    if denominator == 0:
        return np.nan
    return np.round(numerator / denominator * 100, 2)


def print_accuracy_info(y_true, y_pred, labels=None):
    """Print accuracy metrics of a binary classification from its confusion matrix.

    The class at index 0 of the confusion matrix is reported as the negative
    class (N) and the class at index 1 as the positive class (P).

    Parameters
    ----------
    y_true : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, same length as `y_true`.
    labels : array-like of length 2, optional
        The two class labels as [negative, positive]. Passed on to
        `confusion_matrix`. With the default (None) the sorted labels that
        appear in `y_true` or `y_pred` are used, so both classes must be
        present in the data; pass `labels` explicitly to evaluate data in
        which one class may be missing.

    Returns
    -------
    None
        The metrics are printed, nothing is returned.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2 (i.e. not exactly two classes).
    """
    confmtx = confusion_matrix(y_true, y_pred, labels=labels)

    # The metrics below are only defined for a binary problem. Fail with a
    # clear message instead of an IndexError (single class) or silently
    # reporting the top-left corner of a larger matrix (more than two classes).
    if confmtx.shape != (2, 2):
        raise ValueError(
            'print_accuracy_info expects exactly two classes, but the confusion '
            'matrix has shape ' + str(confmtx.shape) + '; pass `labels` to fix the classes.')

    # Rows are the true classes and columns the predicted classes.
    tn, fp, fn, tp = confmtx.ravel()

    print('true negatives:', tn,
          '    false positives:', fp)
    print('false negatives:', fn,
          '    true positives:', tp)
    print()

    # Class totals are taken from the matrix itself (row sums = true counts,
    # column sums = predicted counts) so they always line up with tn/fp/fn/tp,
    # even when one class is absent from y_true.
    print("P producer's accuracy (TP/P):", _percent(tp, tp + fn), '%')
    print("P user's accuracy (TP/(TP+FP)):", _percent(tp, tp + fp), '%')
    print("N producer's accuracy (TN/N):", _percent(tn, tn + fp), '%')
    print("N user's accuracy (TN/(TN+FN)):", _percent(tn, tn + fn), '%')
    print()
    print('overall accuracy:', _percent(tp + tn, confmtx.sum()), '%')  # (TP + TN)/(P + N)
    print()

    return

In [3]:
# The repository is assumed to sit directly inside the user's home directory.
# Make it the working directory so the paths below resolve from the repo root.
home = os.path.expanduser("~")
os.chdir(os.path.join(home,'iceplant-detection-santa-barbara'))

# ******************************************************
# ******************* PARAMETERS ***********************
# Folder holding the train and test CSV files
root = os.path.join(os.getcwd(), 'data', 'iceplant_data', 'initial_dataset')
train_file_path = os.path.join(root, 'train_2500.csv')
test_file_path = os.path.join(root, 'test_2500.csv')

# Name of the column containing the class data
label_name = 'iceplant'

# Names of the columns used as features
cols = ['r', 'g', 'b', 'nir']

# Whether to save the trained model, and the name of the output model
save_model = False
model_name = 'initial'

In [4]:
# ------------------------------
# Import data
# Read each CSV file once and derive both the features and the labels from
# the same frame (avoids parsing every file twice).
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

# Labels: the class column named by `label_name`
y_train = train_df.loc[:, label_name]
y_test = test_df.loc[:, label_name]

# Features: only the columns listed in `cols`, in that order
X_train = train_df[cols]
X_test = test_df[cols]

# ------------------------------
# Preview the training features
X_train.head()

Unnamed: 0,r,g,b,nir
0,57,84,65,172
1,92,99,78,169
2,84,95,69,166
3,86,88,75,130
4,78,90,68,164


In [5]:
# Check that the columns of the test and train feature sets match.
# This is an element-wise comparison: the result holds one boolean per column.
# NOTE(review): both frames were selected with the same `cols` list, so this
# is expected to be all True.
X_test.columns == X_train.columns

array([ True,  True,  True,  True])

In [6]:
# Fit a random forest with 100 trees on the training features.
# The random state is fixed so repeated runs give the same model.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train.to_numpy(), y_train.to_numpy())

# Optionally write the fitted model to '<model_name>.joblib'
if save_model:
    dump(rfc, model_name +'.joblib')

# Predict on the test features and print the accuracy metrics
preds = rfc.predict(X_test.to_numpy())
print_accuracy_info(y_test.to_numpy(), preds)

true negatives: 343     false positives: 77
false negatives: 59     true positives: 271

P producer's accuracy (TP/P): 82.12 %
P user's accuracy (TP/(TP+FP)): 77.87 %
N producer's accuracy (TN/N): 81.67 %
N user's accuracy (TN/(TN+FN)): 85.32 %

overall accuracy: 81.87 %

