# About 
In this notebook we compare the accuracy metrics obtained by the following models:
- model using only spectral bands (R,G,B,NIR) as features, trained on initial data
- model using the spectral bands + average and entropy calculated over a 13x13 pixel window as features, trained on initial data
- (final) model using the same features as the previous one, but trained on an extended dataset with information about false positives.

Outputs are saved as CSV files in the data/accuracy_data directory.

In [None]:
import os
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from accuracy_info_df import accuracy_info_df

# The repository is assumed to sit directly inside the user's home directory.
# Make it the working directory so the data paths built from os.getcwd()
# in the cells below resolve inside the repository.
home = os.path.expanduser("~")
repo_dir = os.path.join(home, 'iceplant-detection-santa-barbara')
os.chdir(repo_dir)

In [None]:
# **************************************************************
# TRAIN MODELS
# **************************************************************
# Three random forests are fit with the same hyperparameters
# (100 trees, random_state 42). They differ only in the feature
# columns used and in the training table they are fit on.

# ------------------------------
# initial dataset
root_i = os.path.join(os.getcwd(),
                      'data',
                      'iceplant_data',
                      'initial_dataset')
train_fp_i = os.path.join(root_i, 'train_2500.csv')

# Read the CSV once: the label column is taken from the same frame the
# feature columns are selected from, instead of re-reading the file.
X_train = pd.read_csv(train_fp_i)
y_train = X_train['iceplant'].to_numpy()

# ------------------------------
# spectral
bands = ['r', 'g', 'b', 'nir']
X_train_sub = X_train[bands].to_numpy()

rfc_spectral = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_spectral.fit(X_train_sub, y_train)

# ------------------------------
# 13x13

box_s = 13
# For every band (plus ndvi) build the '<band>_avg13' and '<band>_entr13'
# column names.
window_features = [band + x + str(box_s)
                   for band in bands + ['ndvi']
                   for x in ['_avg', '_entr']]
cols_13x13 = bands + ['ndvi'] + window_features + ['month', 'day_in_year']

X_train_sub = X_train[cols_13x13].to_numpy()

rfc_13x13 = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_13x13.fit(X_train_sub, y_train)

# ------------------------------
# final model (salt13_p30)
root_e = os.path.join(os.getcwd(),
                      'data',
                      'iceplant_data',
                      'extended_dataset_final_model')
train_fp_e = os.path.join(root_e, 'extended_dataset_final_model_train.csv')

# Same single-read pattern as for the initial dataset.
X_train_salt = pd.read_csv(train_fp_e)
y_train_salt = X_train_salt['iceplant'].to_numpy()

# Same set of columns as cols_13x13, but grouped per band. The order is
# kept exactly as written because it fixes the column order of the array
# passed to fit() and, later, to predict().
cols_salt = ['r',
             'r_avg13', 'r_entr13',
             'g',
             'g_avg13', 'g_entr13',
             'b',
             'b_avg13', 'b_entr13',
             'nir',
             'nir_avg13', 'nir_entr13',
             'ndvi',
             'ndvi_avg13', 'ndvi_entr13',
             'month',
             'day_in_year']

X_train_sub = X_train_salt[cols_salt].to_numpy()

rfc_salt = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_salt.fit(X_train_sub, y_train_salt)

# Order matters: the evaluation cells zip this list with
# [bands, cols_13x13, cols_salt].
rfcs = [rfc_spectral, rfc_13x13, rfc_salt]

In [None]:
# **************************************************************
# ACCURACIES WITH INITIAL TEST SET
# **************************************************************
test_fp_i = os.path.join(root_i, 'test_2500.csv')

# Read the CSV once: labels come from the same frame the feature columns
# are selected from, instead of re-reading the file.
X_test = pd.read_csv(test_fp_i)
y_test = X_test['iceplant'].to_numpy()

# Evaluate each model on the feature columns it was trained with.
results = []
for rfc, cols_names in zip(rfcs, [bands, cols_13x13, cols_salt]):

    X_test_sub = X_test[cols_names].to_numpy()
    preds = rfc.predict(X_test_sub)

    results.append(accuracy_info_df(y_test, preds))

# ------------------------------
# One label per model, in the same order as rfcs.
# NOTE(review): insert() needs len(labels) to equal the number of rows in
# the concatenated frame, so accuracy_info_df presumably returns a single
# row per call -- confirm against its definition.
labels = ['spectral', 'avg + entr 13x13', 'avg + entr 13x13 (augmented)']
R_initial = pd.concat(results)
R_initial.insert(loc=0,
                 column='model',
                 value=labels)
# ------------------------------
# save

out_dir = os.path.join(os.getcwd(),
                       'data',
                       'accuracy_data')
# to_csv does not create missing parent directories; make sure it exists.
os.makedirs(out_dir, exist_ok=True)
out_file_path = os.path.join(out_dir,
                             'accuracy_comparisons_initial_test_set.csv')
R_initial.to_csv(out_file_path, index=False)

In [None]:
# **************************************************************
# ACCURACIES WITH EXTENDED TEST SET
# **************************************************************
test_fp_e = os.path.join(root_e, 'extended_dataset_final_model_test.csv')

# Read the CSV once: labels come from the same frame the feature columns
# are selected from, instead of re-reading the file.
# Note: X_test / y_test are rebound here and no longer refer to the
# initial test set.
X_test = pd.read_csv(test_fp_e)
y_test = X_test['iceplant'].to_numpy()

# Evaluate each model on the feature columns it was trained with.
results_extended = []
for rfc, cols_names in zip(rfcs, [bands, cols_13x13, cols_salt]):

    X_test_sub = X_test[cols_names].to_numpy()
    preds = rfc.predict(X_test_sub)

    results_extended.append(accuracy_info_df(y_test, preds))

# ------------------------------
# `labels` is defined in the initial-test-set cell, which must have been
# run before this one.
R_extended = pd.concat(results_extended)
R_extended.insert(loc=0,
                  column='model',
                  value=labels)
# ------------------------------
# save
out_dir = os.path.join(os.getcwd(),
                       'data',
                       'accuracy_data')
# to_csv does not create missing parent directories; make sure it exists.
os.makedirs(out_dir, exist_ok=True)
out_file_path = os.path.join(out_dir,
                             'accuracy_comparisons_extended_test_set.csv')
R_extended.to_csv(out_file_path, index=False)