Annie Taylor
2.21.22

## Extract AUC values from SAHM model output files
SAHM writes key output values to text files within each model's folder. This script iterates through each model's output files to find the key assessment values and then adds them to three separate csv files for comparison: AUC, percent correctly classified, and mean variable importance.

#### Import packages

In [73]:
import pandas as pd
import numpy as np
import os
import csv
from IPython.display import display

#### Set Parameters

In [74]:
# Species codes to process. Use a single code to run one species, or list
# several (e.g. ['CHPO', 'QUKE', 'RUUR']) to iterate over them in turn.
codes = [
    'QUKE',
]

# Model names to iterate over. The cells below read each model's results from
# a folder named '<model>_1' (brt_1, glm_1, mars_1, rf_1, Maxent_1), which SAHM
# creates automatically for the models you run.
models = [
    'brt',
    'glm',
    'mars',
    'rf',
    'Maxent',
]

#### Get AUC values for each model (test/train and cross validation runs) 

In [75]:
for code in codes:
    # SAHM results folder for this species, change this to your file path
    results = r"D:/1_AMLT/1_SDM/SAHM/" + code + "/"
    # Both dictionaries are reset for every species, so once the loop has
    # finished they hold the values of the LAST code in `codes` only.
    AUCresults = {}
    AUCtrainResults = {}

    # Loop through all of the model folders to extract the train-split and
    # cross-validation AUCs one model at a time
    for model in models:
        filepath = results + model + '_1/' + model + '_output.txt'
        # Read the whole file up front; the context manager guarantees the
        # handle is closed even if one of the parsing steps below raises.
        with open(filepath, 'r') as f:
            content = f.readlines()

        # list index (line) of the train-split section header;
        # .index raises ValueError if the header line is missing
        ind1 = content.index('Evaluation Statistics applied to train split:\n')
        # list index (line) of the cross-validation section header
        ind = content.index('Evaluation Statistics applied to crossValidation split:\n')
        substring = 'AUC'

        # first get the train-split AUC: the first line containing 'AUC'
        # between the two section headers
        AUCtrainline = [string for string in content[ind1:ind] if substring in string][0]
        # fixed-width slice of that line; assumes the value always sits in
        # columns 34-40 of SAHM's output -- TODO confirm for other SAHM versions
        AUCtrainval = AUCtrainline[34:41]
        # add value to dictionary using model name as the key
        AUCtrainResults[model] = float(AUCtrainval)

        # then get the cross-validation AUC: the first line containing 'AUC'
        # after the crossValidation header
        AUCline = [string for string in content[ind:] if substring in string][0]
        AUCval = AUCline[34:41]
        # add value to dictionary using model name as the key
        AUCresults[model] = float(AUCval)

    # print both dictionaries, each with the mean across models appended
    # under the key 'mean'
    print('AUC values')

    AUCtrainResults['mean'] = np.array(list(AUCtrainResults.values())).mean()
    print('test/train', AUCtrainResults)

    AUCresults['mean'] = np.array(list(AUCresults.values())).mean()
    print('cross validation', AUCresults)

AUC values
test/train {'brt': 0.9181, 'glm': 0.8534, 'mars': 0.8648, 'rf': 0.868, 'Maxent': 0.8939, 'mean': 0.87964}
cross validation {'brt': 0.86031, 'glm': 0.8468, 'mars': 0.85378, 'rf': 0.86916, 'Maxent': 0.85823, 'mean': 0.8576560000000001}


#### Write these results to an existing csv file for comparisons

In [6]:
file = r'D:/1_AMLT/1_SDM/SAHM/AUC_Comparisons.csv'

# Append two header/value row pairs to the comparison csv: first the
# test/training AUCs, then the cross validation AUCs.
# NOTE(review): `code` is the variable left over from the extraction loop in
# an earlier cell, so only the last species processed is written here.
# NOTE(review): every field, including the last one, is followed by a comma,
# so each row ends with a trailing comma.
with open(file, 'a') as f:
    for suffix, values in (('_AUC_train', AUCtrainResults), ('_AUC_CV', AUCresults)):
        # header row: <code>_<model>_<suffix> for every key in the dictionary
        header = ''.join(code + '_' + key + suffix + ',' for key in values)
        # value row: the matching AUC values in the same key order
        row = ''.join(str(value) + ',' for value in values.values())
        f.write(header + '\n')
        f.write(row + '\n')

#### Get the percent correctly classified in test/train and CV runs

In [76]:
for code in codes:
    # SAHM results folder for this species
    results = r"D:/1_AMLT/1_SDM/SAHM/" + code + "/"
    # Both dictionaries are reset for every species, so once the loop has
    # finished they hold the values of the LAST code in `codes` only.
    PCCresults = {}
    PCCtrainResults = {}

    # Loop through all of the model folders to extract the train-split and
    # cross-validation percent correctly classified (PCC) one model at a time
    for model in models:
        filepath = results + model + '_1/' + model + '_output.txt'
        # Read the whole file up front; the context manager guarantees the
        # handle is closed even if one of the parsing steps below raises.
        with open(filepath, 'r') as f:
            content = f.readlines()

        # list index (line) of the train-split section header;
        # .index raises ValueError if the header line is missing
        ind1 = content.index('Evaluation Statistics applied to train split:\n')
        # list index (line) of the cross-validation section header
        ind = content.index('Evaluation Statistics applied to crossValidation split:\n')
        substring = 'Percent Correctly Classified'

        # first get the train-split PCC: the first matching line between the
        # two section headers
        PCCtrainline = [string for string in content[ind1:ind] if substring in string][0]
        # fixed-width slice of that line; assumes the value always sits in
        # columns 34-40 of SAHM's output -- TODO confirm for other SAHM versions
        PCCtrainval = PCCtrainline[34:41]
        # add value to dictionary using model name as the key
        PCCtrainResults[model] = float(PCCtrainval)

        # then get the cross-validation PCC: the first matching line after the
        # crossValidation header
        PCCline = [string for string in content[ind:] if substring in string][0]
        PCCval = PCCline[34:41]
        # add value to dictionary using model name as the key
        PCCresults[model] = float(PCCval)

    # print both dictionaries, each with the mean across models appended
    # under the key 'mean'
    print('Percent correctly classified')
    PCCtrainResults['mean'] = np.array(list(PCCtrainResults.values())).mean()
    print('test/train', PCCtrainResults)

    PCCresults['mean'] = np.array(list(PCCresults.values())).mean()
    print('cross validation', PCCresults)

Percent correctly classified
test/train {'brt': 83.9056, 'glm': 77.0011, 'mars': 77.5754, 'rf': 80.4839, 'Maxent': 81.557, 'mean': 80.1046}
cross validation {'brt': 86.0443, 'glm': 76.7197, 'mars': 78.5774, 'rf': 89.4785, 'Maxent': 80.8722, 'mean': 82.33842000000001}


#### Save PCC results to an existing csv file

In [6]:
file = r'D:/1_AMLT/1_SDM/SAHM/PCC_Comparisons.csv'

# Append two header/value row pairs to the comparison csv: first the
# test/training PCC values, then the cross validation PCC values.
# NOTE(review): `code` is the variable left over from the extraction loop in
# an earlier cell, so only the last species processed is written here.
# NOTE(review): every field, including the last one, is followed by a comma,
# so each row ends with a trailing comma.
with open(file, 'a') as f:
    for suffix, values in (('_PCC_train', PCCtrainResults), ('_PCC_CV', PCCresults)):
        # header row: <code>_<model>_<suffix> for every key in the dictionary
        header = ''.join(code + '_' + key + suffix + ',' for key in values)
        # value row: the matching PCC values in the same key order
        row = ''.join(str(value) + ',' for value in values.values())
        f.write(header + '\n')
        f.write(row + '\n')

#### Get Mean Variable Importance for each model

In [77]:
for code in codes:
    # SAHM results folder for this species
    results = r"D:/1_AMLT/1_SDM/SAHM/" + code + "/"

    # one single-column dataframe of mean variable importances per model
    model_dfs = []

    # Loop through all of the model folders to read each model's
    # variable importance table one at a time
    for model in models:
        filepath = results + model + '_1/ExpandedOutput/VariableImportance.csv'
        # read csv into pandas df, indexed by predictor and transposed so that
        # every predictor becomes a column, with the axes renamed
        df = pd.read_csv(filepath).set_index('predictor').T.rename_axis(model).rename_axis(None, axis=1)
        # take the mean of each column (predictor) and multiply by 100 to
        # express it as a percentage
        df_means = df.mean(axis=0, skipna=True) * 100
        # convert series to a one-column pandas df named after the model
        df_means = df_means.to_frame().rename(columns={0: model})
        # add it to a list of dataframes
        model_dfs.append(df_means)

    # combine all of the dataframes into one ready for export; predictors that
    # are absent from a model's table end up as NaN in that model's column
    all_means_df = pd.concat(model_dfs, axis=1, ignore_index=False).rename_axis(code)

    # add a column with the mean var importance (NaN entries are skipped, so the
    # mean is taken over the models that report the predictor) and sort by highest mean
    all_means_df['Mean'] = all_means_df.mean(numeric_only=True, axis=1)
    all_means_df = all_means_df.sort_values(by='Mean', ascending=False)

    # reorder the columns alphabetically (ignoring case) like in the chapter.
    # The order is derived from `models` rather than hard-coded so that running
    # a subset of the models does not raise a KeyError; for the default five
    # models this gives brt, glm, mars, Maxent, rf, Mean.
    column_order = sorted(models, key=str.lower) + ['Mean']
    all_means_df = all_means_df[column_order]
    display(all_means_df)

    # append to the csv file (the header row is written again on every append)
    file = r'D:/1_AMLT/1_SDM/SAHM/MeanVariableImportances.csv'
    all_means_df.to_csv(file, mode='a')

Unnamed: 0_level_0,brt,glm,mars,Maxent,rf,Mean
QUKE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bio_3,21.168727,21.483091,24.124182,25.539273,12.094909,20.882036
bio_6,,5.196909,13.550182,7.898364,3.993636,7.659773
bio_13,12.289273,5.078,5.192727,10.797091,4.815273,7.634473
bio_15,,4.212727,2.611636,0.550364,2.305818,2.420136
bio_1_temp,,,4.716,1.281455,0.827273,2.274909
Slope,,,1.358727,1.626727,1.353091,1.446182
Elevation,,,0.629636,0.692182,1.609455,0.977091
Aspect,,,0.422364,0.265455,0.507273,0.398364
YearsSinceRxBurn,,,0.0,0.044182,0.042545,0.028909
