Annie Taylor
2.21.22

## Extract AUC, percent correctly classified (PCC), and variable importance values from SAHM model output files

#### Import packages

In [73]:
import pandas as pd
import numpy as np
import os
import csv
from IPython.display import display

#### Set Parameters

In [74]:
# Species codes to process. Use a single code, or list several to iterate
# over them in one run, e.g. ['CHPO', 'QUKE', 'RUUR'].
codes = ['QUKE']

# Models to iterate over. SAHM automatically creates one results folder per
# model you run; the cells below look for them under the name '<model>_1'
# (i.e. 'brt_1', 'glm_1', 'mars_1', 'rf_1', 'Maxent_1').
models = [
    'brt',
    'glm',
    'mars',
    'rf',
    'Maxent',
]

#### Get AUC values for each model (test/train and cross validation runs) 

In [6]:
for code in codes:
    # SAHM results folder for this species
    results = r"D:/1_AMLT/1_SDM/SAHM/" + code + "/"

    # Both dictionaries are rebuilt for every species, so once the loop has
    # finished they hold the values of the last species in `codes` only.
    AUCresults = {}       # cross-validation AUC, keyed by model name
    AUCtrainResults = {}  # train-split AUC, keyed by model name

    # Text that identifies the statistic's line within each section
    substring = 'AUC'

    # Loop through the model folders to extract the train and CV AUCs one at a time
    for model in models:
        filepath = results + model + '_1/' + model + '_output.txt'

        # Read the whole report; the context manager closes the file even if
        # the parsing below raises.
        with open(filepath, 'r') as f:
            content = f.readlines()

        # Line indices of the two section headers. list.index raises
        # ValueError if a header is missing from the file.
        ind1 = content.index('Evaluation Statistics applied to train split:\n')
        ind = content.index('Evaluation Statistics applied to crossValidation split:\n')

        # Train split: first line containing 'AUC' between the two headers.
        AUCtrainline = [string for string in content[ind1:ind] if substring in string][0]
        # NOTE(review): the value is taken from fixed character columns 34-40;
        # this assumes SAHM writes it at the same position for every model -- confirm.
        AUCtrainval = AUCtrainline[34:41]
        AUCtrainResults[model] = float(AUCtrainval)

        # Cross validation: first line containing 'AUC' after the CV header.
        AUCline = [string for string in content[ind:] if substring in string][0]
        AUCval = AUCline[34:41]
        AUCresults[model] = float(AUCval)

    # Print both dictionaries, each with the mean across models added under 'mean'
    print('AUC values')

    AUCtrainResults['mean'] = np.array(list(AUCtrainResults.values())).mean()
    print('test/train', AUCtrainResults)

    AUCresults['mean'] = np.array(list(AUCresults.values())).mean()
    print('cross validation', AUCresults)

AUC values
test/train {'brt': 0.8791, 'glm': 0.6998, 'mars': 0.7362, 'rf': 0.7371, 'Maxent': 0.7815, 'mean': 0.76674}
cross validation {'brt': 0.74522, 'glm': 0.68814, 'mars': 0.71511, 'rf': 0.73726, 'Maxent': 0.73976, 'mean': 0.725098}
AUC values
test/train {'brt': 0.8712, 'glm': 0.6355, 'mars': 0.7133, 'rf': 0.7264, 'Maxent': 0.7715, 'mean': 0.7435799999999999}
cross validation {'brt': 0.72831, 'glm': 0.63606, 'mars': 0.68032, 'rf': 0.73206, 'Maxent': 0.70853, 'mean': 0.6970560000000001}
AUC values
test/train {'brt': 0.9953, 'glm': 0.8401, 'mars': 0.9274, 'rf': 0.9578, 'Maxent': 0.9733, 'mean': 0.9387800000000001}
cross validation {'brt': 0.95476, 'glm': 0.82121, 'mars': 0.87561, 'rf': 0.9548, 'Maxent': 0.919, 'mean': 0.905076}
AUC values
test/train {'brt': 0.9255, 'glm': 0.8089, 'mars': 0.8265, 'rf': 0.8344, 'Maxent': 0.8595, 'mean': 0.85096}
cross validation {'brt': 0.83271, 'glm': 0.79845, 'mars': 0.80814, 'rf': 0.83516, 'Maxent': 0.82835, 'mean': 0.820562}
AUC values
test/train {

#### Write these results to a csv file for comparisons

In [6]:
file = r'D:/1_AMLT/1_SDM/SAHM/AUC_Comparisons.csv'

# NOTE(review): `code`, `AUCtrainResults` and `AUCresults` are whatever the
# extraction loop left behind, i.e. the last species in `codes` only -- confirm
# that appending a single species per run is intended.
with open(file, 'a') as f:
    # Train-split AUC rows first, then cross-validation AUC rows. Each pair is
    # a label row followed by a value row; every field, including the last
    # one on a row, is terminated by a comma.
    for suffix, scores in (('_AUC_train', AUCtrainResults), ('_AUC_CV', AUCresults)):
        labels = [code + '_' + key + suffix for key in scores]
        values = [str(scores[key]) for key in scores]
        f.write(''.join(label + ',' for label in labels) + '\n')
        f.write(''.join(value + ',' for value in values) + '\n')

#### Get the percent correctly classified in test/train and CV runs

In [7]:
for code in codes:
    # SAHM results folder for this species
    results = r"D:/1_AMLT/1_SDM/SAHM/" + code + "/"

    # Both dictionaries are rebuilt for every species, so once the loop has
    # finished they hold the values of the last species in `codes` only.
    PCCresults = {}       # cross-validation PCC, keyed by model name
    PCCtrainResults = {}  # train-split PCC, keyed by model name

    # Text that identifies the statistic's line within each section
    substring = 'Percent Correctly Classified'

    # Loop through the model folders to extract the train and CV PCCs one at a time
    for model in models:
        filepath = results + model + '_1/' + model + '_output.txt'

        # Read the whole report; the context manager closes the file even if
        # the parsing below raises.
        with open(filepath, 'r') as f:
            content = f.readlines()

        # Line indices of the two section headers. list.index raises
        # ValueError if a header is missing from the file.
        ind1 = content.index('Evaluation Statistics applied to train split:\n')
        ind = content.index('Evaluation Statistics applied to crossValidation split:\n')

        # Train split: first matching line between the two headers.
        PCCtrainline = [string for string in content[ind1:ind] if substring in string][0]
        # NOTE(review): the value is taken from fixed character columns 34-40;
        # this assumes SAHM writes it at the same position for every model -- confirm.
        PCCtrainval = PCCtrainline[34:41]
        PCCtrainResults[model] = float(PCCtrainval)

        # Cross validation: first matching line after the CV header.
        PCCline = [string for string in content[ind:] if substring in string][0]
        PCCval = PCCline[34:41]
        PCCresults[model] = float(PCCval)

    # Print both dictionaries, each with the mean across models added under 'mean'
    print('Percent correctly classified')
    PCCtrainResults['mean'] = np.array(list(PCCtrainResults.values())).mean()
    print('test/train', PCCtrainResults)

    PCCresults['mean'] = np.array(list(PCCresults.values())).mean()
    print('cross validation', PCCresults)

Percent correctly classified
test/train {'brt': 79.5549, 'glm': 66.9403, 'mars': 67.7853, 'rf': 66.857, 'Maxent': 70.8402, 'mean': 70.39554000000001}
cross validation {'brt': 77.6273, 'glm': 65.3815, 'mars': 67.5006, 'rf': 87.838, 'Maxent': 70.6144, 'mean': 73.79236}
Percent correctly classified
test/train {'brt': 78.7849, 'glm': 60.0714, 'mars': 65.3246, 'rf': 67.6116, 'Maxent': 70.8363, 'mean': 68.52575999999999}
cross validation {'brt': 76.6645, 'glm': 60.8331, 'mars': 65.9559, 'rf': 86.4325, 'Maxent': 69.8114, 'mean': 71.93948}
Percent correctly classified
test/train {'brt': 97.1611, 'glm': 76.8575, 'mars': 85.3739, 'rf': 90.9281, 'Maxent': 91.8158, 'mean': 88.42728}
cross validation {'brt': 96.3097, 'glm': 76.7836, 'mars': 81.8692, 'rf': 96.3096, 'Maxent': 89.2358, 'mean': 88.10158}
Percent correctly classified
test/train {'brt': 84.4463, 'glm': 73.8195, 'mars': 75.8121, 'rf': 76.8023, 'Maxent': 78.4541, 'mean': 77.86686}
cross validation {'brt': 83.3593, 'glm': 73.4814, 'mars': 7

#### Save PCC results to csv file

In [6]:
file = r'D:/1_AMLT/1_SDM/SAHM/PCC_Comparisons.csv'

# NOTE(review): `code`, `PCCtrainResults` and `PCCresults` are whatever the
# extraction loop left behind, i.e. the last species in `codes` only -- confirm
# that appending a single species per run is intended.
with open(file, 'a') as f:
    # Train-split PCC rows first, then cross-validation PCC rows. Each pair is
    # a label row followed by a value row; every field, including the last
    # one on a row, is terminated by a comma.
    for suffix, scores in (('_PCC_train', PCCtrainResults), ('_PCC_CV', PCCresults)):
        labels = [code + '_' + key + suffix for key in scores]
        values = [str(scores[key]) for key in scores]
        f.write(''.join(label + ',' for label in labels) + '\n')
        f.write(''.join(value + ',' for value in values) + '\n')

#### Get Mean Variable Importance for each model

In [9]:
for code in codes:
    # SAHM results folder for this species
    results = r"D:/1_AMLT/1_SDM/SAHM/" + code + "/"

    # One single-column dataframe of mean importances per model
    model_dfs = []

    # Loop through the model folders to read each model's variable importances
    for model in models:
        filepath = results + model + '_1/ExpandedOutput/VariableImportance.csv'
        # Read the csv, index it by predictor and transpose so that each
        # predictor becomes a column; the leftover axis names are cleaned up.
        df = (
            pd.read_csv(filepath)
            .set_index('predictor')
            .T
            .rename_axis(model)
            .rename_axis(None, axis=1)
        )
        # Mean of each predictor column (NaNs skipped), scaled by 100.
        # NOTE(review): presumably the source values are fractions, making
        # these percentages -- confirm against the SAHM output.
        df_means = df.mean(axis=0, skipna=True) * 100
        # Convert the series to a one-column dataframe named after the model
        model_dfs.append(df_means.to_frame(name=model))

    # Combine the per-model columns side by side; predictors missing from a
    # model end up as NaN in that model's column.
    all_means_df = pd.concat(model_dfs, axis=1, ignore_index=False).rename_axis(code)

    # Add a column with the mean importance across models and sort by it
    all_means_df['Mean'] = all_means_df.mean(numeric_only=True, axis=1)
    all_means_df = all_means_df.sort_values(by='Mean', ascending=False)

    # Reorder the model columns alphabetically (case-insensitive), like in the
    # chapter. The order is derived from `models` so that running a subset of
    # models does not fail on a missing column; for the full default list this
    # gives brt, glm, mars, Maxent, rf.
    column_order = sorted(models, key=str.lower) + ['Mean']
    all_means_df = all_means_df[column_order]
    display(all_means_df)

    # Append this species' table to the shared csv file
    file = r'D:/1_AMLT/1_SDM/SAHM/MeanVariableImportances.csv'
    all_means_df.to_csv(file, mode='a')

Unnamed: 0_level_0,brt,glm,mars,Maxent,rf,Mean
ARDO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bio_6,10.924182,7.546364,6.851273,10.727273,7.312909,8.6724
bio_13,3.371636,0.984,9.284909,9.658727,3.541091,5.368073
Slope,3.816182,,3.728727,5.501273,2.906727,3.988227
bio_8,2.397091,,5.338727,3.431818,1.806545,3.243545
YearsSinceWildfire,,4.092182,1.233273,2.984909,1.332364,2.410682
bio_10,2.738364,,2.190727,2.552909,1.904364,2.346591
bio_15,2.156909,3.183455,1.035818,0.007818,1.832727,1.643345
Aspect,0.622727,,0.317273,0.713273,-0.094727,0.389636


Unnamed: 0_level_0,brt,glm,mars,Maxent,rf,Mean
CAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bio_3,12.564545,8.727091,11.641273,9.52,9.706,10.431782
Slope,4.039455,1.652727,3.578909,3.403636,3.595091,3.253964
bio_8,,,4.155455,2.669636,0.797091,2.540727
bio_1_temp,4.345455,,2.163455,0.846182,2.051818,2.351727
bio_13,2.007455,,0.932,4.364182,1.920727,2.306091
bio_15,1.821455,5.360364,-0.011818,0.646727,1.888,1.940945
bio_6,,,1.908182,0.434182,1.221818,1.188061
YearsSinceRxBurn,,0.547091,0.009273,0.362545,0.029455,0.237091


Unnamed: 0_level_0,brt,glm,mars,Maxent,rf,Mean
CHPO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bio_2,9.454727,,22.968364,21.838727,7.732545,15.498591
Elevation,4.837273,12.035818,9.156,14.611273,3.280364,8.784145
bio_3,5.982545,,10.614909,5.363273,5.132182,6.773227
bio_1_temp,2.784727,8.089636,6.813636,1.861455,3.653818,4.640655
bio_13,2.918,4.894364,3.439636,6.047455,2.505818,3.961055
Slope,,5.872727,3.628182,4.369636,1.352182,3.805682
bio_15,1.468182,,0.322,2.243091,2.670182,1.675864
YearsSinceWildfire,,2.253273,1.546545,1.100364,0.2418182,1.2855
YearsSinceRxBurn,,,,-0.022545,-1.090732e-15,-0.011273
Aspect,,,-0.021273,-0.024364,-0.1090909,-0.051576


Unnamed: 0_level_0,brt,glm,mars,Maxent,rf,Mean
CLDO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bio_13,10.520545,19.572364,12.117455,11.251091,7.531091,12.198509
bio_6,10.032545,4.842545,8.260182,7.869818,5.440364,7.289091
bio_10,6.629455,,5.563636,3.858182,4.385636,5.109227
bio_8,1.358727,4.423455,4.554182,2.099636,1.75,2.8372
Slope,1.250545,,2.471455,4.625636,2.045636,2.598318
YearsSinceWildfire,0.152545,3.654545,1.512909,1.269818,1.012182,1.5204
bio_15,,,0.462364,1.145636,2.272182,1.293394


Unnamed: 0_level_0,brt,glm,mars,Maxent,rf,Mean
COCO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bio_13,15.078545,21.320909,26.634364,20.377455,10.635455,18.809345
bio_7,12.664182,,8.252727,9.130182,7.206364,9.313364
bio_11,,12.164364,9.410182,1.693455,1.488909,6.189227
Slope,,8.178,2.004545,1.802909,1.373818,3.339818
bio_15,,,4.618727,2.299636,2.851636,3.256667
bio_1_temp,1.856909,,4.627273,3.849636,2.081091,3.103727
YearsSinceWildfire,,,0.080909,0.748727,0.023455,0.284364
YearsSinceRxBurn,,,-0.447818,-0.139818,-0.011091,-0.199576


Unnamed: 0_level_0,brt,glm,mars,Maxent,rf,Mean
QUKE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bio_3,21.168727,21.483091,24.124182,25.539273,12.094909,20.882036
bio_6,,5.196909,13.550182,7.898364,3.993636,7.659773
bio_13,12.289273,5.078,5.192727,10.797091,4.815273,7.634473
bio_15,,4.212727,2.611636,0.550364,2.305818,2.420136
bio_1_temp,,,4.716,1.281455,0.827273,2.274909
Slope,,,1.358727,1.626727,1.353091,1.446182
Elevation,,,0.629636,0.692182,1.609455,0.977091
Aspect,,,0.422364,0.265455,0.507273,0.398364
YearsSinceRxBurn,,,0.0,0.044182,0.042545,0.028909


Unnamed: 0_level_0,brt,glm,mars,Maxent,rf,Mean
RUPA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bio_5,15.241455,39.066182,21.463091,25.576909,7.970364,21.8636
bio_13,14.523636,22.320909,14.504364,19.935636,8.671091,15.991127
bio_1_temp,1.090182,11.320545,3.909818,6.460182,1.082909,4.772727
bio_11,0.718364,10.436727,0.648727,2.096,1.625091,3.104982
bio_15,0.609818,1.123273,1.282182,1.976909,1.881636,1.374764
Slope,,,2.019091,0.798909,0.321636,1.046545
YearsSinceWildfire,,,0.005455,1.537818,0.162909,0.568727
Curvature,0.402182,0.759818,0.452182,0.650364,-0.083273,0.436255


Unnamed: 0_level_0,brt,glm,mars,Maxent,rf,Mean
RUUR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bio_13,7.755455,16.855273,16.436727,13.722545,6.789273,12.311855
bio_6,15.158,0.706727,8.541091,10.442364,8.238,8.617236
bio_8,,9.168364,3.858182,5.389455,1.052364,4.867091
bio_10,3.892,,2.459091,4.881636,2.66,3.473182
Slope,2.642182,,1.413818,4.899273,1.135091,2.522591
bio_15,2.268909,,0.736364,1.597455,1.239455,1.460545


Unnamed: 0_level_0,brt,glm,mars,Maxent,rf,Mean
SACE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bio_10,12.223818,10.796727,12.958909,10.833818,10.225273,11.407709
bio_6,4.590364,6.162909,6.767091,4.749636,3.347636,5.123527
bio_15,5.901636,3.959636,4.099636,3.254545,4.122364,4.267564
YearsSinceWildfire,0.874727,3.976364,2.08,3.908364,1.890364,2.545964
bio_13,2.035455,0.711455,2.431636,5.151818,2.233273,2.512727
bio_8,1.520545,,1.591455,1.740727,0.559455,1.353045
Aspect,,,-0.117455,0.459091,-0.282545,0.019697


Unnamed: 0_level_0,brt,glm,mars,Maxent,rf,Mean
VAOV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bio_13,6.695273,23.327455,10.318727,8.174727,6.462909,10.995818
bio_5,8.253636,,14.407636,5.034182,5.000182,8.173909
bio_11,5.821455,10.167818,5.096182,5.1,4.636,6.164291
bio_1_temp,4.980909,,3.669818,3.734727,3.465455,3.962727
bio_15,,,1.101091,2.685091,1.834727,1.873636
Slope,0.238364,,1.570909,1.333273,0.392182,0.883682
YearsSinceWildfire,,,0.282364,0.629091,0.020727,0.310727


#### Save Variable Importance means to a csv 

In [75]:
# Append the table currently held in `all_means_df` to the shared csv file.
# NOTE(review): the variable importance loop already appends every species'
# table to this same file, so running this cell as well writes the last
# species' table a second time -- confirm this is intended.
file = r'D:/1_AMLT/1_SDM/SAHM/MeanVariableImportances.csv'
all_means_df.to_csv(path_or_buf=file, mode='a')