In [None]:
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------
# Classifiers Train and Test
# Author: Timm Nawrocki, Alaska Center for Conservation Science
# Created on: 2018-08-18
# Usage: Must be executed as a Jupyter Notebook in an Anaconda 3 installation on a Google Cloud virtual machine with 64 vCPUs and 57.6 GB of CPU memory with an Ubuntu operating system (18.04 LTS).
# Description: "Classifiers Train and Test" trains a classification model to distinguish cover values of 0% from cover values greater than or equal to 1% using the presence and absence data in the training dataset. Subsequently, the training dataset is subset to include only the presence data. Two additional classifiers are then trained on the presence data to distinguish 1-10% from 11-100% and 1-25% from 26-100%.
# ---------------------------------------------------------------------------

This script runs the model train and test steps and outputs a model performance and variable importance report, together with the classifier and threshold files that can be transferred to the predict script. The script is formatted as a Jupyter Notebook and is intended to be run on a Google Cloud virtual machine with 64 vCPUs and 57.6 GB of CPU memory running the Ubuntu operating system (18.04 LTS). The final Random Forest classifiers in this script are set to use 16 cores (`n_jobs=16`), while the classifiers fitted during cross-validation use a single core (`n_jobs=1`); the script may work inefficiently or not at all on a machine that has fewer than 64 cores. For information on generating inputs for this script or on setting up Google Cloud virtual machines, see the [project readme](https://github.com/accs-uaa/vegetation-cover-modeling).

In [None]:
# Import modules
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plot
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib
print('All modules successfully imported.')

In [None]:
# Collect the user-defined settings: each question is printed, then the reply is read
def _ask(question):
    """Print a question and return the line the user types in response."""
    print(question)
    return input()


root_folder = _ask('Enter root directory:')
output_folder = _ask('Enter name of output folder:')
input_data_name = _ask('Enter name of input data csv file:')
output_report_name = _ask('Enter name of output report file:')
taxon_name = _ask('Enter name of taxon:')
print('All user-defined variables input.')

In [None]:
# Define variable sets
# Predictors that are not tied to a month; every predictor set starts with these
_static_predictors = ['compoundTopographic', 'dateFreeze_2000s', 'dateThaw_2000s', 'elevation', 'floodplainsDist', 'growingSeason_2000s', 'heatLoad', 'integratedMoisture', 'precipAnnual_2000s', 'roughness', 'siteExposure', 'slope', 'streamLargeDist', 'streamSmallDist', 'summerWarmth_2000s', 'surfaceArea', 'surfaceRelief', 'aspect']
# Month prefixes of the monthly predictors, in column order
_months = ['may', 'june', 'july', 'august', 'september']
# Suffixes of the numbered (raw) monthly predictors and of the named monthly metrics
_raw_suffixes = ['1_ultraBlue', '2_blue', '3_green', '4_red', '5_nearInfrared', '6_shortInfrared1', '7_shortInfrared2']
_metric_suffixes = ['evi2', 'nbr', 'ndmi', 'ndsi', 'ndvi', 'ndwi']


def _monthly(months, suffixes):
    """Return '<month>_<suffix>' names ordered by month first and suffix second."""
    return [month + '_' + suffix for month in months for suffix in suffixes]


# All static predictors plus every monthly predictor
predictor_all = _static_predictors + _monthly(_months, _raw_suffixes + _metric_suffixes)
# Static predictors plus, per month, '2_blue' and the named metrics
predictor_metrics = _static_predictors + _monthly(_months, ['2_blue'] + _metric_suffixes)
# Static predictors plus every July predictor
predictor_midsummer = _static_predictors + _monthly(['july'], _raw_suffixes + _metric_suffixes)
# Static predictors plus, per month, the numbered predictors only
predictor_raw = _static_predictors + _monthly(_months, _raw_suffixes)
# Response columns for the three binary classifiers
zero_variable = ['zero']
ten_variable = ['ten']
twentyfive_variable = ['twentyfive']
# Columns carried along with the data, and the coordinate columns
retain_variables = ['cover', 'project', 'siteID', 'siteCode', 'methodSurvey', 'methodCover', 'strata']
coordinates = ['POINT_X', 'POINT_Y']
all_variables = predictor_all + zero_variable + ten_variable + twentyfive_variable + retain_variables + coordinates
print('Variable sets loaded.')

In [None]:
# Define a function to plot Pearson correlation of predictor variables
def plotVariableCorrelation(X_train, outFile):
    """Save a heatmap of the pairwise Pearson correlations between predictor variables.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Predictor columns; every column is cast to float64 before correlating.
    outFile : str
        Path of the image file to write (saved at 300 dpi).
    """
    # Pearson correlation coefficient between the predictor variables, where -1 is
    # perfect negative correlation and 1 is perfect positive correlation
    correlation = X_train.astype('float64').corr()
    # Mask the upper triangle (diagonal included) so each variable pair is drawn once.
    # The builtin bool is used because the np.bool alias was deprecated in NumPy 1.20
    # and removed in NumPy 1.24.
    mask = np.zeros_like(correlation, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    # Set up the matplotlib figure
    correlation_figure, ax = plot.subplots(figsize=(11, 9))
    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    # Draw the heatmap on the axes created above; the color scale is capped at 0.3
    sns.heatmap(correlation, mask=mask, cmap=cmap, vmax=.3, center=0, square=True, linewidths=.5, cbar_kws={'shrink': .5}, ax=ax)
    correlation_figure.savefig(outFile, bbox_inches='tight', dpi=300)
    # Close this figure explicitly so it does not linger in the pyplot workspace
    plot.close(correlation_figure)

print('Function "plotVariableCorrelation" loaded.')

In [None]:
# Define a function to plot variable importances
def plotVariableImportances(inModel, x_train, outVariableFile):
    """Save a bar chart of a fitted model's variable importances.

    Bars are drawn in the column order of x_train (they are not sorted by importance).

    Parameters
    ----------
    inModel : fitted estimator
        Must expose a ``feature_importances_`` attribute.
    x_train : pandas.DataFrame
        Predictor data used to fit the model; only its column names are read.
    outVariableFile : str
        Path of the image file to write (saved at 300 dpi).

    Notes
    -----
    As before, this switches the global matplotlib style to 'fivethirtyeight' and the
    global default figure size to 36 x 12.
    """
    # Get numerical feature importances and the matching variable names
    importances = list(inModel.feature_importances_)
    feature_list = list(x_train.columns)
    # Apply the style and figure size BEFORE creating the figure; previously the figure
    # was created first, so it only picked up these settings if an earlier cell or call
    # had already set them
    plot.style.use('fivethirtyeight')
    plot.rcParams["figure.figsize"] = [36, 12]
    variable_figure = plot.figure()
    # Create list of x locations for plotting
    x_values = list(range(len(importances)))
    # Make a bar chart of the variable importances
    plot.bar(x_values, importances, orientation = 'vertical')
    # Tick labels for x axis
    plot.xticks(x_values, feature_list, rotation='vertical')
    # Axis labels and title
    plot.ylabel('Importance')
    plot.xlabel('Variable')
    plot.title('Variable Importances')
    # Export
    variable_figure.savefig(outVariableFile, bbox_inches="tight", dpi=300)
    # Close this figure explicitly so it does not linger in the pyplot workspace
    plot.close(variable_figure)
    
print('Function "plotVariableImportances" loaded.')

In [None]:
# Define a function to calculate performance metrics based on a specified threshold value
def thresholdMetrics(inIndex, inProbability, inValue, y_test):
    """Binarize a prediction index at a threshold and score it against observed classes.

    Parameters
    ----------
    inIndex : numpy.ndarray
        Prediction index per sample; values strictly greater than inValue become 1.
    inProbability : sequence of float
        Positive class probability per sample, used only for the AUC.
    inValue : number
        Threshold applied to inIndex.
    y_test : array-like
        Observed classes per sample.

    Returns
    -------
    tuple
        (thresholded predictions, sensitivity, specificity, AUC, accuracy)
    """
    # 1.0 where the index exceeds the threshold and 0.0 everywhere else
    outThresholded = (inIndex > inValue).astype('float64')
    # confusion_matrix puts observed classes on rows and predicted classes on columns
    counts = confusion_matrix(y_test, outThresholded)
    true_negative, false_positive = counts[0, 0], counts[0, 1]
    false_negative, true_positive = counts[1, 0], counts[1, 1]
    # Share of observed positives and of observed negatives that were predicted correctly
    outSensitivity = true_positive / (true_positive + false_negative)
    outSpecificity = true_negative / (true_negative + false_positive)
    # The AUC depends on the probabilities only, not on the threshold
    outAUC = roc_auc_score(y_test, inProbability)
    sample_count = true_negative + false_positive + false_negative + true_positive
    outAccuracy = (true_negative + true_positive) / sample_count
    return (outThresholded, outSensitivity, outSpecificity, outAUC, outAccuracy)

print('Function "thresholdMetrics" loaded.')

In [None]:
# Define a function to fit a classifier using training data and determine a best classification threshold using the test data
def trainTestModel(X_train, y_train, X_test, y_test, variable):
    """Fit a random forest on the training partition and select a threshold on the test partition.

    The positive class probability predicted for each test sample is converted to an
    integer index between 0 and 1000. Every integer threshold from 1 to 1000 is
    evaluated, and the threshold with the smallest non-negative difference
    (sensitivity - specificity) is selected; ties go to the lowest threshold.

    Parameters
    ----------
    X_train, y_train
        Predictors and response of the training partition.
    X_test, y_test
        Predictors and response of the test partition.
    variable
        Not used by this function; kept so existing callers continue to work.

    Returns
    -------
    list
        [threshold, sensitivity, specificity, auc, accuracy, oob_score], where the
        sensitivity, specificity, auc and accuracy are computed on the test partition
        at the selected threshold.

    Raises
    ------
    ValueError
        If no threshold gives a sensitivity greater than or equal to the specificity.
    """
    # Fit a random forest classifier to the training dataset
    rf_classify = RandomForestClassifier(n_estimators = 5000, criterion='entropy', max_features='log2', bootstrap = True, oob_score = True, n_jobs=1, class_weight = "balanced")
    rf_classify.fit(X_train, y_train)
    # Positive class probabilities for the test dataset (second column of predict_proba)
    test_probability = rf_classify.predict_proba(X_test)[:, 1]
    # Convert the positive class probabilities to an index between 0 and 1000 (rounded half up)
    test_index = (test_probability * 1000 + 0.5).astype(int)
    # Evaluate the sensitivity and specificity at every threshold from 1 to 1000
    thresholds = range(1, 1001)
    difference_list = []
    for candidate in thresholds:
        _, sensitivity_test, specificity_test, _, _ = thresholdMetrics(test_index, test_probability, candidate, y_test)
        difference_list.append(sensitivity_test - specificity_test)
    # Pair every difference with the threshold VALUE that produced it. The previous
    # implementation paired it with the 0-based list position, so the returned
    # threshold was one lower than the threshold that had actually been selected.
    candidates = [(value, threshold) for threshold, value in zip(thresholds, difference_list) if value >= 0]
    if not candidates:
        raise ValueError('No threshold between 1 and 1000 produced a sensitivity greater than or equal to the specificity.')
    value, threshold = min(candidates)
    # Recalculate the test metrics at the selected threshold
    test_thresholded, sensitivity_test, specificity_test, auc_test, accuracy_test = thresholdMetrics(test_index, test_probability, threshold, y_test)
    return [threshold, sensitivity_test, specificity_test, auc_test, accuracy_test, rf_classify.oob_score_]

print('Function "trainTestModel" loaded.')

In [None]:
# Define a function to cross-validate a classifier using 100 stratified shuffle splits
def crossValidateModel(inDF, predictors, response, iterations=100):
    """Repeatedly split the data 70/30, train and test a classifier, and collect the results.

    Each iteration draws a new random split (no fixed random state) that is stratified
    by the 'strata' column of inDF, then calls trainTestModel on the two partitions.

    Parameters
    ----------
    inDF : pandas.DataFrame
        Input data; must contain the predictor columns, the response column(s) and 'strata'.
    predictors : list of str
        Names of the predictor columns.
    response : str or list of str
        Name of the response column.
    iterations : int, optional
        Number of train-test iterations to run (default 100).

    Returns
    -------
    list
        [threshold_list, sensitivity_list, specificity_list, auc_list, accuracy_list,
        oob_score_list], each holding one value per iteration.
    """
    # Define the predictor labels (X) and the response label (y) in the input dataframe
    X = inDF[predictors]
    y = np.array(inDF[response]).ravel()
    strata = np.array(inDF['strata']).ravel()
    # Create empty lists to store the results of successive test runs
    threshold_list = []
    sensitivity_list = []
    specificity_list = []
    auc_list = []
    accuracy_list = []
    oob_score_list = []
    # Conduct a classification run for each training-test split
    for i in range(1, iterations + 1):
        # Define the training and test partitions
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size = 0.7, random_state = None, shuffle = True, stratify = strata)
        # Train and test a classifier; pass the response actually being modelled
        # (previously the global zero_variable was passed for every classifier)
        threshold, sensitivity, specificity, auc_value, accuracy, oob_score = trainTestModel(X_train, y_train, X_test, y_test, response)
        # Append threshold and performance metrics to lists
        threshold_list.append(threshold)
        sensitivity_list.append(sensitivity)
        specificity_list.append(specificity)
        auc_list.append(auc_value)
        accuracy_list.append(accuracy)
        oob_score_list.append(oob_score)
        print('Model iteration ' + str(i) + ' out of ' + str(iterations) + ' trained and tested...')
    return [threshold_list, sensitivity_list, specificity_list, auc_list, accuracy_list, oob_score_list]

print('Function "crossValidateModel" loaded.')

In [None]:
# Create a function to train and export a final classifier
def trainExportClassifier(inDF, predictors, response, outModel, outImportance):
    """Fit a random forest to every row of inDF, save it, and plot its variable importances.

    Parameters
    ----------
    inDF : pandas.DataFrame
        Input data containing the predictor and response columns.
    predictors : list of str
        Names of the predictor columns.
    response : str or list of str
        Name of the response column.
    outModel : str
        Path of the joblib file to which the fitted classifier is written.
    outImportance : str
        Path of the variable importance plot to write.
    """
    # Define the predictor labels (X) and the response label (y) in the input dataframe
    X = inDF[predictors]
    y = np.array(inDF[response]).ravel()
    # Fit a classifier to the input dataset. criterion and max_features are set to the
    # values used by the classifier in trainTestModel so that the thresholds and
    # performance metrics obtained there describe the model exported here; they were
    # previously left at the library defaults.
    rf_classify = RandomForestClassifier(n_estimators = 5000, criterion='entropy', max_features='log2', bootstrap = True, oob_score = True, n_jobs=16, class_weight = "balanced")
    rf_classify.fit(X, y)
    # Save classifier to an external file
    joblib.dump(rf_classify, outModel)
    # Export a variable importance plot
    plotVariableImportances(rf_classify, X, outImportance)
    print('Final model trained and exported...')
    
print('Function "trainExportClassifier" loaded.')

In [None]:
# Define a function to output threshold to text file
def thresholdOut(inThresholdList, outThresholdFile):
    """Write the mean of a list of thresholds, as an integer, to a text file.

    The mean is converted with int(mean + 0.5), which rounds half up for the
    non-negative values this function is called with.

    Parameters
    ----------
    inThresholdList : sequence of numbers
        Threshold values to average.
    outThresholdFile : str
        Path of the text file to write; an existing file is overwritten.

    Raises
    ------
    ValueError
        If the mean cannot be converted to an integer (for example, an empty list).
    """
    # Compute the value first so that a failure does not truncate an existing file
    mean_threshold = int(np.mean(inThresholdList) + 0.5)
    # The context manager closes the file even if the write fails
    with open(outThresholdFile, 'w') as file:
        file.write(str(mean_threshold))

print('Function "thresholdOut" loaded.')

In [None]:
# Read the input csv file from the "speciesData" folder inside the root directory
input_file = os.path.join(root_folder, "speciesData", input_data_name)
input_df = pd.read_csv(input_file)
# Cast the predictor, response, cover, and strata columns to integers
integer_columns = predictor_all + zero_variable + ten_variable + twentyfive_variable + ['cover', 'strata']
input_df[integer_columns] = input_df[integer_columns].astype(int)
print(input_df)

In [None]:
# Make sure a "plots" folder exists inside the output folder and report what was done
plots_folder = os.path.join(output_folder, "plots")
if os.path.exists(plots_folder):
    print('Plots folder already exists.')
else:
    os.makedirs(plots_folder)
    print('Plots folder created.')

In [None]:
# Set the initial default figure size (width 16, height 12)
fig_size = [16, 12]
plot.rcParams["figure.figsize"] = fig_size
print('Plot size parameters configured.')

In [None]:
# Save a Pearson correlation plot of all predictor variables to the plots folder
variableCorrelation = os.path.join(plots_folder, "variableCorrelation.png")
plotVariableCorrelation(X_train=input_df[predictor_all], outFile=variableCorrelation)
print('Pearson Correlation plot saved.')

In [None]:
# Set the default figure size for the variable importance plots (width 36, height 12)
fig_size = [36, 12]
plot.rcParams["figure.figsize"] = fig_size
print('Plot size parameters configured.')

In [None]:
# Perform 100 train and test iterations of zero classifier
(
    threshold_list_0,
    sensitivity_list_0,
    specificity_list_0,
    auc_list_0,
    accuracy_list_0,
    oob_score_list_0,
) = crossValidateModel(inDF=input_df, predictors=predictor_all, response=zero_variable)

# Train and export a final model using the full input data
model_0 = os.path.join(output_folder, 'classifier_0.joblib')
variableImportance_0 = os.path.join(plots_folder, 'variableImportance_0.png')
trainExportClassifier(inDF=input_df, predictors=predictor_all, response=zero_variable, outModel=model_0, outImportance=variableImportance_0)

In [None]:
# Report the mean test AUC of the zero classifier across the train and test iterations
print(np.mean(auc_list_0))

In [None]:
# Keep only the presence data, i.e. the rows whose strata value is at least 1
presence_rows = input_df['strata'] >= 1
subset_df = input_df[presence_rows]
print(subset_df)

In [None]:
# Perform 100 train and test iterations of ten classifier on the presence subset
(
    threshold_list_10,
    sensitivity_list_10,
    specificity_list_10,
    auc_list_10,
    accuracy_list_10,
    oob_score_list_10,
) = crossValidateModel(inDF=subset_df, predictors=predictor_all, response=ten_variable)

# Train and export a final model using all rows of the presence subset
model_10 = os.path.join(output_folder, 'classifier_10.joblib')
variableImportance_10 = os.path.join(plots_folder, 'variableImportance_10.png')
trainExportClassifier(inDF=subset_df, predictors=predictor_all, response=ten_variable, outModel=model_10, outImportance=variableImportance_10)

In [None]:
# Report the mean test AUC of the ten classifier across the train and test iterations
print(np.mean(auc_list_10))

In [None]:
# Perform 100 train and test iterations of twentyfive classifier on the presence subset
(
    threshold_list_25,
    sensitivity_list_25,
    specificity_list_25,
    auc_list_25,
    accuracy_list_25,
    oob_score_list_25,
) = crossValidateModel(inDF=subset_df, predictors=predictor_all, response=twentyfive_variable)

# Train and export a final model using all rows of the presence subset
model_25 = os.path.join(output_folder, 'classifier_25.joblib')
variableImportance_25 = os.path.join(plots_folder, 'variableImportance_25.png')
trainExportClassifier(inDF=subset_df, predictors=predictor_all, response=twentyfive_variable, outModel=model_25, outImportance=variableImportance_25)

In [None]:
# Report the mean test AUC of the twentyfive classifier across the train and test iterations
print(np.mean(auc_list_25))

In [None]:
# Write one text file per classifier holding the mean, converted to an integer, of the thresholds selected across its train and test iterations
for _file_name, _thresholds in (
    ('threshold_0.txt', threshold_list_0),
    ('threshold_10.txt', threshold_list_10),
    ('threshold_25.txt', threshold_list_25),
):
    thresholdOut(_thresholds, os.path.join(output_folder, _file_name))
print('Threshold files saved.')

In [None]:
# Write the HTML report to a temporary text file next to the final report path.
# The context manager closes the file even if one of the writes fails.
# Image links use forward slashes, the URL path separator, instead of backslashes.
output_report = os.path.join(output_folder, output_report_name)
output_text = os.path.splitext(output_report)[0] + ".txt"
with open(output_text, "w") as text_file:
    text_file.write("<html>\n")
    text_file.write("<head>\n")
    text_file.write("<meta http-equiv=\"pragma\" content=\"no-cache\">\n")
    text_file.write("<meta http-equiv=\"Expires\" content=\"-1\">\n")
    text_file.write("</head>\n")
    text_file.write("<body>\n")
    text_file.write("<div style=\"width:90%;max-width:1000px;margin-left:auto;margin-right:auto\">\n")
    text_file.write("<h1 style=\"text-align:center;\">Classified Cover Modeling Performance for " + taxon_name + "</h1>\n")
    text_file.write(r"<br>" + "\n")
    text_file.write(r"<h2>Model Performance</h2>" + "\n")
    text_file.write("<p>Model performance is measured by sensitivity, specificity, accuracy, and area under curve (auc) for each of the model component classifiers as calculated by averaging 100 iterations of stratified random train-test splits. Each component is a binary classifier that distinguishes between a break in cover. The breaks are coded as follows: the '0' classifier distinguishes cover values greater than 0%, the '10' classifier distinguishes between cover values greater than 10%, and the '25' classifier distinguishes between cover values greater than 25%. Each component was trained separately and a threshold that minimized the absolute value difference between sensitivity and specificity where sensitivity was greater than specificity was selected against 100 random stratified independent partitions of test data. All model metrics except the bootstrap are relative to the independent partitions of test data.</p>\n")
    # Section for the '0' classifier
    text_file.write(r"<h3>Performance of '0' Classifier</h3>" + "\n")
    text_file.write("<p><b>Sensitivity</b> of the '0' Classifier is <b>" + str(round(np.mean(sensitivity_list_0), 3)) + "</b></p>\n")
    text_file.write("<p><b>Specificity</b> of the '0' Classifier is <b>" + str(round(np.mean(specificity_list_0), 3)) + "</b></p>\n")
    text_file.write("<p>Overall <b>Accuracy</b> of the '0' Classifier is <b>" + str(round(np.mean(accuracy_list_0), 3)) + "</b></p>\n")
    text_file.write("<p>The '0' Classifier <b>Out Of Bag Score</b> is <b>" + str(round(np.mean(oob_score_list_0), 3)) + "</b></p>\n")
    text_file.write("<p><b>AUC value</b> of the '0' Classifier is <b>" + str(round(np.mean(auc_list_0), 3)) + "</b></p>\n")
    text_file.write("<p>The Variable Importance plot for the '0' Classifier is shown below:</p>\n")
    text_file.write("<a target='_blank' href='plots/variableImportance_0.png'><img style='display:inline-block;max-width:1000px;width:100%;' src='plots/variableImportance_0.png'></a>\n")
    # Section for the '10' classifier
    text_file.write(r"<h3>Performance of '10' Classifier</h3>" + "\n")
    text_file.write("<p><b>Sensitivity</b> of the '10' Classifier is <b>" + str(round(np.mean(sensitivity_list_10), 3)) + "</b></p>\n")
    text_file.write("<p><b>Specificity</b> of the '10' Classifier is <b>" + str(round(np.mean(specificity_list_10), 3)) + "</b></p>\n")
    text_file.write("<p>Overall <b>Accuracy</b> of the '10' Classifier is <b>" + str(round(np.mean(accuracy_list_10), 3)) + "</b></p>\n")
    text_file.write("<p>The '10' Classifier <b>Out Of Bag Score</b> is <b>" + str(round(np.mean(oob_score_list_10), 3)) + "</b></p>\n")
    text_file.write("<p><b>AUC value</b> of the '10' Classifier is <b>" + str(round(np.mean(auc_list_10), 3)) + "</b></p>\n")
    text_file.write("<p>The Variable Importances plot for the '10' Classifier is shown below:</p>\n")
    text_file.write("<a target='_blank' href='plots/variableImportance_10.png'><img style='display:inline-block;max-width:1000px;width:100%;' src='plots/variableImportance_10.png'></a>\n")
    # Section for the '25' classifier
    text_file.write(r"<h3>Performance of '25' Classifier</h3>" + "\n")
    text_file.write("<p><b>Sensitivity</b> of the '25' Classifier is <b>" + str(round(np.mean(sensitivity_list_25), 3)) + "</b></p>\n")
    text_file.write("<p><b>Specificity</b> of the '25' Classifier is <b>" + str(round(np.mean(specificity_list_25), 3)) + "</b></p>\n")
    text_file.write("<p>Overall <b>Accuracy</b> of the '25' Classifier is <b>" + str(round(np.mean(accuracy_list_25), 3)) + "</b></p>\n")
    text_file.write("<p>The '25' Classifier <b>Out Of Bag Score</b> is <b>" + str(round(np.mean(oob_score_list_25), 3)) + "</b></p>\n")
    text_file.write("<p><b>AUC value</b> of the '25' Classifier is <b>" + str(round(np.mean(auc_list_25), 3)) + "</b></p>\n")
    text_file.write("<p>The Variable Importances plot for the '25' Classifier is shown below:</p>\n")
    text_file.write("<a target='_blank' href='plots/variableImportance_25.png'><img style='display:inline-block;max-width:1000px;width:100%;' src='plots/variableImportance_25.png'></a>\n")
    # Variable correlation section
    text_file.write(r"<h2>Variable Correlation</h2>" + "\n")
    text_file.write("<p>The plot below explores variable correlation. No attempt was made to remove highly correlated variables (shown in the plot dark blue).</p>\n")
    text_file.write("<a target='_blank' href='plots/variableCorrelation.png'><img style='display:inline-block;width:100%;' src='plots/variableCorrelation.png'></a>\n")
    text_file.write("</div>\n")
    text_file.write("</body>\n")
    text_file.write("</html>\n")

# Rename HTML Text to HTML, overwriting any previous report. os.replace also copes with
# a report name that already ends in ".txt": the old remove-then-rename sequence
# deleted the freshly written file in that case and then failed.
os.replace(output_text, output_report)
print('Report saved. Script complete.')