In [1]:
# 5/9/21
# Run Random Forest Subfile

# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Import Random Forest Libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, fbeta_score, precision_recall_curve

In [3]:
# Import Generate Plots File
import import_ipynb
import GeneratePlots

importing Jupyter notebook from GeneratePlots.ipynb


In [4]:
""" Primary function that takes input dataset and runs through all random forest functionality
Inputs: pandas dataframe, X and y test and train sets, run name, text file name
Outputs: feature importances (pandas Series) and the predictions for X_test; also appends the results of the run to the text file
"""
def run_random_forest(df, X_train, X_test, y_train, y_test, runset_name, file_name,
                      n_estimators=100, random_state=None):
    """Fit a random forest on the given split and append its results to a text file.

    Inputs:
        df: pandas dataframe the split was taken from; only its column names
            are used here, to label the feature importances
        X_train, X_test: feature sets used to fit the model and to predict
        y_train, y_test: labels (0 = no fraud, 1 = fraud); y_train must
            support value_counts(), i.e. be a pandas Series
        runset_name: name of the run, written into the report header
        file_name: path of the text file the report is appended to
        n_estimators: number of trees in the forest (default 100)
        random_state: seed passed to RandomForestClassifier; the default
            None keeps the runs unseeded, pass an int for reproducible runs
    Outputs:
        (feature_imp, y_pred): feature importances as a pandas Series in
        model feature order (unsorted), and the predictions for X_test
    """
    # Share of each class in the training labels. .get() keeps the report
    # working when one class is missing from y_train instead of raising KeyError.
    label_counts = y_train.value_counts()
    n_train = len(y_train)
    no_fraud_pct = round(label_counts.get(0, 0) / n_train * 100, 2)
    fraud_pct = round(label_counts.get(1, 0) / n_train * 100, 2)

    # Create and train the random forest classifier
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    clf.fit(X_train, y_train)

    # Label the importances with the columns the model was actually trained on.
    # X_train's own column names are used when they are real df columns;
    # otherwise fall back to the leading df columns, one per fitted feature.
    importances = clf.feature_importances_
    train_columns = getattr(X_train, "columns", None)
    if train_columns is not None and all(col in df.columns for col in train_columns):
        feature_names = train_columns
    else:
        feature_names = df.columns[:len(importances)]
    feature_imp = pd.Series(importances, index=feature_names)
    feature_imp_sorted = feature_imp.sort_values(ascending=False)

    # Create predicted values
    y_pred = clf.predict(X_test)

    # Build the whole report before touching the file, so that a failure in
    # training or scoring never leaves a half-written entry behind.
    report = "".join([
        "Results for " + runset_name + " Random Forest run\n\n",
        "No Frauds: " + str(no_fraud_pct) + "% of the training set\n",
        "Frauds: " + str(fraud_pct) + "% of the training set\n\n",
        str(feature_imp_sorted) + "\n",
        "Confusion Matrix\n",
        str(confusion_matrix(y_test, y_pred)) + "\n\n",
        "Classification Report\n",
        str(classification_report(y_test, y_pred)) + "\n\n",
        "Accuracy Score=" + str(accuracy_score(y_test, y_pred)) + "\n",
        "Precision Score=" + str(precision_score(y_test, y_pred)) + "\n",
        "Recall Score=" + str(recall_score(y_test, y_pred)) + "\n",
        "F2 Score=" + str(fbeta_score(y_test, y_pred, beta=2)) + "\n",
        "----------------------------------------------------------\n",
    ])

    # The context manager closes the file even if the write fails
    with open(file_name, "a") as text_file:
        text_file.write(report)

    return feature_imp, y_pred

In [5]:
"""Wrapper function that runs rull random forest analysis and plots for an input test train split files
Inputs: pandas dataframe, X and y train and test dataframes, run name, text file name
Outputs: predictions for the test set; also stores 2 images in the plots folder and updates the text file with the random forest run results
"""
def random_forest(df, X_train, X_test, y_train, y_test, runset_name, file_name):
    """Run the random forest for one train/test split and request its plots.

    Inputs: pandas dataframe, X and y train and test sets, run name, text file name
    Outputs: the predictions that run_random_forest returns for X_test.
    The text report is written by run_random_forest; the feature importance
    and heatmap plots are delegated to GeneratePlots (where they are saved
    is decided there, not in this function).
    """
    # Fit the model; keep the importances for plotting and the predictions for the caller
    importances, predictions = run_random_forest(
        df, X_train, X_test, y_train, y_test, runset_name, file_name
    )

    # Feature importance plot
    GeneratePlots.gen_feature_plot(df, importances, runset_name)

    # Heatmap of the training data: a copy of the features with the labels
    # put in front as a 'Class' column, so X_train itself is left untouched
    training_frame = X_train.copy()
    training_frame.insert(loc=0, column='Class', value=y_train)
    GeneratePlots.gen_heatmap_plot(training_frame, runset_name)

    return predictions

In [6]:
""" Function that creates a list of potential positive fraud cases to alert customers
Inputs: dataframe of test data, array of fraud or no fraud declarations, runset name, text file name
Outputs: None but stores list of information in text file
"""

def print_fraud_cases(X_test,y_pred,runset_name,file_name):
    """Append the test rows predicted as fraud to a text file.

    Inputs:
        X_test: dataframe of test data (not modified; a copy is used)
        y_pred: predicted labels, one per row of X_test; every row whose
            label is not 0 is listed
        runset_name: name of the run, written into the section header
        file_name: path of the text file the list is appended to
    Outputs: None, the list is stored in the text file
    """
    # Put the predictions in front of a copy of the test data and keep
    # only the rows that were flagged
    flagged = X_test.copy()
    flagged.insert(0, 'Class', y_pred)
    flagged = flagged[flagged['Class'] != 0]

    # Only the first three columns are reported: the prediction plus the
    # first two columns of X_test
    first_columns = flagged.iloc[:, 0:3]

    # The context manager closes the file even if a write fails
    with open(file_name, "a") as text_file:
        text_file.write("Potential Fraud Cases for "+runset_name+" Random Forest run\n\n")

        # to_string() has no trailing newline; add one so the separator
        # below starts on its own line instead of being glued to the last row
        text_file.write(first_columns.to_string() + "\n")

        text_file.write("----------------------------------------------------------\n")