In [1]:
# 5/3/21
# Run Random Forest Subfile

# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Import Random Forest Libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, fbeta_score, precision_recall_curve

In [3]:
def run_random_forest(df, X_train, X_test, y_train, y_test, runset_name, file_name,
                      n_estimators=25, random_state=None):
    """Train a random forest on a pre-split dataset and append a results report to a text file.

    The train/test split is done by the caller; this function only fits the
    model, scores it on the test split and logs the results.

    Inputs:
        df: pandas DataFrame the split was taken from. Only used as a fallback
            source of feature names (its leading columns) when X_train has no
            `columns` attribute.
        X_train, X_test: feature matrices used for fitting and for prediction.
        y_train, y_test: class labels. y_train must provide `value_counts()`
            (e.g. a pandas Series); labels 0 and 1 are reported as
            "No Frauds" and "Frauds" respectively.
        runset_name: label written in the report header.
        file_name: path of the report file, opened in append mode.
        n_estimators: number of trees in the forest. Defaults to 25 (rather
            than 100) to reduce runtime.
        random_state: forwarded to RandomForestClassifier; pass an int to make
            runs repeatable. The default (None) leaves the forest unseeded.

    Outputs:
        pandas Series of feature importances indexed by feature name, in
        feature order (not sorted). The report appended to `file_name` lists
        the training class balance, the importances sorted in descending
        order, the confusion matrix, the classification report and the
        accuracy, precision, recall and F2 scores on the test split.

    The report file is opened only after every result has been computed, so a
    run that fails part-way leaves the file untouched and never leaves an open
    handle behind.
    """
    # Class balance of the training labels. `.get(label, 0)` keeps a split
    # that happens to contain only one class from raising KeyError.
    class_counts = y_train.value_counts()
    n_train = len(y_train)
    pct_no_fraud = round(class_counts.get(0, 0) / n_train * 100, 2)
    pct_fraud = round(class_counts.get(1, 0) / n_train * 100, 2)

    # Create and train the random forest classifier
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    clf.fit(X_train, y_train)

    # Label the importances with the columns the model was actually fitted on;
    # fall back to the leading columns of df when X_train carries no names.
    feature_names = getattr(X_train, "columns", None)
    if feature_names is None:
        feature_names = df.columns[:len(clf.feature_importances_)]
    feature_imp = pd.Series(clf.feature_importances_, index=feature_names)
    feature_imp_sorted = feature_imp.sort_values(ascending=False)

    # Create predicted values
    y_pred = clf.predict(X_test)

    # Assemble the whole report up front so it is written in one go
    report_parts = [
        "Results for " + runset_name + " Random Forest run\n\n",
        'No Frauds: ' + str(pct_no_fraud) + '% of the training set\n',
        'Frauds: ' + str(pct_fraud) + '% of the training set\n\n',
        str(feature_imp_sorted) + "\n",
        "Confusion Matrix\n",
        str(confusion_matrix(y_test, y_pred)) + "\n\n",
        "Classification Report\n",
        str(classification_report(y_test, y_pred)) + "\n\n",
        "Accuracy Score=",
        str(accuracy_score(y_test, y_pred)) + "\n",
        "Precision Score=",
        str(precision_score(y_test, y_pred)) + '\n',
        "Recall Score=",
        str(recall_score(y_test, y_pred)) + '\n',
        "F2 Score=",
        str(fbeta_score(y_test, y_pred, beta=2)) + '\n',
        "----------------------------------------------------------\n",
    ]

    # The context manager guarantees the file is closed even if a write fails
    with open(file_name, "a") as text_file:
        text_file.writelines(report_parts)

    return feature_imp