# Random Forest

# Import Dependencies

In [6]:
import pandas as pd
import numpy as np

from config import CSV_FILE_URL
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier

# Read the CSV

In [None]:
# Load the dataset from the location imported from `config` into a dataframe
df = pd.read_csv(filepath_or_buffer=CSV_FILE_URL)
df.shape

# Data Cleaning
Checking to see if there are any null values in the dataset.

In [None]:
# First remove columns that are entirely null, then remove every row that
# still contains at least one null value
df = df.dropna(axis='columns', how='all').dropna()
df.shape

In [None]:
# As can be seen, there are no null values in the dataset.

# Data Exploration
Checking the data distribution with respect to the target feature.

In [None]:
# Count the non-null entries of every column for each value of `Class`
df.groupby('Class').count()

In [None]:
# Rows per class, taken from the non-null count of `Time` (renamed to
# `Count`) and shown as a bar chart
grouped_df = (
    df[['Class', 'Time']]
    .groupby('Class')
    .count()
    .rename(columns={"Time": "Count"})
)
grouped_df.plot(kind="bar")

# The plot above shows that the data is highly imbalanced. The dataset has only ___ fraudulent transactions out of a total of ____,____ transactions, which is a mere ____%. (TODO: fill in the figures from the class counts above.)

# Defining features
Defining the predictor features and the target feature.

In [None]:
# List the column names of the cleaned dataframe
df.columns

In [None]:
# Selecting predictor features. This will be used as x values.
# The predictors are `Time`, the columns `V1` ... `V28`, and `Amount`.
predictor_columns = ['Time'] + [f'V{i}' for i in range(1, 29)] + ['Amount']
selected_features = df[predictor_columns]
selected_features.shape

In [None]:
# The predictors go to X and the `Class` column goes to y.
# y is kept as a two-dimensional (n_rows, 1) array; later cells flatten it
# wherever a one-dimensional target is passed.

X = selected_features
y = df[['Class']].to_numpy().reshape(-1, 1)

print("Shape: ", X.shape, y.shape)

In [None]:
# Create a Train Test Split
# Using Class for the y values (target feature)

In [None]:
# Using sklearn's `train_test_split` to split the data into training and testing datasets.
# `random_state=42` fixes the shuffle so the split is the same on every run; no
# `test_size` is passed, so sklearn's default split proportion applies.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Show the shapes of the four arrays produced by the split
print("Shape: ", X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
# Data Pre-processing
# Scale the data using the MinMaxScaler

In [None]:
# Fit the scaler on the training predictors only, then apply that same
# fitted scaling to both the training and the testing predictors
X_minmax = MinMaxScaler()
X_minmax.fit(X_train)

X_train_scaled = X_minmax.transform(X_train)
X_test_scaled = X_minmax.transform(X_test)

# Random Forest Classifier Model
Train the Model

In [None]:
# Build the baseline random forest with 200 trees.
# `RandomForestClassifier` is the class imported from sklearn.ensemble at the
# top of the notebook. `random_state` seeds the forest's randomness so a
# Restart & Run All reproduces the same model.
randomTreeModel = RandomForestClassifier(n_estimators=200, random_state=42)
randomTreeModel

In [None]:
# Fit on the scaled training predictors; the (n, 1) target is flattened to 1-D
randomTreeModel.fit(X_train_scaled, y_train.flatten())

In [None]:
# Score the baseline model on the training split and on the held-out test split
train_score = randomTreeModel.score(X_train_scaled, y_train)
test_score = randomTreeModel.score(X_test_scaled, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

In [None]:
# Predict a class label for every row of the scaled test set
randomTreePredictions = randomTreeModel.predict(X_test_scaled)

In [None]:
# Keep only the second column of the predicted class probabilities
# (presumably the fraudulent class, label 1 — confirm against randomTreeModel.classes_)
randomTreePredictProba = randomTreeModel.predict_proba(X_test_scaled)[:,1]

In [None]:
# Evaluation summary for the baseline random forest on the test split:
# scalar metrics, classification report, precision-recall curve and
# confusion matrix.
y_true = y_test.flatten()

print("Random Forest Classifier")
print("========================")
print("Accuracy: ", accuracy_score(y_true, randomTreePredictions))
print("Precision: ", precision_score(y_true, randomTreePredictions))
print("Recall: ", recall_score(y_true, randomTreePredictions))
print("F1-Score: ", f1_score(y_true, randomTreePredictions))
# ROC AUC measures how well the scores rank the positives, so it is computed
# from the predicted probabilities rather than from the hard 0/1 labels
print("AUC score: ", roc_auc_score(y_true, randomTreePredictProba))
print(classification_report(y_true, randomTreePredictions,
                            target_names=["Non Fraudulent", "Fraudulent"]))

# precision_recall_curve returns (precision, recall, thresholds); recall is
# plotted on the x-axis so the curve matches the axis labels
p, r, t = precision_recall_curve(y_true, randomTreePredictProba)
plt.figure(figsize=(6, 6))
plt.plot(r, p)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision Recall Curve - Random Forest Classifier')
plt.show()

confusionMatrix = confusion_matrix(y_true, randomTreePredictions)
plt.figure(figsize=(6, 6))
sns.heatmap(confusionMatrix, xticklabels=['Non Fraudulent', 'Fraudulent'],
            yticklabels=['Non Fraudulent', 'Fraudulent'], annot=True, fmt="d")
plt.title("Confusion matrix - Random Forest Classifier")
plt.ylabel('Actual Class')
plt.xlabel('Predicted class')
plt.show()

# Hyperparameter Tuning
Use GridSearchCV to tune the model's parameters

In [None]:
# List the names of the parameters exposed by the model (candidates for tuning)
randomTreeModel.get_params().keys()

In [None]:
# Create the GridSearchCV model
# Create the GridSearch estimator along with a parameter object containing the values to adjust
# - `class_weight`: a dict option may only name labels that occur in the target
#   (0 and 1 here), so only the string presets and None are searched.
# - `max_features`: 'auto' is no longer accepted by current scikit-learn
#   (for classifiers it was an alias of 'sqrt'), so it is not listed.
randomTreeParamGrid = {'n_estimators': [100, 200, 300, 400, 500],
                         'criterion': ['gini','entropy'],
                         'class_weight': ["balanced","balanced_subsample", None],
                         'max_features': ['sqrt','log2',None],
                        }
randomTreeGrid = GridSearchCV(randomTreeModel, randomTreeParamGrid, verbose=3)

In [None]:
# Train the model with GridSearch: every parameter combination in the grid is
# evaluated with cross-validation on the scaled training data
randomTreeGrid.fit(X_train_scaled, y_train.flatten())

In [None]:
# Best parameter combination found by the search and its cross-validated score
print(randomTreeGrid.best_params_)
print(randomTreeGrid.best_score_)

In [None]:
# Make predictions on the scaled test set with the hyperparameter tuned model
# (calling predict on the fitted search object uses its best estimator)
randomTreeGridPredictions = randomTreeGrid.predict(X_test_scaled)

In [None]:
# Print classification report for the grid-search predictions on the test split
print(classification_report(y_test.flatten(), randomTreeGridPredictions,
                            target_names=["Non Fraudulent", "Fraudulent"]))

In [None]:
# Random forest with explicitly chosen hyperparameters.
# `splitter` is a DecisionTreeClassifier option that RandomForestClassifier
# does not accept, so it is not passed here.
# NOTE(review): criterion/max_depth are hard-coded — confirm they reflect the
# tuning results this model is meant to represent.
hypertunedrandomTreeModel = RandomForestClassifier(criterion='gini', max_depth=8, random_state=42)
hypertunedrandomTreeModel

In [None]:
# Fit the tuned model on the scaled training predictors (target flattened to 1-D)
hypertunedrandomTreeModel.fit(X_train_scaled, y_train.flatten())

In [None]:
# Score the tuned model on the training split and on the held-out test split
tuned_train_score = hypertunedrandomTreeModel.score(X_train_scaled, y_train)
tuned_test_score = hypertunedrandomTreeModel.score(X_test_scaled, y_test)
print(f"Training Data Score: {tuned_train_score}")
print(f"Testing Data Score: {tuned_test_score}")

In [None]:
# Predict a class label for every row of the scaled test set with the tuned model
hypertunedrandomTreeModelPredictions = hypertunedrandomTreeModel.predict(X_test_scaled)

In [None]:
# Keep only the second column of the tuned model's predicted probabilities
# (presumably the fraudulent class, label 1 — confirm against the model's classes_)
hypertunedrandomTreeModelPredictProba = hypertunedrandomTreeModel.predict_proba(X_test_scaled)[:,1]

In [None]:
# Print the classification report for the tuned model's test-set predictions
print(classification_report(y_test.flatten(), hypertunedrandomTreeModelPredictions,
                            target_names=["Non Fraudulent", "Fraudulent"]))

In [None]:
# Evaluation summary for the hyperparameter-tuned random forest on the test
# split. This cell reports the tuned model's predictions and probabilities
# (not the baseline model's), so the numbers describe the model fitted above.
y_true = y_test.flatten()

print("Hypertuned Random Forest Classifier")
print("===================================")
print("Accuracy: ", accuracy_score(y_true, hypertunedrandomTreeModelPredictions))
print("Precision: ", precision_score(y_true, hypertunedrandomTreeModelPredictions))
print("Recall: ", recall_score(y_true, hypertunedrandomTreeModelPredictions))
print("F1-Score: ", f1_score(y_true, hypertunedrandomTreeModelPredictions))
# ROC AUC is computed from the predicted probabilities, not the hard labels
print("AUC score: ", roc_auc_score(y_true, hypertunedrandomTreeModelPredictProba))
print(classification_report(y_true, hypertunedrandomTreeModelPredictions,
                            target_names=["Non Fraudulent", "Fraudulent"]))

# precision_recall_curve returns (precision, recall, thresholds); recall is
# plotted on the x-axis so the curve matches the axis labels
p, r, t = precision_recall_curve(y_true, hypertunedrandomTreeModelPredictProba)
plt.figure(figsize=(6, 6))
plt.plot(r, p)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision Recall Curve - Hypertuned Random Forest Classifier')
plt.show()

confusionMatrix = confusion_matrix(y_true, hypertunedrandomTreeModelPredictions)
plt.figure(figsize=(6, 6))
sns.heatmap(confusionMatrix, xticklabels=['Non Fraudulent', 'Fraudulent'],
            yticklabels=['Non Fraudulent', 'Fraudulent'], annot=True, fmt="d")
plt.title("Confusion matrix - Hypertuned Random Forest Classifier")
plt.ylabel('Actual Class')
plt.xlabel('Predicted class')
plt.show()

# Feature Selection

In [None]:
# Pair each predictor name with the tuned model's importance for it and list
# the pairs from most to least important
feature_names = selected_features.columns
importance_by_feature = zip(hypertunedrandomTreeModel.feature_importances_, feature_names)
sorted(importance_by_feature, reverse=True)

# Choosing Top features

In [None]:
# Set top features. This will be used as x values.
top_feature_names = [
    'V17', 'V14', 'V10', 'V12', 'V15', 'V27', 'V3',
    'V16', 'V18', 'V7', 'V1', 'V24', 'V8', 'V4',
    'V6', 'V26', 'V20', 'V5', 'V21', 'V19', 'V23',
]
top_features = df[top_feature_names]

top_features.head(1)

# Create a Train Test Split
Use Class for the y values

In [None]:
# The top-feature predictors go to top_X and the `Class` column goes to top_y.
# top_y is kept as a two-dimensional (n_rows, 1) array; later cells flatten it
# wherever a one-dimensional target is passed.

top_X = top_features
top_y = df[['Class']].to_numpy().reshape(-1, 1)

print("Shape: ", top_X.shape, top_y.shape)

In [None]:
# Split the top-feature data into training and testing sets with a fixed seed (42)
top_X_train, top_X_test, top_y_train, top_y_test = train_test_split(top_X, top_y, random_state=42)

In [None]:
# Show the shapes of the top-feature training and testing arrays
print(top_X_train.shape,top_X_test.shape,top_y_train.shape, top_y_test.shape)

# Pre-processing
Scale the data using the MinMaxScaler

In [None]:
# Fit the scaler on the top-feature training predictors only, then apply that
# same fitted scaling to both the training and the testing predictors
top_X_minmax = MinMaxScaler()
top_X_minmax.fit(top_X_train)

top_X_train_scaled = top_X_minmax.transform(top_X_train)
top_X_test_scaled = top_X_minmax.transform(top_X_test)

# Random Forest Model with Top features
Train the Model

In [None]:
# Baseline random forest for the top-feature subset, using the
# `RandomForestClassifier` imported at the top of the notebook.
# `random_state` seeds the forest's randomness so re-runs give the same model.
top_randomTreeModel = RandomForestClassifier(random_state=42)
top_randomTreeModel

In [None]:
# Fit on the scaled top-feature training predictors; target flattened to 1-D
top_randomTreeModel.fit(top_X_train_scaled, top_y_train.flatten())

In [None]:
# Score the top-feature model on its training split and on its test split
top_train_score = top_randomTreeModel.score(top_X_train_scaled, top_y_train)
top_test_score = top_randomTreeModel.score(top_X_test_scaled, top_y_test)
print(f"Training Data Score: {top_train_score}")
print(f"Testing Data Score: {top_test_score}")

In [None]:
# Predict a class label for every row of the scaled top-feature test set
top_randomTreePredictions = top_randomTreeModel.predict(top_X_test_scaled)

In [None]:
# Keep only the second column of the predicted class probabilities
# (presumably the fraudulent class, label 1 — confirm against top_randomTreeModel.classes_)
top_randomTreePredictProba = top_randomTreeModel.predict_proba(top_X_test_scaled)[:,1]

In [None]:
# Classification report and overall accuracy of the top-feature model on its test split
print(classification_report(top_y_test.flatten(), top_randomTreePredictions,
                            target_names=["Non Fraudulent", "Fraudulent"]))
print("Accuracy:", accuracy_score(top_y_test.flatten(), top_randomTreePredictions))

In [None]:
# Evaluation summary for the top-feature random forest on its own test split.
# The metrics use the top-feature model's predictions and probabilities
# together with top_y_test, so they describe the model trained in this section.
top_y_true = top_y_test.flatten()

print("Random Forest Classifier (top features)")
print("=======================================")
print("Accuracy: ", accuracy_score(top_y_true, top_randomTreePredictions))
print("Precision: ", precision_score(top_y_true, top_randomTreePredictions))
print("Recall: ", recall_score(top_y_true, top_randomTreePredictions))
print("F1-Score: ", f1_score(top_y_true, top_randomTreePredictions))
# ROC AUC is computed from the predicted probabilities, not the hard labels
print("AUC score: ", roc_auc_score(top_y_true, top_randomTreePredictProba))
print(classification_report(top_y_true, top_randomTreePredictions,
                            target_names=["Non Fraudulent", "Fraudulent"]))

# precision_recall_curve returns (precision, recall, thresholds); recall is
# plotted on the x-axis so the curve matches the axis labels
p, r, t = precision_recall_curve(top_y_true, top_randomTreePredictProba)
plt.figure(figsize=(6, 6))
plt.plot(r, p)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision Recall Curve - Random Forest Classifier (top features)')
plt.show()

confusionMatrix = confusion_matrix(top_y_true, top_randomTreePredictions)
plt.figure(figsize=(6, 6))
sns.heatmap(confusionMatrix, xticklabels=['Non Fraudulent', 'Fraudulent'],
            yticklabels=['Non Fraudulent', 'Fraudulent'], annot=True, fmt="d")
plt.title("Confusion matrix - Random Forest Classifier (top features)")
plt.ylabel('Actual Class')
plt.xlabel('Predicted class')
plt.show()

# Hyperparameter Tuning
Use GridSearchCV to tune the model's parameters

In [None]:
# List the names of the parameters exposed by the top-feature model
top_randomTreeModel.get_params().keys()

In [None]:
# Create the GridSearchCV model
# Create the GridSearch estimator along with a parameter object containing the values to adjust
# Only parameters that RandomForestClassifier accepts are searched; `splitter`
# belongs to DecisionTreeClassifier and would make the search fail with an
# invalid-parameter error.
top_randomTreeParamGrid = {'criterion':['gini', 'entropy'],
                             'max_depth': [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
top_randomTreeGrid = GridSearchCV(top_randomTreeModel, top_randomTreeParamGrid, verbose=3)

In [None]:
# Train the model with GridSearch: every parameter combination in the grid is
# evaluated with cross-validation on the scaled top-feature training data
top_randomTreeGrid.fit(top_X_train_scaled, top_y_train.flatten())

In [None]:
# Best parameter combination found by the search and its cross-validated score
print(top_randomTreeGrid.best_params_)
print(top_randomTreeGrid.best_score_)

In [None]:
# Make predictions on the scaled top-feature test set with the hyperparameter tuned model
# (calling predict on the fitted search object uses its best estimator)
top_randomTreeGridPredictions = top_randomTreeGrid.predict(top_X_test_scaled)

In [None]:
# Classification report for the grid-search predictions on the top-feature test split
print(classification_report(top_y_test.flatten(), top_randomTreeGridPredictions,
                            target_names=["Non Fraudulent", "Fraudulent"]))

In [None]:
# Top-feature random forest with explicitly chosen hyperparameters.
# `splitter` is a DecisionTreeClassifier option that RandomForestClassifier
# does not accept, so it is not passed here.
# NOTE(review): criterion/max_depth are hard-coded — confirm they reflect the
# tuning results this model is meant to represent.
top_hypertunedrandomTreeModel = RandomForestClassifier(criterion='entropy', max_depth=7, random_state=42)
top_hypertunedrandomTreeModel

In [None]:
# Fit the tuned top-feature model on the scaled training predictors (target flattened to 1-D)
top_hypertunedrandomTreeModel.fit(top_X_train_scaled, top_y_train.flatten())

In [None]:
# Score the tuned top-feature model on its training split and on its test split
top_tuned_train_score = top_hypertunedrandomTreeModel.score(top_X_train_scaled, top_y_train)
top_tuned_test_score = top_hypertunedrandomTreeModel.score(top_X_test_scaled, top_y_test)
print(f"Training Data Score: {top_tuned_train_score}")
print(f"Testing Data Score: {top_tuned_test_score}")

In [None]:
# Predict a class label for every row of the scaled top-feature test set with the tuned model
top_hypertunedrandomTreeModelPredictions = top_hypertunedrandomTreeModel.predict(top_X_test_scaled)

In [None]:
# Keep only the second column of the tuned top-feature model's predicted probabilities
# (presumably the fraudulent class, label 1 — confirm against the model's classes_)
top_hypertunedrandomTreeModelPredictProba = top_hypertunedrandomTreeModel.predict_proba(top_X_test_scaled)[:,1]

In [None]:
# Classification report for the tuned top-feature model's test-set predictions
print(classification_report(top_y_test.flatten(), top_hypertunedrandomTreeModelPredictions,
                            target_names=["Non Fraudulent", "Fraudulent"]))

In [None]:
# Evaluation summary for the hyperparameter-tuned top-feature random forest on
# its own test split. The metrics use this model's predictions and
# probabilities together with top_y_test (not the baseline full-feature model).
top_y_true = top_y_test.flatten()

print("Hypertuned Random Forest Classifier (top features)")
print("==================================================")
print("Accuracy: ", accuracy_score(top_y_true, top_hypertunedrandomTreeModelPredictions))
print("Precision: ", precision_score(top_y_true, top_hypertunedrandomTreeModelPredictions))
print("Recall: ", recall_score(top_y_true, top_hypertunedrandomTreeModelPredictions))
print("F1-Score: ", f1_score(top_y_true, top_hypertunedrandomTreeModelPredictions))
# ROC AUC is computed from the predicted probabilities, not the hard labels
print("AUC score: ", roc_auc_score(top_y_true, top_hypertunedrandomTreeModelPredictProba))
print(classification_report(top_y_true, top_hypertunedrandomTreeModelPredictions,
                            target_names=["Non Fraudulent", "Fraudulent"]))

# precision_recall_curve returns (precision, recall, thresholds); recall is
# plotted on the x-axis so the curve matches the axis labels
p, r, t = precision_recall_curve(top_y_true, top_hypertunedrandomTreeModelPredictProba)
plt.figure(figsize=(6, 6))
plt.plot(r, p)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision Recall Curve - Hypertuned Random Forest Classifier (top features)')
plt.show()

confusionMatrix = confusion_matrix(top_y_true, top_hypertunedrandomTreeModelPredictions)
plt.figure(figsize=(6, 6))
sns.heatmap(confusionMatrix, xticklabels=['Non Fraudulent', 'Fraudulent'],
            yticklabels=['Non Fraudulent', 'Fraudulent'], annot=True, fmt="d")
plt.title("Confusion matrix - Hypertuned Random Forest Classifier (top features)")
plt.ylabel('Actual Class')
plt.xlabel('Predicted class')
plt.show()

# Save the Model

In [None]:
# Persist the tuned full-feature model to disk with joblib.
# NOTE(review): the fitted MinMaxScaler (X_minmax) is not saved here; the model
# was trained on scaled inputs, so the same scaling must be applied before it
# is used for prediction elsewhere.
filename = 'RandomTreeClassifierModel.sav'
joblib.dump(hypertunedrandomTreeModel, filename)