## Credit Card Fraud Detection

In this project you will predict fraudulent credit card transactions with the help of Machine learning models. Please import the following libraries to get started.

# 1. Initial Basic Steps
1. Importing necessary libraries
2. Loading data
3. Observe basic structure of data

## 1.1 Import necessary libraries

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn import metrics
from sklearn import preprocessing
import time
from pprint import pprint

## 1.2 Load data and observe basic structure

In [None]:
# Read the Kaggle credit-card transactions CSV into a DataFrame and preview the first rows
csv_path = '/kaggle/input/creditcardfraud/creditcard.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Summarise the DataFrame: column names, non-null counts and dtypes
df.info()

Here we will observe the distribution of our classes

# 2. Exploratory data analysis

In [None]:
# Count the rows per class label and express labels 0 and 1 as percentages of all rows
classes = df['Class'].value_counts()
total_rows = df['Class'].count()
normal_share = classes[0] / total_rows * 100
fraud_share = classes[1] / total_rows * 100

In [None]:
# Side-by-side bar charts: absolute class counts (left) and percentage shares (right)
bar_labels = ["normal_share", "fraud_share"]
fig, axs = plt.subplots(1, 2)
axs[0].bar(x=bar_labels, height=classes)
axs[0].set_title("Counts")
axs[1].bar(x=bar_labels, height=[normal_share, fraud_share])
axs[1].set_title("Percentage")

In [None]:
# Scatter the class label against the Time column to see how the classes spread over time
plt.figure(figsize=(15, 5))
plt.scatter(df["Time"], df["Class"])


In [None]:
# Scatter the class label against the Amount column to see how the classes spread over amounts
plt.figure(figsize=(15, 5))
plt.scatter(df["Amount"], df["Class"])

In [None]:
# Remove the Time column so that it is not carried forward as a feature
df = df.drop(columns=["Time"])

### Splitting the data into train & test data

In [None]:
# Target: the Class column; features: every remaining column
y = df["Class"]
X = df.drop(columns=["Class"])

In [None]:
from sklearn import model_selection

# Hold-out split, stratified on the target so both parts keep the same class ratio; fixed seed
split_parts = model_selection.train_test_split(X, y, stratify=y, random_state=100)
X_train, X_test, y_train, y_test = split_parts

##### Preserve X_test & y_test to evaluate on the test data once you build the model

In [None]:
# Number of positive (Class == 1) rows overall, in the training split and in the test split
for labels in (y, y_train, y_test):
    print(np.sum(labels))

### Plotting the distribution of a variable

In [None]:
# Plot a histogram of every training feature to inspect skewness
fig, axs = plt.subplots(6, 5)
# Pair each axes (row-major order) with a column. zip stops at the shorter of the two, so
# spare axes simply stay empty and no blanket try/except is needed to guard against running
# out of columns; genuine plotting errors are no longer silently swallowed.
for ax, column in zip(axs.flat, X_train.columns):
    ax.hist(X_train[column], bins=100)
    ax.set_title(column)
fig.set_size_inches(22, 24)

`We can see that many variables are highly skewed, so let's list the variables whose skew is greater than 0.5 or less than -0.5`

In [None]:
# Skewness of every training feature
skew = X_train.skew()
# Keep only the features whose absolute skewness exceeds 0.5
is_skewed = skew.abs() > 0.5
skewed = skew[is_skewed]
print(skewed)
print("The total number of features with skewness more than 0.5 or less than -0.5 are ", len(skewed))

### If there is skewness present in the distribution use:
- <b>Power Transformer</b> package present in the <b>preprocessing library provided by sklearn</b> to make distribution more gaussian

In [None]:
# Fit the power transform (Yeo-Johnson by default) on the training data only, then apply it
# to both splits. The transformed values are assigned back explicitly instead of relying on
# copy=False to mutate the DataFrames in place, which is not guaranteed for pandas inputs;
# previously the returned arrays were simply discarded.
power_trans = preprocessing.PowerTransformer()
X_train = pd.DataFrame(power_trans.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(power_trans.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
# Plot the per-feature histograms again to see the effect of the power transform
fig, axs = plt.subplots(6, 5)
# Pair each axes (row-major order) with a column. zip stops at the shorter of the two, so
# spare axes simply stay empty and no blanket try/except is needed to guard against running
# out of columns; genuine plotting errors are no longer silently swallowed.
for ax, column in zip(axs.flat, X_train.columns):
    ax.hist(X_train[column], bins=100)
    ax.set_title(column)
fig.set_size_inches(22, 24)

In [None]:
# Recompute the skewness of every training feature after the power transform
skew = X_train.skew()
# Keep only the features whose absolute skewness still exceeds 0.5
is_skewed = skew.abs() > 0.5
skewed = skew[is_skewed]
print(skewed)
print("The total number of features with skewness more than 0.5 or less than -0.5 are ", len(skewed))

# 3. Model Building
- Build different models on the imbalanced dataset and see the result

## 3.1 Logistic regression

In [None]:
# Logistic Regression
from sklearn import linear_model #import the package

#### Perform cross-validation on X_train & y_train to create:
- X_train_cv
- X_test_cv 
- y_train_cv
- y_test_cv 

### 3.1.1 Cross validation done manually

In [None]:
# Manual 5-fold stratified cross-validation of a default logistic regression on the training split
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
logistic_model = linear_model.LogisticRegression(random_state=100)
# One list per metric; each receives one value per fold
accuracy_scores, recall_scores, precission_scores, AUC_ROC = [], [], [], []
for train_index, test_index in skf.split(X_train, y_train):
    # Select this fold's fitting and validation rows by position
    X_train_cv = X_train.iloc[train_index]
    X_test_cv = X_train.iloc[test_index]
    y_train_cv = y_train.iloc[train_index]
    y_test_cv = y_train.iloc[test_index]
    logistic_model.fit(X_train_cv, y_train_cv)
    # Hard labels for accuracy/recall/precision, second probability column for ROC AUC
    predictions = logistic_model.predict(X_test_cv)
    pred_proba = logistic_model.predict_proba(X_test_cv)[:, 1]
    accuracy_scores.append(metrics.accuracy_score(y_test_cv, predictions))
    recall_scores.append(metrics.recall_score(y_test_cv, predictions))
    precission_scores.append(metrics.precision_score(y_test_cv, predictions))
    AUC_ROC.append(metrics.roc_auc_score(y_test_cv, pred_proba))
# Report each metric averaged over the folds
print("The average accurcay score is", np.mean(accuracy_scores))
print("The average recall score is", np.mean(recall_scores))
print("The average precision score is", np.mean(precission_scores))
print("The average ROC AUC score is", np.mean(AUC_ROC))

### 3.1.2 Hyperparameter Tuning

In [None]:
start = time.time()
# Search space for logistic regression: three solvers, L2 penalty and several values of C
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
params = {"solver": solvers, "penalty": penalty, "C": c_values}
# Shuffled, stratified 5-fold splitter and the base estimator
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
logistic_model = linear_model.LogisticRegression(random_state=100, max_iter=1000)
# Exhaustive search over params; candidates are ranked by ROC AUC and a failed fit scores 0
grid_search = model_selection.GridSearchCV(
    estimator=logistic_model,
    param_grid=params,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_train, y_train)
end = time.time()
elapsed_minutes = round((end - start) / 60, 2)
print("Time taken to run this is: ", elapsed_minutes, " minutes")

In [None]:
# Show the estimator that achieved the best cross-validated score in the search
grid_result.best_estimator_

In [None]:
# Show the best mean cross-validated score found by the search
grid_result.best_score_

In [None]:
# Print the parameter combination that gave the best cross-validated score
print(grid_result.best_params_)

### 3.1.3 Logistic regression model using best params

In [None]:
# Refit logistic regression on the whole training split with hard-coded hyperparameters
# (presumably the grid-search optimum from an earlier run; verify against best_params_)
logistic_model = linear_model.LogisticRegression(
    penalty='l2', C=0.01, solver='liblinear', max_iter=1000, random_state=100
)
logistic_model.fit(X_train, y_train)
# Second probability column, i.e. the score for the target being 1, for every test row
predict_proba = logistic_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, predict_proba)
# Draw the ROC curve to judge which TPR/FPR trade-off is workable
plt.plot(fpr, tpr)
plt.ylabel("TPR")
plt.xlabel("FPR")
plt.title("ROC Curve")

From the plot we can see that 0.85 will be a good value for TPR

In [None]:
# Show how the true-positive rate varies with the decision threshold so a cut-off can be read off
plt.plot(thresholds, tpr)
plt.ylabel("TPR")
plt.xlabel("Thresholds")
plt.title("TPR vs Thresholds")

From the above graph, the threshold corresponding to TPR = 0.85 should be around 0.06.

In [None]:
# Turn the predicted probabilities into class labels with a fixed cut-off.
# The cut-off applied here is 0.07 (the accompanying notes quote roughly 0.06 from the plot).
decision_threshold = 0.07
# Labels for the training rows
y_train_pred = logistic_model.predict_proba(X_train)[:, 1] > decision_threshold
# Labels for the test rows, reusing the probabilities computed for the ROC curve
y_pred = predict_proba > decision_threshold

### 3.1.4 Logistic regression Model evaluation

In [None]:
# Recall and precision of the thresholded predictions: training split first, then test split
for split_name, y_true, y_hat in (("train", y_train, y_train_pred), ("test", y_test, y_pred)):
    print(f"The recall score for the {split_name} data is: ", metrics.recall_score(y_true, y_hat))
    print(f"The precision score for the {split_name} data is: ", metrics.precision_score(y_true, y_hat))

## 3.2 Random Forest

In [None]:
# Import Random forest classifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Instantiate a random forest with default settings, purely to inspect its tunable parameters
rf = RandomForestClassifier()
# Pretty-print the estimator's parameter names and their current values
pprint(rf.get_params())

### 3.2.1 First try a randomized search to get an idea of the range of hyperparameters

In [None]:
# Candidate hyperparameter values for the random forest search.
# Number of trees in the forest: 10 evenly spaced values from 100 to 2000
n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=10)]
# Number of features to consider at every split. "auto" is deliberately left out: for a
# classifier it was only an alias of "sqrt", and scikit-learn >= 1.3 rejects it, which
# (with error_score=0 in the searches) would silently score every such candidate as 0.
max_features = ["sqrt", "log2"]
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None for unlimited depth
max_depth = [int(x) for x in np.linspace(10, 110, num=11)] + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 7, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
# NOTE(review): defined but not included in grid_params below
bootstrap = [True, False]
# Criterion for quality of a split
criterion = ["gini", "entropy"]

# Assemble the search space keyed by RandomForestClassifier parameter name
grid_params = {
    "n_estimators": n_estimators,
    "criterion": criterion,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "max_features": max_features,
}
pprint(grid_params)

In [None]:
start = time.time()
# Shuffled, stratified 5-fold splitter used for every sampled candidate
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
# Base estimator whose hyperparameters are being searched
rf = RandomForestClassifier(random_state=100)
# Randomised search: 20 candidates sampled from grid_params, ranked by ROC AUC; failed fits score 0
random_grid_search = model_selection.RandomizedSearchCV(
    estimator=rf,
    param_distributions=grid_params,
    n_iter=20,
    scoring='roc_auc',
    n_jobs=-1,
    cv=skf,
    random_state=100,
    error_score=0,
)
# Run the search on the training split
random_grid_search.fit(X_train, y_train)

end = time.time()
elapsed_minutes = round((end - start) / 60, 2)
print("Time taken to run this is: ", elapsed_minutes, " minutes")

In [None]:
# Print the estimator that achieved the best cross-validated score in the randomised search
print(random_grid_search.best_estimator_)

In [None]:
# Print the best mean cross-validated score found by the randomised search
print(random_grid_search.best_score_)

In [None]:
# Print the sampled parameter combination that gave the best cross-validated score
print(random_grid_search.best_params_)

### 3.2.2 Grid Search to find the best hyperparameters

In [None]:
# Candidate hyperparameter values for the random forest search.
# Number of trees in the forest: 10 evenly spaced values from 100 to 2000
n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=10)]
# Number of features to consider at every split. "auto" is deliberately left out: for a
# classifier it was only an alias of "sqrt", and scikit-learn >= 1.3 rejects it, which
# (with error_score=0 in the searches) would silently score every such candidate as 0.
max_features = ["sqrt", "log2"]
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None for unlimited depth
max_depth = [int(x) for x in np.linspace(10, 110, num=11)] + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 7, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
# NOTE(review): defined but not included in grid_params below
bootstrap = [True, False]
# Criterion for quality of a split
criterion = ["gini", "entropy"]

# Assemble the search space keyed by RandomForestClassifier parameter name
grid_params = {
    "n_estimators": n_estimators,
    "criterion": criterion,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "max_features": max_features,
}
pprint(grid_params)

In [None]:
start = time.time()
# Create StratifiedKFold object
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
# Create RandomForestClassifier object
rf = RandomForestClassifier(random_state=100)
# Exhaustive grid search. GridSearchCV takes the candidate values as `param_grid` and has no
# `random_state` argument (both belong to RandomizedSearchCV); passing them raised a TypeError.
# NOTE(review): this evaluates the full Cartesian product of grid_params (thousands of
# candidates, each fitted once per fold) - consider narrowing the grid before running.
grid_search = model_selection.GridSearchCV(
    estimator=rf,
    param_grid=grid_params,
    scoring='roc_auc',
    n_jobs=-1,
    cv=skf,
    error_score=0,
)
# Fit the model on train data
grid_search.fit(X_train, y_train)

end = time.time()
print("Time taken to run this is: ", round((end-start)/60, 2), " minutes")

In [None]:
# Print the estimator that achieved the best cross-validated score in the grid search
print(grid_search.best_estimator_)

In [None]:
# Print the best mean cross-validated score found by the grid search
print(grid_search.best_score_)

In [None]:
# Print the parameter combination that gave the best cross-validated score in the grid search
print(grid_search.best_params_)

### 3.2.3 Random Forest using best params

In [None]:
# Train the random forest with the best hyperparameters found by the randomised search
# (the model was previously built with default settings, ignoring the tuning results).
# best_params_ only holds keys from the search space, so random_state is not overridden.
rf = RandomForestClassifier(random_state=100, **random_grid_search.best_params_)
rf.fit(X_train, y_train)
# Find the probability of the target to be 1
predict_proba = rf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, predict_proba)
# Plot the ROC curve to see which value of TPR and FPR will be a good option to choose
plt.plot(fpr, tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve")

From the above graph we can see that 

In [None]:
# Show how the true-positive rate varies with the decision threshold so a cut-off can be read off
plt.plot(thresholds, tpr)
plt.ylabel("TPR")
plt.xlabel("Thresholds")
plt.title("TPR vs Thresholds")

From the above graph, the value of threshold correspond to TPR =0.85 should be around 0.06 

In [None]:
# Turn the predicted probabilities into class labels with a fixed cut-off.
# The cut-off applied here is 0.07 (the accompanying notes quote roughly 0.06 from the plot).
decision_threshold = 0.07
# Labels for the training rows
y_train_pred = rf.predict_proba(X_train)[:, 1] > decision_threshold
# Labels for the test rows, reusing the probabilities computed for the ROC curve
y_pred = predict_proba > decision_threshold

### 3.2.4 Random Forest Model evaluation

In [None]:
# Recall and precision of the thresholded predictions: training split first, then test split
for split_name, y_true, y_hat in (("train", y_train, y_train_pred), ("test", y_test, y_pred)):
    print(f"The recall score for the {split_name} data is: ", metrics.recall_score(y_true, y_hat))
    print(f"The precision score for the {split_name} data is: ", metrics.precision_score(y_true, y_hat))

#### Proceed with the model which shows the best result 
- Apply the best hyperparameter on the model
- Predict on the test dataset

In [None]:
# Final model: a random forest built with the best hyperparameters from the randomised search.
# A tree-based estimator is used because the feature-importance cell that follows reads
# clf.feature_importances_. (The original template placeholders were not valid Python.)
clf = RandomForestClassifier(random_state=100, **random_grid_search.best_params_)
clf.fit(X_train, y_train)
# ROC AUC is the metric used for model selection throughout this notebook
print("The ROC AUC score for the test data is: ", metrics.roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

### Print the important features of the best model to understand the dataset
- This will not give much explanation on the already transformed dataset
- But it will help us in understanding if the dataset is not PCA transformed

In [None]:
var_imp = []
for i in clf.feature_importances_:
    var_imp.append(i)
print('Top var =', var_imp.index(np.sort(clf.feature_importances_)[-1])+1)
print('2nd Top var =', var_imp.index(np.sort(clf.feature_importances_)[-2])+1)
print('3rd Top var =', var_imp.index(np.sort(clf.feature_importances_)[-3])+1)

# Variable on Index-16 and Index-13 seems to be the top 2 variables
top_var_index = var_imp.index(np.sort(clf.feature_importances_)[-1])
second_top_var_index = var_imp.index(np.sort(clf.feature_importances_)[-2])

X_train_1 = X_train.to_numpy()[np.where(y_train==1.0)]
X_train_0 = X_train.to_numpy()[np.where(y_train==0.0)]

np.random.shuffle(X_train_0)

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [20, 20]

plt.scatter(X_train_1[:, top_var_index], X_train_1[:, second_top_var_index], label='Actual Class-1 Examples')
plt.scatter(X_train_0[:X_train_1.shape[0], top_var_index], X_train_0[:X_train_1.shape[0], second_top_var_index],
            label='Actual Class-0 Examples')
plt.legend()

## Model building with balancing Classes

##### Perform class balancing with :
- Random Oversampling
- SMOTE
- ADASYN

## Model Building
- Build different models on the balanced dataset and see the result

### Random Oversampling

In [None]:
from imblearn import over_sampling #- import the packages

In [None]:
# Balance the classes of the training split (X_train, y_train) with random oversampling
ro = over_sampling.RandomOverSampler(random_state=100)
resampled = ro.fit_resample(X_train, y_train)
X_train_ro, y_train_ro = resampled

In [None]:
# Class counts of the randomly oversampled labels and the percentage share of labels 0 and 1
classes = y_train_ro.value_counts()
n_labels = len(y_train_ro)
normal_share = classes[0] / n_labels * 100
fraud_share = classes[1] / n_labels * 100
# Side-by-side bar charts: absolute counts (left) and percentage shares (right)
bar_labels = ["normal_share", "fraud_share"]
fig, axs = plt.subplots(1, 2)
axs[0].bar(x=bar_labels, height=classes)
axs[0].set_title("Counts")
axs[1].bar(x=bar_labels, height=[normal_share, fraud_share])
axs[1].set_title("Percentage")

So our data is oversampled properly.

In [None]:
start = time.time()
# Search space for logistic regression: three solvers, L2 penalty and several values of C
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 0.1, 0.01]
params = {"solver": solvers, "penalty": penalty, "C": c_values}
# Shuffled, stratified 5-fold splitter and the base estimator
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
logistic_model = linear_model.LogisticRegression(random_state=100, max_iter=1000)
# Exhaustive search over params; candidates are ranked by ROC AUC and a failed fit scores 0.
# NOTE(review): the folds are drawn from already-oversampled data, so copies of a training
# row can land in the validation fold - the cross-validated scores may be optimistic.
grid_search = model_selection.GridSearchCV(
    estimator=logistic_model,
    param_grid=params,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_train_ro, y_train_ro)
end = time.time()
elapsed_minutes = round((end - start) / 60, 2)
print("Time taken to run this is: ", elapsed_minutes, " minutes")

In [None]:
# Show the estimator that achieved the best cross-validated score in the search
grid_result.best_estimator_

In [None]:
# Show the best mean cross-validated score found by the search
grid_result.best_score_

In [None]:
# Print the parameter combination that gave the best cross-validated score
print(grid_result.best_params_)

In [None]:
# Refit logistic regression on the randomly oversampled training data with hard-coded
# hyperparameters (presumably the grid-search optimum from an earlier run; verify against best_params_)
logistic_model = linear_model.LogisticRegression(
    penalty='l2', C=100, solver='newton-cg', max_iter=1000, random_state=100
)
logistic_model.fit(X_train_ro, y_train_ro)
# Second probability column, i.e. the score for the target being 1, for every test row
predict_proba = logistic_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, predict_proba)
# Draw the ROC curve to judge which TPR/FPR trade-off is workable
plt.plot(fpr, tpr)
plt.ylabel("TPR")
plt.xlabel("FPR")
plt.title("ROC Curve")

From the plot we can see that 0.85 will be a good value for TPR

In [None]:
# Show how the true-positive rate varies with the decision threshold so a cut-off can be read off
plt.plot(thresholds, tpr)
plt.ylabel("TPR")
plt.xlabel("Thresholds")
plt.title("TPR vs Thresholds")

From the above graph, the value of threshold correspond to TPR =0.85 should be around 0.06 

In [None]:
# Turn the predicted probabilities into class labels with a fixed cut-off of 0.999
decision_threshold = 0.999
# Labels for the (oversampled) training rows
y_train_pred = logistic_model.predict_proba(X_train_ro)[:, 1] > decision_threshold
# Labels for the test rows, reusing the probabilities computed for the ROC curve
y_pred = predict_proba > decision_threshold

In [None]:
# Recall and precision of the thresholded predictions: oversampled training data first, then test split
for split_name, y_true, y_hat in (("train", y_train_ro, y_train_pred), ("test", y_test, y_pred)):
    print(f"The recall score for the {split_name} data is: ", metrics.recall_score(y_true, y_hat))
    print(f"The precision score for the {split_name} data is: ", metrics.precision_score(y_true, y_hat))

## 3.2 Random Forest with Random OverSampling

In [None]:
# Import Random forest classifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Instantiate a random forest with default settings, purely to inspect its tunable parameters
rf = RandomForestClassifier()
# Pretty-print the estimator's parameter names and their current values
pprint(rf.get_params())

### 3.2.1 First try Random Grid Search to get a idea about the range of hyperparameters

In [None]:
# Candidate hyperparameter values for the random forest search.
# Number of trees in the forest: 10 evenly spaced values from 100 to 2000
n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=10)]
# Number of features to consider at every split. "auto" is deliberately left out: for a
# classifier it was only an alias of "sqrt", and scikit-learn >= 1.3 rejects it, which
# (with error_score=0 in the searches) would silently score every such candidate as 0.
max_features = ["sqrt", "log2"]
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None for unlimited depth
max_depth = [int(x) for x in np.linspace(10, 110, num=11)] + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 7, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
# NOTE(review): defined but not included in grid_params below
bootstrap = [True, False]
# Criterion for quality of a split
criterion = ["gini", "entropy"]

# Assemble the search space keyed by RandomForestClassifier parameter name
grid_params = {
    "n_estimators": n_estimators,
    "criterion": criterion,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "max_features": max_features,
}
pprint(grid_params)

In [None]:
start = time.time()
# Shuffled, stratified 5-fold splitter used for every sampled candidate
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
# Base estimator whose hyperparameters are being searched
rf = RandomForestClassifier(random_state=100)
# Randomised search: 20 candidates sampled from grid_params, ranked by ROC AUC; failed fits score 0
random_grid_search = model_selection.RandomizedSearchCV(
    estimator=rf,
    param_distributions=grid_params,
    n_iter=20,
    scoring='roc_auc',
    n_jobs=-1,
    cv=skf,
    random_state=100,
    error_score=0,
)
# Run the search on the randomly oversampled training data
random_grid_search.fit(X_train_ro, y_train_ro)

end = time.time()
elapsed_minutes = round((end - start) / 60, 2)
print("Time taken to run this is: ", elapsed_minutes, " minutes")

In [None]:
# Print the estimator that achieved the best cross-validated score in the randomised search
print(random_grid_search.best_estimator_)

In [None]:
# Print the best mean cross-validated score found by the randomised search
print(random_grid_search.best_score_)

In [None]:
# Print the sampled parameter combination that gave the best cross-validated score
print(random_grid_search.best_params_)

### 3.2.2 Grid Search to find the best hyperparameters

In [None]:
# Candidate hyperparameter values for the random forest search.
# Number of trees in the forest: 10 evenly spaced values from 100 to 2000
n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=10)]
# Number of features to consider at every split. "auto" is deliberately left out: for a
# classifier it was only an alias of "sqrt", and scikit-learn >= 1.3 rejects it, which
# (with error_score=0 in the searches) would silently score every such candidate as 0.
max_features = ["sqrt", "log2"]
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None for unlimited depth
max_depth = [int(x) for x in np.linspace(10, 110, num=11)] + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 7, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
# NOTE(review): defined but not included in grid_params below
bootstrap = [True, False]
# Criterion for quality of a split
criterion = ["gini", "entropy"]

# Assemble the search space keyed by RandomForestClassifier parameter name
grid_params = {
    "n_estimators": n_estimators,
    "criterion": criterion,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "max_features": max_features,
}
pprint(grid_params)

In [None]:
start = time.time()
# Create StratifiedKFold object
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
# Create RandomForestClassifier object
rf = RandomForestClassifier(random_state=100)
# Exhaustive grid search. GridSearchCV takes the candidate values as `param_grid` and has no
# `random_state` argument (both belong to RandomizedSearchCV); passing them raised a TypeError.
# NOTE(review): this evaluates the full Cartesian product of grid_params (thousands of
# candidates, each fitted once per fold) - consider narrowing the grid before running.
grid_search = model_selection.GridSearchCV(
    estimator=rf,
    param_grid=grid_params,
    scoring='roc_auc',
    n_jobs=-1,
    cv=skf,
    error_score=0,
)
# Fit the model on the randomly oversampled train data
grid_search.fit(X_train_ro, y_train_ro)

end = time.time()
print("Time taken to run this is: ", round((end-start)/60, 2), " minutes")

In [None]:
# Print the estimator that achieved the best cross-validated score in the grid search
print(grid_search.best_estimator_)

In [None]:
# Print the best mean cross-validated score found by the grid search
print(grid_search.best_score_)

In [None]:
# Print the parameter combination that gave the best cross-validated score in the grid search
print(grid_search.best_params_)

### 3.2.3 Random Forest using best params

In [None]:
# Train the random forest with the best hyperparameters found by the randomised search
# (the model was previously built with default settings, ignoring the tuning results).
# best_params_ only holds keys from the search space, so random_state is not overridden.
rf = RandomForestClassifier(random_state=100, **random_grid_search.best_params_)
rf.fit(X_train_ro, y_train_ro)
# Find the probability of the target to be 1
predict_proba = rf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, predict_proba)
# Plot the ROC curve to see which value of TPR and FPR will be a good option to choose
plt.plot(fpr, tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve")

From the above graph we can see that 

In [None]:
# Show how the true-positive rate varies with the decision threshold so a cut-off can be read off
plt.plot(thresholds, tpr)
plt.ylabel("TPR")
plt.xlabel("Thresholds")
plt.title("TPR vs Thresholds")

From the above graph, the value of threshold correspond to TPR =0.85 should be around 0.06 

In [None]:
# Turn the predicted probabilities into class labels with a fixed cut-off.
# The cut-off applied here is 0.07 (the accompanying notes quote roughly 0.06 from the plot).
decision_threshold = 0.07
# Labels for the (oversampled) training rows
y_train_pred = rf.predict_proba(X_train_ro)[:, 1] > decision_threshold
# Labels for the test rows, reusing the probabilities computed for the ROC curve
y_pred = predict_proba > decision_threshold

### 3.2.4 Random Forest Model evaluation

In [None]:
# Recall and precision of the thresholded predictions: oversampled training data first, then test split
for split_name, y_true, y_hat in (("train", y_train_ro, y_train_pred), ("test", y_test, y_pred)):
    print(f"The recall score for the {split_name} data is: ", metrics.recall_score(y_true, y_hat))
    print(f"The precision score for the {split_name} data is: ", metrics.precision_score(y_true, y_hat))

### Similarly explore other algorithms on balanced dataset by building models like:
- KNN
- SVM
- Decision Tree
- Random Forest
- XGBoost

### SMOTE

### Print the class distribution after applying SMOTE 

In [None]:
import warnings
# Suppress all warnings from this point on
warnings.filterwarnings("ignore")

# Oversample the minority class of the training split with SMOTE
sm = over_sampling.SMOTE(random_state=0)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)
# Rows beyond the original training length are taken to be the synthetic minority samples
# (assumes the resampler appends generated rows after the original ones - TODO confirm)
n_original = X_train.shape[0]
X_train_smote_1 = X_train_smote[n_original:].to_numpy()

# Original training rows split by class label
train_values = X_train.to_numpy()
X_train_1 = train_values[np.where(y_train == 1.0)]
X_train_0 = train_values[np.where(y_train == 0.0)]
n_fraud = X_train_1.shape[0]

fig = plt.figure(figsize=[20, 20])

# Panel 1: actual class-1 rows, first feature against second feature
plt.subplot(3, 1, 1)
plt.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
plt.legend()

# Panel 2: actual class-1 rows overlaid with an equal number of synthetic rows
plt.subplot(3, 1, 2)
plt.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
plt.scatter(X_train_smote_1[:n_fraud, 0], X_train_smote_1[:n_fraud, 1],
            label='Artificial SMOTE Class-1 Examples')
plt.legend()

# Panel 3: actual class-1 rows overlaid with an equal number of actual class-0 rows
plt.subplot(3, 1, 3)
plt.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
plt.scatter(X_train_0[:n_fraud, 0], X_train_0[:n_fraud, 1], label='Actual Class-0 Examples')
plt.legend()

In [None]:
# Class counts of the SMOTE-resampled labels and the percentage share of labels 0 and 1
classes = y_train_smote.value_counts()
n_labels = len(y_train_smote)
normal_share = classes[0] / n_labels * 100
fraud_share = classes[1] / n_labels * 100
# Side-by-side bar charts: absolute counts (left) and percentage shares (right)
bar_labels = ["normal_share", "fraud_share"]
fig, axs = plt.subplots(1, 2)
axs[0].bar(x=bar_labels, height=classes)
axs[0].set_title("Counts")
axs[1].bar(x=bar_labels, height=[normal_share, fraud_share])
axs[1].set_title("Percentage")

### Logistic regression with SMOTE

In [None]:
start = time.time()
# Search space for logistic regression: three solvers, L2 penalty and several values of C
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
params = {"solver": solvers, "penalty": penalty, "C": c_values}
# Shuffled, stratified 5-fold splitter and the base estimator
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
logistic_model = linear_model.LogisticRegression(random_state=100, max_iter=1000)
# Exhaustive search over params; candidates are ranked by ROC AUC and a failed fit scores 0.
# NOTE(review): the folds are drawn from already-oversampled data, so points synthesised
# from a training row can land in the validation fold - the CV scores may be optimistic.
grid_search = model_selection.GridSearchCV(
    estimator=logistic_model,
    param_grid=params,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_train_smote, y_train_smote)
end = time.time()
elapsed_minutes = round((end - start) / 60, 2)
print("Time taken to run this is: ", elapsed_minutes, " minutes")

In [None]:
# Show the estimator that achieved the best cross-validated score in the search
grid_result.best_estimator_

In [None]:
# Show the best mean cross-validated score found by the search
grid_result.best_score_

In [None]:
# Print the parameter combination that gave the best cross-validated score
print(grid_result.best_params_)

In [None]:
# Refit logistic regression on the SMOTE-resampled training data with hard-coded
# hyperparameters (presumably the grid-search optimum from an earlier run; verify against best_params_)
logistic_model = linear_model.LogisticRegression(
    penalty='l2', C=100, solver='newton-cg', max_iter=1000, random_state=100
)
logistic_model.fit(X_train_smote, y_train_smote)
# Second probability column, i.e. the score for the target being 1, for every test row
predict_proba = logistic_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, predict_proba)
# Draw the ROC curve to judge which TPR/FPR trade-off is workable
plt.plot(fpr, tpr)
plt.ylabel("TPR")
plt.xlabel("FPR")
plt.title("ROC Curve")

From the plot we can see that 0.85 will be a good value for TPR

In [None]:
# Show how the true-positive rate varies with the decision threshold so a cut-off can be read off
plt.plot(thresholds, tpr)
plt.ylabel("TPR")
plt.xlabel("Thresholds")
plt.title("TPR vs Thresholds")

From the above graph, the value of threshold correspond to TPR =0.85 should be around 0.06 

In [None]:
# Turn the predicted probabilities into class labels with a fixed cut-off of 0.999
decision_threshold = 0.999
# Labels for the (SMOTE-resampled) training rows
y_train_pred = logistic_model.predict_proba(X_train_smote)[:, 1] > decision_threshold
# Labels for the test rows, reusing the probabilities computed for the ROC curve
y_pred = predict_proba > decision_threshold

In [None]:
# Recall and precision of the thresholded predictions: SMOTE training data first, then test split
for split_name, y_true, y_hat in (("train", y_train_smote, y_train_pred), ("test", y_test, y_pred)):
    print(f"The recall score for the {split_name} data is: ", metrics.recall_score(y_true, y_hat))
    print(f"The precision score for the {split_name} data is: ", metrics.precision_score(y_true, y_hat))

## 3.2 Random Forest with SMOTE

In [None]:
# Import Random forest classifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Instantiate a random forest with default settings, purely to inspect its tunable parameters
rf = RandomForestClassifier()
# Pretty-print the estimator's parameter names and their current values
pprint(rf.get_params())

### 3.2.1 First try Random Grid Search to get a idea about the range of hyperparameters

In [None]:
# Candidate hyperparameter values for the random forest search.
# Number of trees in the forest: 10 evenly spaced values from 100 to 2000
n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=10)]
# Number of features to consider at every split. "auto" is deliberately left out: for a
# classifier it was only an alias of "sqrt", and scikit-learn >= 1.3 rejects it, which
# (with error_score=0 in the searches) would silently score every such candidate as 0.
max_features = ["sqrt", "log2"]
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None for unlimited depth
max_depth = [int(x) for x in np.linspace(10, 110, num=11)] + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 7, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
# NOTE(review): defined but not included in grid_params below
bootstrap = [True, False]
# Criterion for quality of a split
criterion = ["gini", "entropy"]

# Assemble the search space keyed by RandomForestClassifier parameter name
grid_params = {
    "n_estimators": n_estimators,
    "criterion": criterion,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "max_features": max_features,
}
pprint(grid_params)

In [None]:
start = time.time()
# Shuffled, stratified 5-fold splitter used for every sampled candidate
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
# Base estimator whose hyperparameters are being searched
rf = RandomForestClassifier(random_state=100)
# Randomised search: 20 candidates sampled from grid_params, ranked by ROC AUC; failed fits score 0
random_grid_search = model_selection.RandomizedSearchCV(
    estimator=rf,
    param_distributions=grid_params,
    n_iter=20,
    scoring='roc_auc',
    n_jobs=-1,
    cv=skf,
    random_state=100,
    error_score=0,
)
# Run the search on the SMOTE-resampled training data
random_grid_search.fit(X_train_smote, y_train_smote)

end = time.time()
elapsed_minutes = round((end - start) / 60, 2)
print("Time taken to run this is: ", elapsed_minutes, " minutes")

In [None]:
# Print the estimator that achieved the best cross-validated score in the randomised search
print(random_grid_search.best_estimator_)

In [None]:
# Print the best mean cross-validated score found by the randomised search
print(random_grid_search.best_score_)

In [None]:
# Print the sampled parameter combination that gave the best cross-validated score
print(random_grid_search.best_params_)

### 3.2.2 Grid Search to find the best hyperparameters

In [None]:
# Candidate hyperparameter values for the random forest search.
# Number of trees in the forest: 10 evenly spaced values from 100 to 2000
n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=10)]
# Number of features to consider at every split. "auto" is deliberately left out: for a
# classifier it was only an alias of "sqrt", and scikit-learn >= 1.3 rejects it, which
# (with error_score=0 in the searches) would silently score every such candidate as 0.
max_features = ["sqrt", "log2"]
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None for unlimited depth
max_depth = [int(x) for x in np.linspace(10, 110, num=11)] + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 7, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
# NOTE(review): defined but not included in grid_params below
bootstrap = [True, False]
# Criterion for quality of a split
criterion = ["gini", "entropy"]

# Assemble the search space keyed by RandomForestClassifier parameter name
grid_params = {
    "n_estimators": n_estimators,
    "criterion": criterion,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "max_features": max_features,
}
pprint(grid_params)

In [None]:
start = time.time()
# Create StratifiedKFold object
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
# Create RandomForestClassifier object
rf = RandomForestClassifier(random_state=100)
# Exhaustive grid search. GridSearchCV takes the candidate values as `param_grid` and has no
# `random_state` argument (both belong to RandomizedSearchCV); passing them raised a TypeError.
# NOTE(review): this evaluates the full Cartesian product of grid_params (thousands of
# candidates, each fitted once per fold) - consider narrowing the grid before running.
grid_search = model_selection.GridSearchCV(
    estimator=rf,
    param_grid=grid_params,
    scoring='roc_auc',
    n_jobs=-1,
    cv=skf,
    error_score=0,
)
# Fit the model on the SMOTE-resampled train data
grid_search.fit(X_train_smote, y_train_smote)

end = time.time()
print("Time taken to run this is: ", round((end-start)/60, 2), " minutes")

In [None]:
# Print the estimator that achieved the best cross-validated score in the grid search
print(grid_search.best_estimator_)

In [None]:
# Print the best mean cross-validated score found by the grid search
print(grid_search.best_score_)

In [None]:
# Print the parameter combination that gave the best cross-validated score in the grid search
print(grid_search.best_params_)

### 3.2.3 Random Forest using best params

In [None]:
# Train the random forest with the best hyperparameters found by the randomised search
# (the model was previously built with default settings, ignoring the tuning results).
# best_params_ only holds keys from the search space, so random_state is not overridden.
rf = RandomForestClassifier(random_state=100, **random_grid_search.best_params_)
rf.fit(X_train_smote, y_train_smote)
# Find the probability of the target to be 1
predict_proba = rf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, predict_proba)
# Plot the ROC curve to see which value of TPR and FPR will be a good option to choose
plt.plot(fpr, tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve")

From the above graph we can see that 

In [None]:
# Show how the true-positive rate varies with the decision threshold so a cut-off can be read off
plt.plot(thresholds, tpr)
plt.ylabel("TPR")
plt.xlabel("Thresholds")
plt.title("TPR vs Thresholds")

From the above graph, the value of threshold correspond to TPR =0.85 should be around 0.06 

In [None]:
# Turn the predicted probabilities into class labels with a fixed cut-off.
# The cut-off applied here is 0.07 (the accompanying notes quote roughly 0.06 from the plot).
decision_threshold = 0.07
# Labels for the (SMOTE-resampled) training rows
y_train_pred = rf.predict_proba(X_train_smote)[:, 1] > decision_threshold
# Labels for the test rows, reusing the probabilities computed for the ROC curve
y_pred = predict_proba > decision_threshold

### 3.2.4 Random Forest Model evaluation

In [None]:
# Recall and precision of the thresholded predictions: SMOTE training data first, then test split
for split_name, y_true, y_hat in (("train", y_train_smote, y_train_pred), ("test", y_test, y_pred)):
    print(f"The recall score for the {split_name} data is: ", metrics.recall_score(y_true, y_hat))
    print(f"The precision score for the {split_name} data is: ", metrics.precision_score(y_true, y_hat))

##### Build models with other algorithms to see which performs best on the SMOTE-resampled data

### ADASYN

### Print the class distribution after applying ADASYN

In [None]:
import warnings
warnings.filterwarnings("ignore")

from imblearn import over_sampling

# Oversample the training data with ADASYN (fixed seed)
ada = over_sampling.ADASYN(random_state=0)
X_train_adasyn, y_train_adasyn = ada.fit_resample(X_train, y_train)
# The rows after the first len(X_train) rows of the resampled frame are taken
# here as the artificial minority samples produced by ADASYN
X_train_adasyn_1 = X_train_adasyn.iloc[len(X_train):].to_numpy()

# Split the original training rows by class label
X_train_1 = X_train.to_numpy()[(y_train == 1.0).to_numpy()]
X_train_0 = X_train.to_numpy()[(y_train == 0.0).to_numpy()]

fig = plt.figure(figsize=[20, 20])

# Panel 1: actual class-1 rows, first two feature columns
plt.subplot(3, 1, 1)
plt.scatter(*X_train_1[:, :2].T, label='Actual Class-1 Examples')
plt.legend()

# Panel 2: actual class-1 rows against an equal number of ADASYN rows
plt.subplot(3, 1, 2)
plt.scatter(*X_train_1[:, :2].T, label='Actual Class-1 Examples')
plt.scatter(*X_train_adasyn_1[:len(X_train_1), :2].T,
            label='Artificial ADASYN Class-1 Examples')
plt.legend()

# Panel 3: actual class-1 rows against an equal number of actual class-0 rows
plt.subplot(3, 1, 3)
plt.scatter(*X_train_1[:, :2].T, label='Actual Class-1 Examples')
plt.scatter(*X_train_0[:len(X_train_1), :2].T, label='Actual Class-0 Examples')
plt.legend()

In [None]:
# Class distribution of the ADASYN-oversampled training labels
classes = y_train_adasyn.value_counts()
# value_counts() sorts by frequency, and after oversampling the two classes are
# nearly tied, so either label can come first. Look each count up by its label
# so the bars always match their "normal"/"fraud" captions.
normal_count = classes.loc[0]
fraud_count = classes.loc[1]
normal_share = normal_count / len(y_train_adasyn) * 100
fraud_share = fraud_count / len(y_train_adasyn) * 100
# Bar plots of the number and percentage of fraudulent vs non-fraudulent transactions
fig, axs = plt.subplots(1, 2)
axs[0].bar(x=["normal_share", "fraud_share"], height=[normal_count, fraud_count])
axs[0].set_title("Counts")
axs[1].bar(x=["normal_share", "fraud_share"], height=[normal_share, fraud_share])
axs[1].set_title("Percentage")

### Logistic regression with ADASYN

In [None]:
start = time.time()
# Hyperparameter search space for logistic regression
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
params = {"solver": solvers, "penalty": penalty, "C": c_values}
# 5-fold stratified cross-validation, shuffled with a fixed seed
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
logistic_model = linear_model.LogisticRegression(random_state=100, max_iter=1000)
# Exhaustive search scored by ROC AUC; a failing fit scores 0 instead of raising
grid_search = model_selection.GridSearchCV(
    estimator=logistic_model,
    param_grid=params,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
    error_score=0,
)
grid_search.fit(X_train_adasyn, y_train_adasyn)
# fit() returns the search object itself, so both names refer to the fitted search
grid_result = grid_search
end = time.time()
print("Time taken to run this is: ", round((end-start)/60, 2), " minutes")

In [None]:
# Display the best logistic-regression estimator found by the ADASYN grid search
grid_result.best_estimator_

In [None]:
# Display the best score of the grid search (the search was created with scoring='roc_auc')
grid_result.best_score_

In [None]:
# Print the best hyperparameter combination found by the logistic-regression grid search
print(grid_result.best_params_)

In [None]:
# Use the winner of the grid search instead of hard-coding C and solver, so this
# cell cannot go stale when the search result changes. GridSearchCV refits the best
# configuration on the full ADASYN training data (refit=True is its default), so
# best_estimator_ is already trained and keeps random_state=100 / max_iter=1000
# from the base estimator that was searched.
logistic_model = grid_result.best_estimator_
# Probability of the positive class (column 1 of predict_proba) for each test row
predict_proba = logistic_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, predict_proba)
# Plot the ROC curve to see which TPR/FPR trade-off is a good operating point
plt.plot(fpr, tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve")

From the plot we can see that 0.85 will be a good value for TPR

In [None]:
# Plot TPR against the decision threshold to choose a cut-off.
# roc_curve puts a sentinel at thresholds[0] (np.inf, or max score + 1 on older
# scikit-learn) meaning "no row predicted positive". It is not a real probability,
# so skip it to keep the x-axis inside the range of the model's scores.
plt.plot(thresholds[1:], tpr[1:])
plt.xlabel("Thresholds")
plt.ylabel("TPR")
plt.title("TPR vs Thresholds")

From the above graph, the threshold corresponding to TPR = 0.85 should be around 0.999, which is the cutoff applied in the next cell.

In [None]:
# Cut-off applied to the predicted fraud probability: rows scoring above it are
# labelled as fraud. It is defined once so the train and test predictions always
# use the same value (the old comment said 0.06 while the code used 0.999).
threshold = 0.999
# Prediction for train data
y_train_pred = logistic_model.predict_proba(X_train_adasyn)[:, 1] > threshold
# Prediction for test data (reuses the test probabilities computed above)
y_pred = predict_proba > threshold

In [None]:
# Report recall and precision, first on the ADASYN training data and then on the test data.
for _message, _metric, _actual, _predicted in (
    ("The recall score for the train data is: ", metrics.recall_score, y_train_adasyn, y_train_pred),
    ("The precision score for the train data is: ", metrics.precision_score, y_train_adasyn, y_train_pred),
    ("The recall score for the test data is: ", metrics.recall_score, y_test, y_pred),
    ("The precision score for the test data is: ", metrics.precision_score, y_test, y_pred),
):
    print(_message, _metric(_actual, _predicted))

## 3.2 Random Forest with ADASYN

In [None]:
# Import Random forest classifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Create a RandomForestClassifier without passing any arguments
rf = RandomForestClassifier()
# Pretty-print the estimator's hyperparameter names and their current values
pprint(rf.get_params())

### 3.2.1 First try Random Grid Search to get an idea about the range of hyperparameters

In [None]:
# Search space sampled by the randomized hyperparameter search for the random forest.

# Number of trees in the forest: 10 evenly spaced values from 100 to 2000
n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=10)]
# Number of features to consider at every split.
# "auto" is left out on purpose: for classifiers it was an alias of "sqrt", it was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it makes the fit fail
# (and, with error_score=0, silently wastes a sampled candidate).
max_features = ["sqrt", "log2"]
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None for unlimited depth
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 7, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample or on the full training set
bootstrap = [True, False]
# Criterion for the quality of a split
criterion = ["gini", "entropy"]

# Parameter grid; `bootstrap` used to be defined but never added, so it was never searched
grid_params = {
    "n_estimators": n_estimators,
    "criterion": criterion,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "max_features": max_features,
    "bootstrap": bootstrap,
}
pprint(grid_params)

In [None]:
start = time.time()
# 5-fold stratified cross-validation, shuffled with a fixed seed
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
# Base estimator whose hyperparameters are searched
rf = RandomForestClassifier(random_state=100)
# Randomized search: 20 sampled candidates scored by ROC AUC; a failing fit scores 0
random_grid_search = model_selection.RandomizedSearchCV(
    estimator=rf,
    param_distributions=grid_params,
    n_iter=20,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
    random_state=100,
    error_score=0,
)
# Run the search on the ADASYN training data
random_grid_search.fit(X_train_adasyn, y_train_adasyn)

end = time.time()
print("Time taken to run this is: ", round((end-start)/60, 2), " minutes")

In [None]:
# Show the best estimator found by the randomized search
print(random_grid_search.best_estimator_)

In [None]:
# Show the best score of the randomized search (it was created with scoring='roc_auc')
print(random_grid_search.best_score_)

In [None]:
# Show the best hyperparameter combination sampled by the randomized search
print(random_grid_search.best_params_)

### 3.2.2 Grid Search to find the best hyperparameters

In [None]:
# Search space for the exhaustive grid search over the random forest.
# NOTE(review): every combination below is fitted once per CV fold, so this grid is
# very expensive as written — consider narrowing it around the randomized-search result.

# Number of trees in the forest: 10 evenly spaced values from 100 to 2000
n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=10)]
# Number of features to consider at every split.
# "auto" is left out on purpose: for classifiers it was an alias of "sqrt" (so it only
# duplicated those fits), it was deprecated in scikit-learn 1.1 and removed in 1.3.
max_features = ["sqrt", "log2"]
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None for unlimited depth
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 7, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
# NOTE(review): defined but not part of grid_params below; adding it doubles the grid
bootstrap = [True, False]
# Criterion for the quality of a split
criterion = ["gini", "entropy"]

# Parameter grid handed to the search
grid_params = {
    "n_estimators": n_estimators,
    "criterion": criterion,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "max_features": max_features,
}
pprint(grid_params)

In [None]:
start = time.time()
# 5-fold stratified cross-validation, shuffled with a fixed seed
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
# Base estimator whose hyperparameters are searched
rf = RandomForestClassifier(random_state=100)
# Exhaustive grid search scored by ROC AUC; a failing fit scores 0 instead of raising.
# GridSearchCV takes its search space as `param_grid` and has no `random_state`
# (nothing is sampled); the RandomizedSearchCV keywords used before raised TypeError.
grid_search = model_selection.GridSearchCV(
    estimator=rf,
    param_grid=grid_params,
    scoring='roc_auc',
    n_jobs=-1,
    cv=skf,
    error_score=0,
)
# Run the search on the ADASYN training data
grid_search.fit(X_train_adasyn, y_train_adasyn)

end = time.time()
print("Time taken to run this is: ", round((end-start)/60, 2), " minutes")

In [None]:
# Show the estimator stored as the best one on the grid_search object
print(grid_search.best_estimator_)

In [None]:
# Show the best score recorded by grid_search
print(grid_search.best_score_)

In [None]:
# Show the best hyperparameter combination stored on grid_search
# (this reads from the grid_search object, not from random_grid_search)
print(grid_search.best_params_)

### 3.2.3 Random Forest using best params

In [None]:
# Use tuned hyperparameters instead of a default forest: the previous code built
# RandomForestClassifier(random_state=100) and ignored every search result.
# The randomized search refits its best candidate on the full ADASYN training data
# (refit=True is the default), so best_estimator_ is already trained.
# Swap in grid_search.best_estimator_ here once the exhaustive random-forest
# search in section 3.2.2 has been run to completion.
rf = random_grid_search.best_estimator_
# Probability of the positive class (column 1 of predict_proba) for each test row
predict_proba = rf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, predict_proba)
# Plot the ROC curve to see which TPR/FPR trade-off is a good operating point
plt.plot(fpr, tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve")

From the above graph we can see that 0.85 will be a good value for TPR.

In [None]:
# Plot TPR against the decision threshold to choose a cut-off.
# roc_curve puts a sentinel at thresholds[0] (np.inf, or max score + 1 on older
# scikit-learn) meaning "no row predicted positive". It is not a real probability,
# so skip it to keep the x-axis inside the range of the model's scores.
plt.plot(thresholds[1:], tpr[1:])
plt.xlabel("Thresholds")
plt.ylabel("TPR")
plt.title("TPR vs Thresholds")

From the above graph, the threshold corresponding to TPR = 0.85 should be around 0.06; the next cell applies a cutoff of 0.07.

In [None]:
# Cut-off applied to the predicted fraud probability: rows scoring above it are
# labelled as fraud. It is defined once so the train and test predictions always
# use the same value (the old comment said 0.06 while the code used 0.07).
threshold = 0.07
# Prediction for train data
y_train_pred = rf.predict_proba(X_train_adasyn)[:, 1] > threshold
# Prediction for test data (reuses the test probabilities computed above)
y_pred = predict_proba > threshold

### 3.2.4 Random Forest Model evaluation

In [None]:
# Report recall and precision, first on the ADASYN training data and then on the test data.
for _message, _metric, _actual, _predicted in (
    ("The recall score for the train data is: ", metrics.recall_score, y_train_adasyn, y_train_pred),
    ("The precision score for the train data is: ", metrics.precision_score, y_train_adasyn, y_train_pred),
    ("The recall score for the test data is: ", metrics.recall_score, y_test, y_pred),
    ("The precision score for the test data is: ", metrics.precision_score, y_test, y_pred),
):
    print(_message, _metric(_actual, _predicted))

In [None]:
#perform cross validation & then balance classes on X_train_cv & y_train_cv using ADASYN

#perform hyperparameter tuning

#print the evaluation result by choosing a evaluation metric

#print the optimum value of hyperparameters


##### Build models with other algorithms to see which performs better on ADASYN

### Select the oversampling method which shows the best result on a model
- Apply the best hyperparameter on the model
- Predict on the test dataset

In [None]:
# perform the best oversampling method on X_train & y_train

# NOTE(review): the three statements below are unfilled template placeholders
# (`___`, an empty fit() call and a trailing `-->`), so this cell does not run as
# written. The cells that follow read `clf`, so it must be assigned a fitted model here.
clf = ___  #initialise the model with optimum hyperparameters
clf.fit( ) # fit on the balanced dataset
print() --> #print the evaluation score on the X_test by choosing the best evaluation metric

### Print the important features of the best model to understand the dataset

In [None]:
var_imp = []
for i in clf.feature_importances_:
    var_imp.append(i)
print('Top var =', var_imp.index(np.sort(clf.feature_importances_)[-1])+1)
print('2nd Top var =', var_imp.index(np.sort(clf.feature_importances_)[-2])+1)
print('3rd Top var =', var_imp.index(np.sort(clf.feature_importances_)[-3])+1)

# Variable on Index-13 and Index-9 seems to be the top 2 variables
top_var_index = var_imp.index(np.sort(clf.feature_importances_)[-1])
second_top_var_index = var_imp.index(np.sort(clf.feature_importances_)[-2])

X_train_1 = X_train.to_numpy()[np.where(y_train==1.0)]
X_train_0 = X_train.to_numpy()[np.where(y_train==0.0)]

np.random.shuffle(X_train_0)

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [20, 20]

plt.scatter(X_train_1[:, top_var_index], X_train_1[:, second_top_var_index], label='Actual Class-1 Examples')
plt.scatter(X_train_0[:X_train_1.shape[0], top_var_index], X_train_0[:X_train_1.shape[0], second_top_var_index],
            label='Actual Class-0 Examples')
plt.legend()

In [None]:
#### Print the FPR,TPR & select the best threshold from the roc curve

In [None]:
# Score the training rows with the selected model; column 1 of predict_proba is the
# probability of the positive (fraud) class.
# NOTE(review): the template placeholders are filled with the original training
# split (X_train / y_train); switch to the oversampled arrays if the AUC should be
# reported on the balanced data instead.
train_scores = clf.predict_proba(X_train)[:, 1]
print('Train auc =', metrics.roc_auc_score(y_train, train_scores))
fpr, tpr, thresholds = metrics.roc_curve(y_train, train_scores)
# Choose the threshold with the largest gap between TPR and FPR (Youden's J statistic)
threshold = thresholds[np.argmax(tpr - fpr)]
print(threshold)