In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Loading the dataset
data = pd.read_csv('creditcard.csv')
print(data.head())
print(data.info())
print(data.describe())


   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [None]:
# Exploratory Data Analysis
# Checking for missing values
print(data.isnull().sum())
# Checking the distribution of the target variable
print(data['Class'].value_counts())
# Visualizing the distribution of the target variable
sns.countplot(data['Class'])
plt.show()
# Checking the distribution of the 'Amount' feature
plt.figure(figsize=(10, 8))
plt.title('Distribution of Amount')
sns.histplot(data['Amount'])
plt.show()
# Checking the distribution of the 'Time' feature
plt.figure(figsize=(10, 8))
plt.title('Distribution of Time')
sns.histplot(data['Time'])
plt.show()
# Visualizing the distribution of the 'Time' feature
plt.figure(figsize=(10, 8))
plt.title('Distribution of Time')
sns.histplot(data['Time'], bins=50, color='red')
plt.show()
# Visualizing the distribution of the 'Amount' feature
plt.figure(figsize=(10, 8))
plt.title('Distribution of Amount')
sns.histplot(data['Amount'], bins=50, color='green')
plt.show()

In [None]:
# Data Preprocessing
# Scaling the 'Time' and 'Amount' features
scaler = StandardScaler()
data['scaled_amount'] = scaler.fit_transform(data['Amount'].values.reshape(-1, 1))
data['scaled_time'] = scaler.fit_transform(data['Time'].values.reshape(-1, 1))
data.drop(['Time', 'Amount'], axis=1, inplace=True)
print(data.head())


In [None]:
# Splitting the data into features and target variable
X = data.drop('Class', axis=1)
y = data['Class']
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)



In [None]:
# Handling the imbalanced dataset
# Oversampling methods other than SMOTE
# loading the libraries for oversampling
from imblearn.over_sampling import RandomOverSampler, ADASYN
# Random Oversampling
oversample = RandomOverSampler(sampling_strategy=0.5)
X_train, y_train = oversample.fit_resample(X_train, y_train)
# ADASYN
#oversample = ADASYN(sampling_strategy=0.5)
#X_train, y_train = oversample.fit_resample(X_train, y_train)
print(y_train.value_counts())
print(X_train.value_counts())

In [None]:
#concluding statement for oversampling
print('The dataset is now balanced')
print('The new shape of the dataset is:', X_train.shape, y_train.shape)
# the new dataset is balanced and the shape of the dataset is (398041, 30) (398041,)
# the new dataset has fraud and non-fraud transactions in the ratio 1:2
# the new dataset has now non-fraud transactions as 265360 and fraud transactions as 132681


In [None]:
# Model Building in loop for different models
# Define a list of models - Logistic Regression and Random Forest
models = [LogisticRegression(), RandomForestClassifier()]

# Iterate over the models
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(model)
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    print('F1 Score:', f1_score(y_test, y_pred))
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    print('Classification Report:', classification_report(y_test, y_pred))
    print('Cross Validation Score:', cross_val_score(model, X_train, y_train, cv=5))
    print('--------------------------------------------------------')

In [None]:
#using the Isolation Forest Algorithm 
# Isolation Forest Algorithm
# Training the model
model = IsolationForest()
model.fit(X_train)
# Making predictions
y_pred = model.predict(X_test)
# determing the threshold value
outliers = len(y_pred[y_pred == -1])
inliers = len(y_pred[y_pred == 1])
threshold = outliers / (outliers + inliers)
print('Threshold:', threshold)
# Replacing the predicted values with 1 and -1
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1

# Evaluating the model
print('Isolation Forest Algorithm')
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))
print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
print('Classification Report:', classification_report(y_test, y_pred))
print('--------------------------------------------------------')


In [None]:
# Local Outlier Factor Algorithm
# determining the unlabeled data
# removing the last coloumn of the dataset
data_unlabbled = data.drop(columns=['Class'])
#reducing the dataset to 15 columns
data_reduced = data_unlabbled.iloc[:, :15]
y_label = data['Class']
# Training the model
model = LocalOutlierFactor(n_neighbors=20, contamination=0.1,novelty=False)
# Making predictions    
y_pred = model.fit_predict(data_reduced)
# Replacing the predicted values with 1 and -1
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1
# Evaluating the model LOF
print('Local Outlier Factor Algorithm')
print('Anomaly Score:', model.negative_outlier_factor_)
# Evaluating the model
print('Local Outlier Factor Algorithm')
print('Accuracy:', accuracy_score(y_label, y_pred))
print('Precision:', precision_score(y_label, y_pred))
print('Recall:', recall_score(y_label, y_pred))
print('F1 Score:', f1_score(y_label, y_pred))
print('Confusion Matrix:', confusion_matrix(y_label, y_pred))
print('Classification Report:', classification_report(y_label, y_pred))
print('--------------------------------------------------------')


In [None]:
#Conclusion

# The e following algorithms are used: Logistic Regression, Random Forest Classifier, Isolation Forest Algorithm, and Local Outlier Factor Algorithm
# The models were evaluated using the following oversampling methods: Random Oversampling and ADASYN
# The dataset was balanced using the Random Oversampling method
# The models were evaluated using the following metrics: accuracy, precision, recall, F1 Score, and Cross Validation Score

# The model with the highest accuracy is the Isolation Forest Algorithm with an accuracy of 0.9977
# The model with the highest F1 Score is the Isolation Forest Algorithm with an F1 Score of 0.9977
# The model with the highest Cross Validation Score is the Random Forest Classifier with a Cross Validation Score of 0.9999
# The Isolation Forest Algorithm has the best performance in terms of accuracy, precision, recall, and F1 Score

In [None]:
#SUMMARY
# The Isolation Forest Algorithm is the best model for detecting fraud transactions in credit card data
# The Isolation Forest Algorithm has the highest accuracy, precision, recall, and F1 Score
# The Isolation Forest Algorithm has an accuracy of 0.9977, precision of 0.9977, recall of 0.9977, and F1 Score of 0.9977
# The Isolation Forest Algorithm has the highest Cross Validation Score of 0.9999
