In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
#first Data preprocessing phase is to remove the special characters , cleanining the nulls
df= pd.read_csv('fraud.csv')
df = df.map(lambda x: x.replace("'", "") if isinstance(x, str) else x) # removing the ' form the data set for better data types importing
df['gender'].fillna(df['gender'].mode()[0], inplace=True) # fill the gender nulls with majority.
df.loc[df['gender'] == 'E', 'age'] = '7'     # replace the nulls in age with 7 (new Category) for the E gender as all nulls in gender are related to E gender.
df['age']=df['age'].astype(int) # to convert the dtype of age to integers after removing all ' from DS

In [3]:
#drop the not needed features from the dataset
df.drop(columns=['zipcodeOri','zipMerchant'], inplace=True)

In [4]:
#split the data for test data and validation data so we can test the model perfromance on unseen data set 
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
# Encode categorical variables for the training data set 

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# Encode categorical variables
label_encoder = LabelEncoder()
train_df['category'] = label_encoder.fit_transform(train_df['category'])
train_df['gender'] = label_encoder.fit_transform(train_df['gender'])
train_df['customer'] = label_encoder.fit_transform(train_df['customer'])
train_df['merchant'] = label_encoder.fit_transform(train_df['merchant'])

In [6]:
#min max scaler for data normalization for better model robustness and perfromacne
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(train_df)
normalized_df_tr = pd.DataFrame(normalized_data, columns=train_df.columns)

In [7]:
#identigy the categorical coloumns for smote prepration below
for idx, col in enumerate(normalized_df_tr.columns):
    print(f"Column '{col}' has index {idx}")

Column 'step' has index 0
Column 'customer' has index 1
Column 'age' has index 2
Column 'gender' has index 3
Column 'merchant' has index 4
Column 'category' has index 5
Column 'amount' has index 6
Column 'fraud' has index 7


In [8]:
# Apply SMOTEENN to the dataset with specified parameters to oversample the minority classes and undersample the majorairy classes

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC

X = normalized_df_tr.drop('fraud', axis=1)  # Features
y = normalized_df_tr['fraud']  # Target variable

sampling_strategy = 'auto'
print("SMOTENN Sampling Strategy = auro")
categorical_features = [1, 2, 3, 4, 5]

smote = SMOTENC(categorical_features=categorical_features, random_state=42, n_jobs=-1)  # Set n_jobs to -1 for maximum parallelization if supported (trial on my machine)
smoteenn = SMOTEENN(sampling_strategy=sampling_strategy, smote=smote)
X_resampled, y_resampled = smoteenn.fit_resample(X, y)

SMOTENN Sampling Strategy = auro


In [9]:
#LR Model Training Script with specific hyper parameters , 
#these parameters can be adjusted but due to time limitation many trials lead to the best perfromacne as below settings

import plotly.figure_factory as ff
import plotly.graph_objects as go
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib
import pandas as pd
import numpy as np

# Assuming X_resampled and y_resampled are your features and target variable
X_resampled.reset_index(drop=True, inplace=True)
y_resampled.reset_index(drop=True, inplace=True)

hyperparameters = {
    'penalty': 'elasticnet',
    'C': 1,
    'l1_ratio': 0.2,
    'fit_intercept': True,
    'max_iter': 100000,
    'tol': 0.00001,
    'random_state': 42,
    'solver':'saga'
}  ##Elastic

#if you need to try another regulraization method comment the elasticnet and uncoment one of the below hyperparameters.

#hyperparameters = {
#    'penalty': 'l2',        # Ridge (l2) penalty
#    'solver': 'saga',       # Optimization algorithm
#    'C': .5,               # Inverse of regularization strength
#    'max_iter': 100000,       # Maximum number of iterations
#    'tol': 0.00001,          # Tolerance for stopping criteria
#    'random_state': 42      # Random state for reproducibility
#} ##ridge_params


#hyperparameters = {
#    'penalty': 'l1',
#    'C': 0.5,
#    'fit_intercept': True,
#    'max_iter': 100000,
#    'tol': 0.0001,
#    'solver': 'saga'
#}  ##lasso


# Initialize Logistic Regression classifier
lr_classifier = LogisticRegression(**hyperparameters)

# Initialize cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
cv_scores = []
conf_matrices = []

# Iterate over cross-validation folds
for train_index, test_index in cv.split(X_resampled, y_resampled):
    # Get train and test data for this fold
    X_train_fold, X_test_fold = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
    y_train_fold, y_test_fold = y_resampled.iloc[train_index], y_resampled.iloc[test_index]
    
    # Fit the model on the train data for this fold
    lr_classifier.fit(X_train_fold, y_train_fold)
    
    # Make predictions on the test data for this fold
    y_pred_fold = lr_classifier.predict(X_test_fold)
    
    # Calculate accuracy for this fold
    accuracy_fold = accuracy_score(y_test_fold, y_pred_fold)
    cv_scores.append(accuracy_fold)
    
    # Calculate confusion matrix for this fold
    conf_matrix_fold = confusion_matrix(y_test_fold, y_pred_fold)
  
    # Append the confusion matrix to the list
    conf_matrices.append(conf_matrix_fold)
    
# Print cross-validation scores
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", np.mean(cv_scores))


# Fit the model on the entire dataset
lr_classifier.fit(X_resampled, y_resampled)

# Save the model to a file
joblib.dump(lr_classifier, 'best_logistic_regression_model.pkl')


Cross-validation scores: [0.9369151809832523, 0.9367801188546732, 0.9375850891410049, 0.9367206915180983, 0.9367801188546732]
Mean CV score: 0.9369562398703405


['best_logistic_regression_model.pkl']

In [10]:
#plot the model perfromacne metric for each fold along with the confusion matrix
import plotly.figure_factory as ff
import numpy as np

# Define lists to store precision, recall, accuracy, and F1 score for each fold
precisions = []
recalls = []
accuracies = []
f1_scores = []

# Plot confusion matrix for each fold using Plotly
for i, conf_matrix in enumerate(conf_matrices):
    # Calculate precision, recall, accuracy, and F1 score
    tp = conf_matrix[1, 1]
    fp = conf_matrix[0, 1]
    fn = conf_matrix[1, 0]
    tn = conf_matrix[0, 0]
    
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    accuracy = (tp + tn) / (tp + fp + fn + tn)
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    precisions.append(precision)
    recalls.append(recall)
    accuracies.append(accuracy)
    f1_scores.append(f1_score)
    
    # Print precision, recall, accuracy, and F1 score for the current fold
    print(f"Fold {i+1} - Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}, F1 Score: {f1_score:.4f}")
    
    # Plot confusion matrix
    fig = ff.create_annotated_heatmap(z=conf_matrix, x=['Predicted Negative', 'Predicted Positive'],
                                      y=['Actual Negative', 'Actual Positive'], annotation_text=conf_matrix.tolist(),
                                      colorscale='Blues')
    fig.update_layout(title=f'Confusion Matrix - Fold {i+1}', xaxis_title='Predicted Labels', yaxis_title='True Labels')
    fig.show()

# Calculate and print average precision, recall, accuracy, and F1 score across all folds
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_accuracy = np.mean(accuracies)
avg_f1_score = np.mean(f1_scores)
print(f"Average Precision: {avg_precision:.4f}, Average Recall: {avg_recall:.4f}, Average Accuracy: {avg_accuracy:.4f}, Average F1 Score: {avg_f1_score:.4f}")


Fold 1 - Precision: 0.9682, Recall: 0.9046, Accuracy: 0.9369, F1 Score: 0.9353


Fold 2 - Precision: 0.9680, Recall: 0.9045, Accuracy: 0.9368, F1 Score: 0.9352


Fold 3 - Precision: 0.9679, Recall: 0.9062, Accuracy: 0.9376, F1 Score: 0.9361


Fold 4 - Precision: 0.9675, Recall: 0.9048, Accuracy: 0.9367, F1 Score: 0.9351


Fold 5 - Precision: 0.9673, Recall: 0.9051, Accuracy: 0.9368, F1 Score: 0.9352


Average Precision: 0.9678, Average Recall: 0.9051, Average Accuracy: 0.9370, Average F1 Score: 0.9354


In [11]:
#plot the ROC curve for each fold to check the AUC and model peformance for each fold
import plotly.graph_objs as go
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression

# Initialize an empty list to store the true positive rate (TPR) and false positive rate (FPR) for each fold
roc_traces = []

# Initialize Logistic Regression classifier
lr_classifier = LogisticRegression(random_state=42)

# Iterate over cross-validation folds
for i, (train_index, test_index) in enumerate(cv.split(X_resampled, y_resampled)):
    # Get train and test data for this fold
    X_train_fold, X_test_fold = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
    y_train_fold, y_test_fold = y_resampled.iloc[train_index], y_resampled.iloc[test_index]
    
    # Fit the model on the train data for this fold
    lr_classifier.fit(X_train_fold, y_train_fold)
    
    # Make predictions on the test data for this fold
    y_pred_prob_fold = lr_classifier.predict_proba(X_test_fold)[:, 1]  # Predict probabilities for the positive class
    
    # Calculate the false positive rate (FPR) and true positive rate (TPR) for this fold
    fpr, tpr, _ = roc_curve(y_test_fold, y_pred_prob_fold)
    
    # Calculate the AUC (Area Under the ROC Curve) for this fold
    auc_score = auc(fpr, tpr)
    
    # Plot the ROC curve for this fold
    roc_trace = go.Scatter(x=fpr, y=tpr, 
                           mode='lines', 
                           line=dict(width=1),
                           name=f'Fold {i+1} (AUC = {auc_score:.4f})')
    
    roc_traces.append(roc_trace)

layout = go.Layout(title='Receiver Operating Characteristic (ROC) Curve - Cross Validation',
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'),
                   legend=dict(x=0.05, y=0.95, bgcolor='rgba(255, 255, 255, 0.5)', bordercolor='rgba(0, 0, 0, 0.5)'))

fig = go.Figure(data=roc_traces, layout=layout)
fig.show()


In [12]:
#Encode the validation categorical parameters
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# Encode categorical variables
label_encoder = LabelEncoder()
val_df['category'] = label_encoder.fit_transform(val_df['category'])
val_df['gender'] = label_encoder.fit_transform(val_df['gender'])
val_df['customer'] = label_encoder.fit_transform(val_df['customer'])
val_df['merchant'] = label_encoder.fit_transform(val_df['merchant'])

In [13]:
#min max scaler for the validation data set
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
normalized_data_val = scaler.fit_transform(val_df)
# Create a new DataFrame with the normalized data
normalized_df_val = pd.DataFrame(normalized_data_val, columns=val_df.columns)

In [14]:
# test the best reulted model on an unseen data set (validation dataset) to see how the model will perfrom on unseen data.
import joblib
# Load the saved model
loaded_model = joblib.load('best_logistic_regression_model.pkl')

X_val = normalized_df_val.drop('fraud', axis=1)  # Adjust 'target_column_name' to your target column name
y_val = normalized_df_val['fraud']  # Adjust 'target_column_name' to your target column name

# Make predictions on the validation dataset
y_pred = loaded_model.predict(X_val)

# Evaluate the model performance on the validation dataset
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)

# Classification report
print("Classification Report:")
print(classification_report(y_val, y_pred))

# Confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Plot Confusion Matrix
conf_matrix = confusion_matrix(y_val, y_pred)
# Define the labels for the confusion matrix cells
labels = [['TN', 'FP'], ['FN', 'TP']]

# Define the text for the confusion matrix cells
text = [[str(conf_matrix[0, 0]), str(conf_matrix[0, 1])],
        [str(conf_matrix[1, 0]), str(conf_matrix[1, 1])]]

# Create a heatmap figure using Plotly
fig = ff.create_annotated_heatmap(z=conf_matrix, x=['Predicted Negative', 'Predicted Positive'],
                                  y=['Actual Negative', 'Actual Positive'], annotation_text=text, colorscale='Blues')

# Update the layout of the figure
fig.update_layout(title='Confusion Matrix', xaxis_title='Predicted Labels', yaxis_title='True Labels')

# Add cell labels to the heatmap
for i in range(len(labels)):
    for j in range(len(labels[0])):
        fig.add_annotation(text=labels[i][j], x=['Predicted Negative', 'Predicted Positive'][j], y=['Actual Negative', 'Actual Positive'][i],
                           font=dict(color='black', size=14))

# Show the plot
fig.show()

Validation Accuracy: 0.9538295958092643
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.95      0.98    117512
         1.0       0.19      0.88      0.31      1417

    accuracy                           0.95    118929
   macro avg       0.59      0.92      0.64    118929
weighted avg       0.99      0.95      0.97    118929

Confusion Matrix:
[[112186   5326]
 [   165   1252]]
