In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTENC

In [32]:
# First preprocessing phase: strip stray quote characters and clean up nulls.
df = pd.read_csv('fraud.csv')

# Remove the single quotes embedded in string cells so that the columns can be
# converted to proper dtypes afterwards. Non-string cells are left untouched.
df = df.map(lambda x: x.replace("'", "") if isinstance(x, str) else x)

# Fill missing genders with the most frequent value.
# The result is assigned back to the column on purpose: calling
# fillna(..., inplace=True) on df['gender'] is chained assignment, which
# pandas 2.x deprecates (the FutureWarning is silenced at the top of this
# notebook) and which no longer updates df once Copy-on-Write is enabled.
df['gender'] = df['gender'].fillna(df['gender'].mode()[0])

# Rows with gender 'E' are given a dedicated age category '7'.
# NOTE(review): per the original author's note, the missing/unknown ages all
# belong to gender 'E' -- confirm against the EDA.
df.loc[df['gender'] == 'E', 'age'] = '7'

# With the quotes removed and the 'E' rows patched, age can be cast to integers.
df['age'] = df['age'].astype(int)

In [33]:
# Drop the features that, according to the EDA, hold only a single value
# and therefore carry no information for the model.
df = df.drop(columns=['zipcodeOri', 'zipMerchant'])

In [34]:
# Hold out 20% of the rows as a validation set so the model can later be
# evaluated on data it never saw during training; the seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified on 'fraud'; consider passing
# stratify=df['fraud'] if the class ratio must match exactly in both parts.
train_df, val_df = train_test_split(
    df,
    test_size=0.20,
    random_state=42,
)

In [35]:
from sklearn.preprocessing import LabelEncoder
import joblib

# One LabelEncoder per categorical column. The encoders are fitted on the
# training split only and persisted so the same mapping can be reloaded later.
label_encoder_category = LabelEncoder()
label_encoder_gender = LabelEncoder()
label_encoder_customer = LabelEncoder()
label_encoder_merchant = LabelEncoder()

# (column name, encoder, file the fitted encoder is saved to)
encoding_plan = (
    ('category', label_encoder_category, 'label_encoder_category.pkl'),
    ('gender', label_encoder_gender, 'label_encoder_gender.pkl'),
    ('customer', label_encoder_customer, 'label_encoder_customer.pkl'),
    ('merchant', label_encoder_merchant, 'label_encoder_merchant.pkl'),
)

# Fit each encoder on its training column and replace the column with integer codes.
for column_name, encoder, _ in encoding_plan:
    train_df[column_name] = encoder.fit_transform(train_df[column_name])

# Save the fitted label encoders.
saved_paths = [joblib.dump(encoder, pkl_path) for _, encoder, pkl_path in encoding_plan]

# Echo the path list returned by the final dump as the cell output.
saved_paths[-1]



['label_encoder_merchant.pkl']

In [36]:
import joblib
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max from the encoded training frame. Every column
# of train_df is scaled, including the 'fraud' target that is split off later.
scaler = MinMaxScaler().fit(train_df)

# Apply the learned scaling and rebuild a DataFrame with the original column
# names (the result gets a fresh 0..n-1 index).
normalized_data = scaler.transform(train_df)
normalized_df_tr = pd.DataFrame(data=normalized_data, columns=train_df.columns)

# Persist the fitted scaler so the identical scaling can be reused later.
joblib.dump(scaler, 'min_max_scaler.pkl')


['min_max_scaler.pkl']

In [7]:
# Rebalance the training data with SMOTEENN: the minority class is
# oversampled and the majority class is undersampled.

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC

# Separate the target from the features.
y = normalized_df_tr['fraud']
X = normalized_df_tr.drop(columns='fraud')

sampling_strategy = 'auto'
print("SMOTENN Sampling Strategy = auto")

# Positional indices (within X) of the columns SMOTENC must treat as categorical.
# NOTE(review): presumably these are the customer/age/gender/merchant/category
# columns -- confirm against X.columns, since the indices depend on column order.
categorical_features = [1, 2, 3, 4, 5]

# SMOTENC is the oversampling step plugged into SMOTEENN; it carries the seed.
nominal_oversampler = SMOTENC(categorical_features=categorical_features, random_state=42)
smoteenn = SMOTEENN(sampling_strategy=sampling_strategy, smote=nominal_oversampler)

X_resampled, y_resampled = smoteenn.fit_resample(X, y)

SMOTENN Sampling Strategy = auto


In [8]:
# Decision-tree training: a hyperparameter grid search, scored on stratified
# k-fold splits of the resampled data, followed by a final fit on all of it.
import plotly.figure_factory as ff
import plotly.graph_objects as go
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib

# The fold indices produced below are positional, so give both objects a
# clean 0..n-1 index before slicing with .iloc.
X_resampled = X_resampled.reset_index(drop=True)
y_resampled = y_resampled.reset_index(drop=True)

# Hyperparameter grid explored by the search.
param_grid = {
    'max_depth': [None, 5, 10, 15,20],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6]
}

# Base estimator; GridSearchCV fits copies of it with each parameter combination.
dt_classifier = DecisionTreeClassifier(random_state=42)

# 5 stratified, shuffled folds. The same splitter drives both the outer loop
# below and the grid search's internal cross-validation.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid search that selects the best parameter combination by accuracy.
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=cv, scoring='accuracy')

# NOTE(review): the folds are drawn from the already-resampled data, so the
# held-out part of every fold also contains resampled rows. The scores below
# are therefore likely optimistic compared with the untouched validation set.
cv_scores = []
conf_matrices = []

for fold_train_idx, fold_test_idx in cv.split(X_resampled, y_resampled):
    # Tune on the training part of this fold only.
    grid_search.fit(X_resampled.iloc[fold_train_idx], y_resampled.iloc[fold_train_idx])

    # Score the fold's best estimator on the held-out part.
    fold_truth = y_resampled.iloc[fold_test_idx]
    fold_pred = grid_search.best_estimator_.predict(X_resampled.iloc[fold_test_idx])
    cv_scores.append(accuracy_score(fold_truth, fold_pred))

    # Keep the fold's confusion matrix for the metric/plot cells further down.
    fold_matrix = confusion_matrix(fold_truth, fold_pred)
    print("Confusion Matrix for Fold:")
    print(fold_matrix)
    conf_matrices.append(fold_matrix)

# Summary of the outer cross-validation.
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", np.mean(cv_scores))


# Final search on the complete resampled dataset; its best estimator is the
# model that gets persisted.
grid_search.fit(X_resampled, y_resampled)
best_model = grid_search.best_estimator_

# Save the best model to a file.
joblib.dump(best_model, 'best_decision_tree_model.pkl')


Confusion Matrix for Fold:
[[91377   409]
 [  246 93068]]
Confusion Matrix for Fold:
[[91360   426]
 [  261 93053]]
Confusion Matrix for Fold:
[[91388   397]
 [  254 93061]]
Confusion Matrix for Fold:
[[91350   435]
 [  268 93047]]
Confusion Matrix for Fold:
[[91318   467]
 [  239 93076]]
Cross-validation scores: [0.9964613722312263, 0.9962884927066451, 0.996482982171799, 0.9962020529443544, 0.9961858454889249]
Mean CV score: 0.9963241491085899


['best_decision_tree_model.pkl']

In [9]:
import plotly.figure_factory as ff
import numpy as np

# Per-fold metric collections, filled in the loop below.
precisions, recalls, accuracies, f1_scores = [], [], [], []

# Tick labels shared by every heatmap.
predicted_ticks = ['Predicted Negative', 'Predicted Positive']
actual_ticks = ['Actual Negative', 'Actual Positive']

for fold_idx, conf_matrix in enumerate(conf_matrices):
    # Rows of the matrix are the true classes, columns the predicted ones:
    # [[tn, fp],
    #  [fn, tp]]
    (tn, fp), (fn, tp) = conf_matrix

    # Derive the four metrics from the raw counts.
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    accuracy = (tp + tn) / (tp + fp + fn + tn)
    f1_score = 2 * (precision * recall) / (precision + recall)

    for collected, value in (
        (precisions, precision),
        (recalls, recall),
        (accuracies, accuracy),
        (f1_scores, f1_score),
    ):
        collected.append(value)

    # Report the metrics of the current fold.
    print(f"Fold {fold_idx+1} - Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}, F1 Score: {f1_score:.4f}")

    # Annotated heatmap of this fold's confusion matrix.
    fig = ff.create_annotated_heatmap(
        z=conf_matrix,
        x=predicted_ticks,
        y=actual_ticks,
        annotation_text=conf_matrix.tolist(),
        colorscale='Blues',
    )
    fig.update_layout(title=f'Confusion Matrix - Fold {fold_idx+1}', xaxis_title='Predicted Labels', yaxis_title='True Labels')
    fig.show()

# Average each metric across all folds and report the summary.
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_accuracy = np.mean(accuracies)
avg_f1_score = np.mean(f1_scores)
print(f"Average Precision: {avg_precision:.4f}, Average Recall: {avg_recall:.4f}, Average Accuracy: {avg_accuracy:.4f}, Average F1 Score: {avg_f1_score:.4f}")


Fold 1 - Precision: 0.9956, Recall: 0.9974, Accuracy: 0.9965, F1 Score: 0.9965


Fold 2 - Precision: 0.9954, Recall: 0.9972, Accuracy: 0.9963, F1 Score: 0.9963


Fold 3 - Precision: 0.9958, Recall: 0.9973, Accuracy: 0.9965, F1 Score: 0.9965


Fold 4 - Precision: 0.9953, Recall: 0.9971, Accuracy: 0.9962, F1 Score: 0.9962


Fold 5 - Precision: 0.9950, Recall: 0.9974, Accuracy: 0.9962, F1 Score: 0.9962


Average Precision: 0.9954, Average Recall: 0.9973, Average Accuracy: 0.9963, Average F1 Score: 0.9964


In [21]:
import plotly.graph_objects as go
import numpy as np
from sklearn.base import clone
from sklearn.metrics import roc_curve, auc

# One ROC trace per cross-validation fold.
roc_traces = []

# Iterate over the same stratified folds used during training.
for i, (train_index, test_index) in enumerate(cv.split(X_resampled, y_resampled)):
    # Train/test partitions for this fold.
    X_train_fold, X_test_fold = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
    y_train_fold, y_test_fold = y_resampled.iloc[train_index], y_resampled.iloc[test_index]

    # Fit a fresh, unfitted copy of the tuned tree for this fold.
    # Fitting `dt_classifier` here would plot the untuned base estimator
    # (default hyperparameters) rather than the model that is actually saved,
    # and would also mutate that shared estimator in place.
    # NOTE(review): the hyperparameters come from the search over the full
    # resampled dataset, so these curves may be slightly optimistic.
    fold_model = clone(grid_search.best_estimator_)
    fold_model.fit(X_train_fold, y_train_fold)

    # Scores for the ROC curve: predicted probability of the positive class.
    y_pred_fold = fold_model.predict_proba(X_test_fold)[:, 1]

    # False positive rate / true positive rate at every score threshold.
    fpr, tpr, _ = roc_curve(y_test_fold, y_pred_fold)

    # Area under this fold's ROC curve.
    auc_score = auc(fpr, tpr)

    # Line trace for this fold, labelled with its AUC.
    roc_trace = go.Scatter(x=fpr, y=tpr,
                           mode='lines',
                           line=dict(width=1),
                           name=f'Fold {i+1} (AUC = {auc_score:.4f})')

    roc_traces.append(roc_trace)

layout = go.Layout(title='Receiver Operating Characteristic (ROC) Curve - Cross Validation',
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'),
                   legend=dict(x=0.05, y=0.95, bgcolor='rgba(255, 255, 255, 0.5)', bordercolor='rgba(0, 0, 0, 0.5)'))

fig = go.Figure(data=roc_traces, layout=layout)
fig.show()


In [37]:
import joblib
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the label encoders that were fitted on the training split.
label_encoder_category = joblib.load('label_encoder_category.pkl')
label_encoder_gender = joblib.load('label_encoder_gender.pkl')
label_encoder_customer = joblib.load('label_encoder_customer.pkl')
label_encoder_merchant = joblib.load('label_encoder_merchant.pkl')

# Encode the validation columns with the training-time mapping.
# transform() is used instead of fit_transform(): re-fitting on the validation
# data would throw away the loaded mapping and rebuild it from whatever labels
# happen to be present here, so the same label could receive a different
# integer code than the one the model was trained on.
# transform() raises ValueError if the validation data contains a label that
# was never seen during training, which surfaces the problem instead of
# silently mis-encoding it.
val_df['category'] = label_encoder_category.transform(val_df['category'])
val_df['gender'] = label_encoder_gender.transform(val_df['gender'])
val_df['customer'] = label_encoder_customer.transform(val_df['customer'])
val_df['merchant'] = label_encoder_merchant.transform(val_df['merchant'])



In [38]:
import joblib

# Reload the min-max scaler that was fitted on the training data.
scaler = joblib.load('min_max_scaler.pkl')

# Apply the already-fitted scaling to the encoded validation frame (no
# re-fitting), then rebuild a DataFrame carrying the original column names.
normalized_data_val = scaler.transform(val_df)
normalized_df_val = pd.DataFrame(data=normalized_data_val, columns=val_df.columns)



In [39]:
import joblib
import plotly.figure_factory as ff


# Load the persisted decision tree.
loaded_model = joblib.load('best_decision_tree_model.pkl')

# Separate the 'fraud' target from the validation features.
y_val = normalized_df_val['fraud']
X_val = normalized_df_val.drop(columns='fraud')

# Predict on the held-out validation data.
y_pred = loaded_model.predict(X_val)

# Overall accuracy.
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_val, y_pred))

# Confusion matrix (rows: true class, columns: predicted class). It is
# computed once and reused for both the printout and the plot.
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Axis tick labels and the TN/FP/FN/TP tag of every cell.
predicted_ticks = ['Predicted Negative', 'Predicted Positive']
actual_ticks = ['Actual Negative', 'Actual Positive']
labels = [['TN', 'FP'], ['FN', 'TP']]

# Cell counts rendered as strings for the heatmap annotations.
text = [[str(count) for count in matrix_row] for matrix_row in conf_matrix]

# Annotated heatmap of the confusion matrix.
fig = ff.create_annotated_heatmap(z=conf_matrix, x=predicted_ticks, y=actual_ticks,
                                  annotation_text=text, colorscale='Blues')
fig.update_layout(title='Confusion Matrix', xaxis_title='Predicted Labels', yaxis_title='True Labels')

# Add the TN/FP/FN/TP tag as an extra annotation on each cell.
for row_idx, row_labels in enumerate(labels):
    for col_idx, cell_label in enumerate(row_labels):
        fig.add_annotation(text=cell_label, x=predicted_ticks[col_idx], y=actual_ticks[row_idx],
                           font=dict(color='black', size=14))

# Show the plot.
fig.show()

Validation Accuracy: 0.984528584281378
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99    117512
         1.0       0.43      0.88      0.58      1417

    accuracy                           0.98    118929
   macro avg       0.71      0.93      0.78    118929
weighted avg       0.99      0.98      0.99    118929

Confusion Matrix:
[[115841   1671]
 [   169   1248]]


In [14]:
import joblib
import pandas as pd
from sklearn.tree import export_graphviz
import graphviz

# Load the saved model.
loaded_model = joblib.load('best_decision_tree_model.pkl')

# Load the saved scaler.
scaler = joblib.load('min_max_scaler.pkl')

# Load the saved label encoders.
label_encoder_category = joblib.load('label_encoder_category.pkl')
label_encoder_gender = joblib.load('label_encoder_gender.pkl')
label_encoder_customer = joblib.load('label_encoder_customer.pkl')
label_encoder_merchant = joblib.load('label_encoder_merchant.pkl')

# Undo the min-max scaling of the normalized, encoded validation frame.
denormalized_data_val = scaler.inverse_transform(normalized_df_val)

# Convert the denormalized data back to a DataFrame with the same columns.
denormalized_df_val = pd.DataFrame(denormalized_data_val, columns=normalized_df_val.columns)

# Decode the categorical features back to their original labels.
# The inverse scaling is floating-point arithmetic, so an integer code can
# come back marginally below its true value; astype(int) alone truncates and
# would then decode to the previous label. Round to the nearest integer first.
denormalized_df_val['category'] = label_encoder_category.inverse_transform(
    denormalized_df_val['category'].round().astype(int))
denormalized_df_val['gender'] = label_encoder_gender.inverse_transform(
    denormalized_df_val['gender'].round().astype(int))
denormalized_df_val['customer'] = label_encoder_customer.inverse_transform(
    denormalized_df_val['customer'].round().astype(int))
denormalized_df_val['merchant'] = label_encoder_merchant.inverse_transform(
    denormalized_df_val['merchant'].round().astype(int))

# Feature names in model order: every column except the 'fraud' target,
# selected by name rather than by assuming the target is the last column.
feature_names = [column for column in denormalized_df_val.columns if column != 'fraud']

# Export the decision tree in DOT format (only the top 8 levels are drawn).
dot_data = export_graphviz(loaded_model,
                           feature_names=feature_names,
                           class_names=['Not Fraud', 'Fraud'],
                           filled=True,
                           special_characters=True,
                           label='all',
                           leaves_parallel=False,
                           impurity=False,
                           node_ids=False,
                           proportion=False,
                           rotate=False,
                           max_depth=8,
                           precision=2)

# Render the tree with Graphviz.
graph = graphviz.Source(dot_data)
graph.render('decision_tree', format='png', cleanup=True)  # Save as PNG
# NOTE(review): view() renders once more in the Source's default format and
# opens it in the system viewer -- confirm this is wanted in addition to the PNG.
graph.view()  # Display the decision tree


'decision_tree.pdf'