In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
df= pd.read_csv('fraud.csv')
df = df.map(lambda x: x.replace("'", "") if isinstance(x, str) else x)
df['gender'].fillna(df['gender'].mode()[0], inplace=True)
df.loc[df['gender'] == 'E', 'age'] = '7'
df['age']=df['age'].astype(int)

In [4]:
df.drop(columns=['zipcodeOri','zipMerchant'], inplace=True)

In [45]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=425)

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# Encode categorical variables
label_encoder = LabelEncoder()
train_df['category'] = label_encoder.fit_transform(train_df['category'])
train_df['gender'] = label_encoder.fit_transform(train_df['gender'])
train_df['age'] = label_encoder.fit_transform(train_df['age'])
train_df['customer'] = label_encoder.fit_transform(train_df['customer'])
train_df['merchant'] = label_encoder.fit_transform(train_df['merchant'])
train_df['amount'] = label_encoder.fit_transform(train_df['amount'])


In [7]:
#min max scaler
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(train_df)
# Create a new DataFrame with the normalized data
normalized_df_tr = pd.DataFrame(normalized_data, columns=train_df.columns)

In [8]:
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC

X = normalized_df_tr.drop('fraud', axis=1)  # Features
y = normalized_df_tr['fraud']  # Target variable

sampling_strategy = 'auto'
print("SMOTENN Sampling Strategy = auto")
categorical_features = [1, 2, 3, 4, 5]

# Apply SMOTEENN to the dataset with specified parameters
smote = SMOTENC(categorical_features=categorical_features, random_state=42, n_jobs=-1)  # Set n_jobs to -1 for maximum parallelization if supported
smoteenn = SMOTEENN(sampling_strategy=sampling_strategy, smote=smote)
X_resampled, y_resampled = smoteenn.fit_resample(X, y)

SMOTENN Sampling Strategy = auto


In [None]:
#parralle computation for xgboost
import plotly.figure_factory as ff
import plotly.graph_objects as go
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib
import numpy as np
from joblib import Parallel, delayed

# Assuming X_resampled and y_resampled are your features and target variable
X_resampled.reset_index(drop=True, inplace=True)
y_resampled.reset_index(drop=True, inplace=True)

param_grid = {
    'max_depth': [3, 5, 7],  
    'min_child_weight': [1, 3, 5], 
    'gamma': [0, 0.1, 0.2],  
    'subsample': [0.8, 1],  
    'colsample_bytree': [0.8, 1], 
    'learning_rate': [0.01, 0.1, 0.2]  
}

# Initialize XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Initialize cross-validation for 5 splits
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize Grid Search with cross-validation for better model selection
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=cv, scoring='accuracy')

# Perform cross-validation
cv_scores = []
conf_matrices = []

# Define function to train and evaluate model for a single fold
def train_and_evaluate_fold(train_index, test_index):
    # Get train and test data for this fold
    X_train_fold, X_test_fold = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
    y_train_fold, y_test_fold = y_resampled.iloc[train_index], y_resampled.iloc[test_index]
    
    # Fit Grid Search to find the best model for this fold
    grid_search.fit(X_train_fold, y_train_fold)
    
    # Get the best model from the grid search
    best_model = grid_search.best_estimator_
    
    # Make predictions on the test data for this fold using the best model
    y_pred_fold = best_model.predict(X_test_fold)
    
    # Calculate accuracy for this fold
    accuracy_fold = accuracy_score(y_test_fold, y_pred_fold)
    
    # Calculate confusion matrix for this fold
    conf_matrix_fold = confusion_matrix(y_test_fold, y_pred_fold)
    
    return accuracy_fold, conf_matrix_fold

# Parallelize the training and evaluation of each fold
results = Parallel(n_jobs=-1)(
    delayed(train_and_evaluate_fold)(train_index, test_index)
    for train_index, test_index in cv.split(X_resampled, y_resampled)
)

# Extract cross-validation scores and confusion matrices
cv_scores, conf_matrices = zip(*results)

# Print cross-validation scores
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", np.mean(cv_scores))

# Fit Grid Search to find the best model on the entire dataset
grid_search.fit(X_resampled, y_resampled)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Save the best model to a file
joblib.dump(best_model, 'best_xgboost_model.pkl')

In [50]:
import plotly.figure_factory as ff
import numpy as np

# Define lists to store precision, recall, accuracy, and F1 score for each fold
precisions = []
recalls = []
accuracies = []
f1_scores = []

# Plot confusion matrix for each fold using Plotly
for i, conf_matrix in enumerate(conf_matrices):
    # Calculate precision, recall, accuracy, and F1 score
    tp = conf_matrix[1, 1]
    fp = conf_matrix[0, 1]
    fn = conf_matrix[1, 0]
    tn = conf_matrix[0, 0]
    
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    accuracy = (tp + tn) / (tp + fp + fn + tn)
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    precisions.append(precision)
    recalls.append(recall)
    accuracies.append(accuracy)
    f1_scores.append(f1_score)
    
    # Print precision, recall, accuracy, and F1 score for the current fold
    print(f"Fold {i+1} - Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}, F1 Score: {f1_score:.4f}")
    
    # Plot confusion matrix
    fig = ff.create_annotated_heatmap(z=conf_matrix, x=['Predicted Negative', 'Predicted Positive'],
                                      y=['Actual Negative', 'Actual Positive'], annotation_text=conf_matrix.tolist(),
                                      colorscale='Blues')
    fig.update_layout(title=f'Confusion Matrix - Fold {i+1}', xaxis_title='Predicted Labels', yaxis_title='True Labels')
    fig.show()

# Calculate and print average precision, recall, accuracy, and F1 score across all folds
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_accuracy = np.mean(accuracies)
avg_f1_score = np.mean(f1_scores)
print(f"Average Precision: {avg_precision:.4f}, Average Recall: {avg_recall:.4f}, Average Accuracy: {avg_accuracy:.4f}, Average F1 Score: {avg_f1_score:.4f}")


Fold 1 - Precision: 0.9955, Recall: 0.9992, Accuracy: 0.9973, F1 Score: 0.9973


Fold 2 - Precision: 0.9955, Recall: 0.9991, Accuracy: 0.9973, F1 Score: 0.9973


Fold 3 - Precision: 0.9955, Recall: 0.9993, Accuracy: 0.9974, F1 Score: 0.9974


Fold 4 - Precision: 0.9952, Recall: 0.9992, Accuracy: 0.9972, F1 Score: 0.9972


Fold 5 - Precision: 0.9957, Recall: 0.9992, Accuracy: 0.9974, F1 Score: 0.9974


Average Precision: 0.9955, Average Recall: 0.9992, Average Accuracy: 0.9973, Average F1 Score: 0.9973


In [32]:
import plotly.graph_objs as go

# Plot the ROC curve for each fold using Plotly
roc_traces = []
for i, (fpr_fold, tpr_fold) in enumerate(zip(fprs_padded, tprs_padded)):
    # Sort FPR and TPR for this fold
    sorted_indices_fold = np.argsort(fpr_fold)
    fpr_fold_sorted = fpr_fold[sorted_indices_fold]
    tpr_fold_sorted = tpr_fold[sorted_indices_fold]
    
    # Calculate AUC for this fold
    auc_fold = auc(fpr_fold_sorted, tpr_fold_sorted)
    
    # Create a trace for this fold
    roc_trace_fold = go.Scatter(x=fpr_fold_sorted, y=tpr_fold_sorted,
                                mode='lines',
                                line=dict(width=1),
                                name=f'Fold {i+1} (AUC = {auc_fold:.8f})')
    roc_traces.append(roc_trace_fold)

# Add the random guess line
random_guess_trace = go.Scatter(x=[0, 1], y=[0, 1], 
                                mode='lines', 
                                line=dict(color='gray', dash='dash', width=1),
                                name='Random Guess')

layout = go.Layout(title='Receiver Operating Characteristic (ROC) Curve - Cross-validation',
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'),
                   legend=dict(x=0.05, y=0.95, bgcolor='rgba(255, 255, 255, 0.5)', bordercolor='rgba(0, 0, 0, 0.5)'))

fig = go.Figure(data=roc_traces + [random_guess_trace], layout=layout)
fig.show()


In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# Encode categorical variables
label_encoder = LabelEncoder()
val_df['category'] = label_encoder.fit_transform(val_df['category'])
val_df['gender'] = label_encoder.fit_transform(val_df['gender'])
val_df['age'] = label_encoder.fit_transform(val_df['age'])
val_df['customer'] = label_encoder.fit_transform(val_df['customer'])
val_df['merchant'] = label_encoder.fit_transform(val_df['merchant'])
val_df['amount'] = label_encoder.fit_transform(val_df['amount'])

In [47]:
#min max scaler
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
normalized_data_val = scaler.fit_transform(val_df)
# Create a new DataFrame with the normalized data
normalized_df_val = pd.DataFrame(normalized_data_val, columns=val_df.columns)

In [48]:
normalized_df_val

Unnamed: 0,step,customer,age,gender,merchant,category,amount,fraud
0,0.357542,0.744759,0.142857,0.5,0.612245,0.857143,0.363487,0.0
1,0.262570,0.841297,0.714286,1.0,0.367347,0.857143,0.415236,0.0
2,0.184358,0.342760,0.428571,1.0,0.367347,0.857143,0.383469,0.0
3,0.407821,0.391273,0.428571,0.5,0.612245,0.857143,0.217452,0.0
4,0.418994,0.392004,0.571429,0.5,0.612245,0.857143,0.080393,0.0
...,...,...,...,...,...,...,...,...
118924,0.363128,0.308386,0.285714,1.0,0.612245,0.857143,0.321417,0.0
118925,0.592179,0.219161,0.285714,0.5,0.142857,0.357143,0.812129,0.0
118926,0.167598,0.990005,0.285714,0.5,0.795918,0.285714,0.578754,0.0
118927,0.301676,0.306436,0.285714,0.5,0.489796,1.000000,0.400718,0.0


In [49]:
import joblib
# Load the saved model
loaded_model = joblib.load('best_xgboost_model.pkl')

X_val = normalized_df_val.drop('fraud', axis=1)  # Adjust 'target_column_name' to your target column name
y_val = normalized_df_val['fraud']  # Adjust 'target_column_name' to your target column name

# Make predictions on the validation dataset
y_pred = loaded_model.predict(X_val)

# Evaluate the model performance on the validation dataset
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)

# Classification report
print("Classification Report:")
print(classification_report(y_val, y_pred))

# Confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Plot Confusion Matrix
conf_matrix = confusion_matrix(y_val, y_pred)
# Define the labels for the confusion matrix cells
labels = [['TN', 'FP'], ['FN', 'TP']]

# Define the text for the confusion matrix cells
text = [[str(conf_matrix[0, 0]), str(conf_matrix[0, 1])],
        [str(conf_matrix[1, 0]), str(conf_matrix[1, 1])]]

# Create a heatmap figure using Plotly
fig = ff.create_annotated_heatmap(z=conf_matrix, x=['Predicted Negative', 'Predicted Positive'],
                                  y=['Actual Negative', 'Actual Positive'], annotation_text=text, colorscale='Blues')

# Update the layout of the figure
fig.update_layout(title='Confusion Matrix', xaxis_title='Predicted Labels', yaxis_title='True Labels')

# Add cell labels to the heatmap
for i in range(len(labels)):
    for j in range(len(labels[0])):
        fig.add_annotation(text=labels[i][j], x=['Predicted Negative', 'Predicted Positive'][j], y=['Actual Negative', 'Actual Positive'][i],
                           font=dict(color='black', size=14))

# Show the plot
fig.show()

Validation Accuracy: 0.971243346870822
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.97      0.99    117426
         1.0       0.30      0.98      0.46      1503

    accuracy                           0.97    118929
   macro avg       0.65      0.98      0.72    118929
weighted avg       0.99      0.97      0.98    118929

Confusion Matrix:
[[114029   3397]
 [    23   1480]]
