In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import warnings
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from imblearn.over_sampling import SMOTENC
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
df= pd.read_csv('fraud.csv')
df = df.map(lambda x: x.replace("'", "") if isinstance(x, str) else x)
df['gender'].fillna(df['gender'].mode()[0], inplace=True)
df.loc[df['gender'] == 'E', 'age'] = '7'
df['age']=df['age'].astype(int)

In [3]:
# Assuming "step" is a column in your DataFrame df
# Normalize the "step" attribute, circular encoding or circular representation, 
# and it helps LSTM models effectively capture periodic patterns in the data.
normalized_step = df['step'] / 179 * 2 * np.pi
df['step_sin'] = np.sin(normalized_step)
df['step_cos'] = np.cos(normalized_step)
df.drop(columns=['zipcodeOri','zipMerchant','step'], inplace=True)

In [4]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
label_encoder = LabelEncoder()
train_df['category'] = label_encoder.fit_transform(train_df['category'])
train_df['gender'] = label_encoder.fit_transform(train_df['gender'])
train_df['customer'] = label_encoder.fit_transform(train_df['customer'])
train_df['merchant'] = label_encoder.fit_transform(train_df['merchant'])
train_df['age']=train_df['age'].astype(int)
train_df['fraud']=train_df['fraud'].astype(int)
train_df = train_df[['customer','age','gender','merchant','category','step_sin','step_cos','amount','fraud']]

In [7]:
#min max scaler
scaler = MinMaxScaler()
normalized_data_tr = scaler.fit_transform(train_df)
normalized_df_tr = pd.DataFrame(normalized_data_tr, columns=train_df.columns)

In [None]:
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC

X = normalized_df_tr.drop('fraud', axis=1)  # Features
y = normalized_df_tr['fraud']  # Target variable

sampling_strategy = 'auto'
print("SMOTENN Sampling Strategy = auto")
categorical_features = [ 0,1, 2, 3, 4]

# Apply SMOTEENN to the dataset with specified parameters
smoteenn = SMOTEENN(sampling_strategy=sampling_strategy, smote=SMOTENC(categorical_features=categorical_features, random_state=42))
X_resampled, y_resampled = smoteenn.fit_resample(X, y)
resampled_df = pd.DataFrame(X_resampled, columns=normalized_df_tr.columns[:-1])  # Assuming normalized_df_tr has column names
resampled_df['fraud'] = y_resampled

In [41]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from keras.callbacks import ModelCheckpoint

# Convert DataFrame to numpy array
data = resampled_df.values

# Split data into features (X) and target variable (y)
X = data[:, :-1]  # All columns except the last one
y = data[:, -1]   # Last column

# Reshape the data to fit the input shape of the LSTM model
# LSTM expects input data in the shape (samples, time steps, features)
# Here, we assume each row represents a single time step
X = X.reshape(X.shape[0], 1, X.shape[1])

# Define the number of folds for cross-validation
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True)

# Initialize lists to store evaluation metrics across folds
losses = []
accuracies = []
confusion_matrices = []

# Perform cross-validation
for fold_index, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Build the LSTM model
    model = Sequential()
    model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2]),return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(50))
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Define model checkpoint to save the best model based on validation loss
    checkpoint_filepath = f"best_model_fold_{fold_index}.h5"
    model_checkpoint_callback = ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=False,
        monitor='val_loss',
        mode='min',
        save_best_only=True)
    
    # Train the model with model checkpoint callback
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0, validation_data=(X_test, y_test), callbacks=[model_checkpoint_callback])
    
    # Load the best model based on validation loss
    model.load_weights(checkpoint_filepath)
    
    # Evaluate the best model
    loss, accuracy = model.evaluate(X_test, y_test)
    losses.append(loss)
    accuracies.append(accuracy)
    
    # Predict probabilities for the test data
    y_pred_prob = model.predict(X_test)
    y_pred = (y_pred_prob > 0.5).astype(int).flatten()
    y_test_int = y_test.astype(int)
    
    # Compute confusion matrix
    cm = confusion_matrix(y_test_int, y_pred)
    confusion_matrices.append(cm)

# Compute mean evaluation metrics across folds
mean_loss = np.mean(losses)
mean_accuracy = np.mean(accuracies)
mean_cm = np.mean(confusion_matrices, axis=0)

print("Mean Test Loss:", mean_loss)
print("Mean Test Accuracy:", mean_accuracy)
print("Mean Confusion Matrix:")
print(mean_cm)

# Train the final model on the entire dataset
final_model = Sequential()
final_model.add(LSTM(50, input_shape=(X.shape[1], X.shape[2]),return_sequences=True))
final_model.add(Dropout(0.2))
final_model.add(LSTM(50))
final_model.add(Dense(1, activation='sigmoid'))
final_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
final_model.fit(X, y, epochs=10, batch_size=32, verbose=0)

# Save the final model
final_model.save('best_lstm_model.h5')


Mean Test Loss: 0.0209112960845232
Mean Test Accuracy: 0.9919394254684448
Mean Confusion Matrix:
[[90396.4   861.6]
 [  624.  92422.4]]


In [53]:
import plotly.figure_factory as ff
import numpy as np

# Define lists to store precision, recall, accuracy, and F1 score for each fold
precisions = []
recalls = []
accuracies = []
f1_scores = []

# Plot confusion matrix for each fold using Plotly
for i, conf_matrix in enumerate(confusion_matrices):
    # Calculate precision, recall, accuracy, and F1 score
    tp = conf_matrix[1, 1]
    fp = conf_matrix[0, 1]
    fn = conf_matrix[1, 0]
    tn = conf_matrix[0, 0]
    
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    accuracy = (tp + tn) / (tp + fp + fn + tn)
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    precisions.append(precision)
    recalls.append(recall)
    accuracies.append(accuracy)
    f1_scores.append(f1_score)
    
    # Print precision, recall, accuracy, and F1 score for the current fold
    print(f"Fold {i+1} - Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}, F1 Score: {f1_score:.4f}")
    
    # Plot confusion matrix
    fig = ff.create_annotated_heatmap(z=conf_matrix, x=['Predicted Negative', 'Predicted Positive'],
                                      y=['Actual Negative', 'Actual Positive'], annotation_text=conf_matrix.tolist(),
                                      colorscale='Blues')
    fig.update_layout(title=f'Confusion Matrix - Fold {i+1}', xaxis_title='Predicted Labels', yaxis_title='True Labels')
    fig.show()

# Calculate and print average precision, recall, accuracy, and F1 score across all folds
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_accuracy = np.mean(accuracies)
avg_f1_score = np.mean(f1_scores)
print(f"Average Precision: {avg_precision:.4f}, Average Recall: {avg_recall:.4f}, Average Accuracy: {avg_accuracy:.4f}, Average F1 Score: {avg_f1_score:.4f}")


Fold 1 - Precision: 0.9907, Recall: 0.9931, Accuracy: 0.9918, F1 Score: 0.9919


Fold 2 - Precision: 0.9899, Recall: 0.9951, Accuracy: 0.9924, F1 Score: 0.9925


Fold 3 - Precision: 0.9910, Recall: 0.9929, Accuracy: 0.9919, F1 Score: 0.9920


Fold 4 - Precision: 0.9912, Recall: 0.9921, Accuracy: 0.9915, F1 Score: 0.9916


Fold 5 - Precision: 0.9910, Recall: 0.9932, Accuracy: 0.9921, F1 Score: 0.9921


Average Precision: 0.9908, Average Recall: 0.9933, Average Accuracy: 0.9919, Average F1 Score: 0.9920


In [60]:
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc

# Plot ROC curve for each fold
fig = go.Figure()

for i, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train your LSTM model here
    # Replace this with your LSTM model training code

    # Make predictions on the test set
    y_pred_prob = model.predict(X_test)

    # Calculate false positive rate (fpr) and true positive rate (tpr)
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)

    # Calculate AUC
    roc_auc = auc(fpr, tpr)

    # Plot the ROC curve for the current fold
    fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC Fold {i+1} (AUC = {roc_auc:})'))

# Plot random guess line
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', line=dict(dash='dash'), name='Random Guess'))

# Update layout
fig.update_layout(title='Receiver Operating Characteristic (ROC)',
                  xaxis_title='False Positive Rate',
                  yaxis_title='True Positive Rate',
                  legend=dict(x=0.01, y=0.99),
                  margin=dict(l=0, r=0, t=30, b=0))

fig.show()




In [61]:
val_df['category'] = label_encoder.fit_transform(val_df['category'])
val_df['gender'] = label_encoder.fit_transform(val_df['gender'])
val_df['customer'] = label_encoder.fit_transform(val_df['customer'])
val_df['merchant'] = label_encoder.fit_transform(val_df['merchant'])
val_df['age']=val_df['age'].astype(int)
val_df['fraud']=val_df['fraud'].astype(int)

In [62]:
val_df = val_df[['customer','age','gender','merchant','category','step_sin','step_cos','amount','fraud']]

In [63]:
#min max scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
normalized_data_val = scaler.fit_transform(val_df)
# Create a new DataFrame with the normalized data
normalized_df_val = pd.DataFrame(normalized_data_val, columns=val_df.columns)

In [64]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from keras.models import load_model
# Load the pre-trained LSTM model
loaded_model = load_model('best_model_fold_2.h5')

# Assuming X_val and y_val are your validation data
# Extract features (X_val) and target variable (y_val) from the validation DataFrame (val_df)
X_val = normalized_df_val.drop('fraud', axis=1)
y_val = normalized_df_val['fraud']

# Reshape the validation data to match the input shape expected by the LSTM model
X_val_reshaped = X_val.values.reshape(X_val.shape[0], 1, X_val.shape[1])
y_pred_prob = loaded_model.predict(X_val_reshaped)

# Predict probabilities for each class using the loaded LSTM model
y_pred_prob = loaded_model.predict(X_val_reshaped)

# Convert probabilities to class labels based on a threshold (e.g., 0.5)
y_pred = (y_pred_prob > 0.5).astype(int)

# Flatten the predictions to get a 1D array
y_pred = y_pred.flatten()

# Define y_true as the true labels corresponding to the validation set
y_true = y_val

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)

# Calculate precision
precision = precision_score(y_true, y_pred)

# Calculate recall
recall = recall_score(y_true, y_pred)

# Calculate F1-score
f1 = f1_score(y_true, y_pred)

# Calculate confusion matrix
cm = confusion_matrix(y_true, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("Confusion Matrix:")
print(cm)



import plotly.figure_factory as ff

# Define the labels for the confusion matrix cells
labels = [['TN', 'FP'], ['FN', 'TP']]

# Define the text for the confusion matrix cells
text = [[str(cm[0, 0]), str(cm[0, 1])],
        [str(cm[1, 0]), str(cm[1, 1])]]

# Create a heatmap figure using Plotly
fig = ff.create_annotated_heatmap(z=cm, x=['Non-fraudulent', 'Fraudulent'],
                                  y=['Non-fraudulent', 'Fraudulent'], annotation_text=text, colorscale='Blues')

# Update the layout of the figure
fig.update_layout(title='Confusion Matrix', xaxis_title='Predicted Labels', yaxis_title='True Labels')

# Add cell labels to the heatmap
for i in range(len(labels)):
    for j in range(len(labels[0])):
        fig.add_annotation(text=labels[i][j], x=['Non-fraudulent', 'Fraudulent'][j], y=['Non-fraudulent', 'Fraudulent'][i],
                           font=dict(color='black', size=14))

# Show the plot
fig.show()

Accuracy: 0.9730343314078147
Precision: 0.30355575065847235
Recall: 0.9760056457304164
F1-score: 0.4630838774485183
Confusion Matrix:
[[114339   3173]
 [    34   1383]]
