In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Steps

0. Data Visualization and statistics 

1. Data Preprocessing : handling missing values, scaling features, and encoding categorical variables
2. Data Splitting: training, and testing sets (x_train and test, y_train and test)
3. Feature Selection/Engineering
4. Model Selection:  machine learning algorithms for fraud detection
> * Neural Networks
> * Logistic Regression
> * Decision Trees
> * Random Forests
> * Gradient Boosting (e.g., XGBoost, LightGBM)
5. Model Training
6. Model Evaluation:accuracy, precision, recall, F1-score, and ROC-AUC. 
7. Handling Imbalance: oversampling the minority class, undersampling the majority class, Synthetic Minority Over-sampling Technique (SMOTE)
8. Hyperparameter Tuning

In [None]:
# Load the credit-card fraud CSV into a pandas DataFrame
df = pd.read_csv("/kaggle/input/creditcardfraud/creditcard.csv")
# Show the values of the top rows, which gives a first idea of the columns and data types
df.head()

# **Data Visualization**

Doing some basic statistic to visualize the data:
Basic Stats

>* describe() shows a summary of numerical features.
>* value_counts()  generate a summary of categorical features.

In [None]:
# Summary statistics for the DataFrame's columns
df.describe()

In [None]:
# Histogram of every column of df, 50 bins each, drawn on a single 15x15 figure
df.hist(bins=50, figsize=(15, 15))

In [None]:
# Explore the dataset structure: column names, non-null counts and dtypes
df.info()

In [None]:
# Number of missing values in each column (isna is the pandas alias of isnull).
# The author's run reported no nulls; if any do appear, one option is to
# impute them with the column mean, e.g.:
#     df = df.fillna(df.mean())
df.isna().sum()

The target is the `Class` column, so let's compute some statistics on it.

In [None]:
# Number of rows for each value of the target column 'Class'
df['Class'].value_counts()


In [None]:
# Derive the class balance from the data instead of hard-coding the counts,
# so the figure stays correct if the dataset changes.
class_counts = df['Class'].value_counts()
# Fraud (Class == 1) relative to normal (Class == 0) transactions, as a real
# percentage: the message below says "%", so the ratio is scaled by 100.
prc = class_counts.loc[1] / class_counts.loc[0] * 100
print('The % of fraud to normal transaction is:'+ str(prc))
print('data is unbalanced')

#  Splitting Data x_train, y_train / x_test, y_test

1. split the features from the target into two data sets x: for features, y: for target
2. split the data sets into training data set to train the model and test data set to test the model 

In [None]:
# Target vector: the 'Class' column
y = df['Class']
# Feature matrix: every column except the target
x = df.drop(columns='Class')
y.head()


In [None]:
# Preview the feature matrix (df without the 'Class' column)
x.head()

# Feature Selection

**Correlation Analysis**: in this we will study the correlation between each feature with the target

In [None]:
# from pandas.plotting import scatter_matrix
# attributes = ['Time', 'V1', 'V2', 'V3','V4', 'V5', 'Amount','Class']
# scatter_matrix(df[attributes], figsize=(12,12)) 

In [None]:
import scipy.stats

# Pearson correlation coefficient of each feature column against the target.
# pearsonr returns (coefficient, p-value); only the coefficient is kept.
pearson_correlations = [
    (column, scipy.stats.pearsonr(x[column], y)[0])
    for column in x.columns
]

print("Pearson Correlations:")
for feature, corr in pearson_correlations:
    print(f"{feature}: {corr:.4f}")

We can see that the following features have the strongest (though still only weak-to-moderate) linear correlation with the target `Class`:

1. V1: -0.1013
2. **V3: -0.1930**
3. V4: 0.1334
4. **V10: -0.2169**
5. V11: 0.1549
6. **V12: -0.2606**
7. **V14: -0.3025**
8. **V16: -0.1965**
9. **V17: -0.3265**
10. V18: -0.1115


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The split is stratified on the target
# so the rare fraud class keeps the same proportion in both the training and
# the test set; random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)


**t-test - Feature Selection** : in this step, the t-test was applied on each continuous feature with the binary target 'Class' to assess the relationship for each.

In [None]:
# Two-sample t-test of every feature: fraud rows (Class == 1) vs. non-fraud
# rows (Class == 0). Welch's variant (equal_var=False) is used because the two
# groups differ hugely in size, so the pooled-variance assumption of the
# default Student t-test is not safe here.
t_test = []
for column in x.columns:
    fraud_values = x.loc[y == 1, column]
    non_fraud_values = x.loc[y == 0, column]
    t_statistic, p_value = scipy.stats.ttest_ind(fraud_values, non_fraud_values, equal_var=False)
    t_test.append((column, t_statistic, p_value))

# Print t-test results
print("T-Test Results:")
for feature, t_statistic, p_value in t_test:
    print(f"{feature}: t-statistic={t_statistic:.4f}, p-value={p_value:.4f}")
    

**p-value explanation:**
If the p-value is less than the significance level (commonly 0.05), the feature's mean differs significantly between fraud and non-fraud transactions, so the feature is potentially informative and we can select it for our model.

**Features to exclude because of a high p-value:**
1. V22: t-statistic=0.4298, **p-value=0.6674**
2. V23: t-statistic=-1.4330, **p-value=0.1519**
3. V25: t-statistic=1.7652, **p-value=0.0775**
 

# Data Preprocessing - Scaling Features

The data has high and low mean so the following step will normalize the data.

**Features (columns) with a high mean can dominate the learning process.**

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only and reuse the fitted parameters
# for the test split, so no test-set statistics are used during fitting.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Convert the scaled arrays to pandas DataFrames. The original column names
# and row index are carried over so describe() output is labelled by feature
# name instead of by position 0..N.
x_train_df = pd.DataFrame(x_train)
x_train_scaled_df = pd.DataFrame(x_train_scaled, columns=x_train.columns, index=x_train.index)
x_test_scaled_df = pd.DataFrame(x_test_scaled, columns=x_test.columns, index=x_test.index)

x_train_scaled_df.describe()

In [None]:
# Summary statistics of the unscaled training features, for comparison with the scaled version
x_train_df.describe()

As shown above, the statistics of `x_train_df` differ from those of `x_train_scaled_df`, which holds the data after the scaling transform.

In [None]:
# Summary statistics of the scaled test features
x_test_scaled_df.describe()

# Oversampling

In our data set we have the following counts for the results for our target "Class":

0:    **284315**  - non fraud

1:       **492**  - fraud


We will use oversampling of the training data in the model and test the accuracy.


In [None]:
from imblearn.over_sampling import SMOTE
# Oversample with SMOTE (seeded for reproducibility).
# Only the scaled training split is resampled; the test split is left untouched.
smote = SMOTE(random_state=42)
x_train_scaled_oversampled, y_train_oversampled = smote.fit_resample(x_train_scaled, y_train)


# Undersampling

We will also use undersampling in the model, test the accuracy, and keep the better of the two models.


In [None]:
from imblearn.under_sampling import RandomUnderSampler
# Randomly undersample (seeded for reproducibility).
# Only the scaled training split is resampled; the test split is left untouched.
under_sampler = RandomUnderSampler(random_state=42)
X_train_scaled_undersampled, y_train_undersampled = under_sampler.fit_resample(x_train_scaled, y_train)


# **Neural Network**

here we used two activation function:
1. ReLU (Rectified Linear Activation):f(x) = max(0, x) used in hidden layers to address the vanishing gradient problem
2. Sigmoid Activation: f(x) = 1 / (1 + exp(-x)) for target prediction

**Note:** we used all the features in this model, as a NN can handle a large number of inputs.


># Model 1: NN with Oversampling - Using All features

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# step1: build the model
# Two ReLU hidden layers (64 and 32 units) and a single sigmoid output unit
# that yields the fraud probability.
model_nn_oversampling = Sequential()
model_nn_oversampling.add(Dense(64, activation='relu', input_shape=(x_train_scaled_oversampled.shape[1],)))
model_nn_oversampling.add(Dense(32, activation='relu'))
model_nn_oversampling.add(Dense(1, activation='sigmoid'))

#Step2: compile the model
model_nn_oversampling.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#Step3: train the model
# Keras takes the validation_split fraction from the END of the arrays, before
# any shuffling, and SMOTE appends its synthetic minority rows after the
# original rows. Without an explicit shuffle the validation set would contain
# only synthetic fraud samples, so the rows are permuted first (fixed seed).
shuffle_rng = np.random.default_rng(42)
shuffled_idx = shuffle_rng.permutation(len(y_train_oversampled))
x_fit_oversampled = np.asarray(x_train_scaled_oversampled)[shuffled_idx]
y_fit_oversampled = np.asarray(y_train_oversampled)[shuffled_idx]
training_nn_oversampling_history = model_nn_oversampling.fit(x_fit_oversampled, y_fit_oversampled, epochs=10, batch_size=32, validation_split=0.1)

#Step4: Evaluate the model
# Evaluation uses the untouched (non-resampled) scaled test split.
loss, accuracy = model_nn_oversampling.evaluate(x_test_scaled, y_test)
print(f"Test loss: {loss:.4f}, Test accuracy: {accuracy:.4f}")







In this model: the **Test loss value of 0.0179** indicates that the model's predictions are closer to the actual outcomes.

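The **Accuracy of 0.9988** is the fraction of test instances that were predicted correctly. Because fraud makes up only a tiny share of the data, accuracy alone is not a reliable measure here; the confusion matrix further below is more informative.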




> **Plot the loss to detect overfitting**

In [None]:
# Per-epoch metrics recorded while fitting the oversampling model
nn1_metrics = training_nn_oversampling_history.history

# Training / validation loss
train_nn1_loss, val_nn1_loss = nn1_metrics['loss'], nn1_metrics['val_loss']

# Training / validation accuracy
train_nn1_acc, val_nn1_acc = nn1_metrics['accuracy'], nn1_metrics['val_accuracy']

In [None]:
import matplotlib.pyplot as plt

# Learning curves of the oversampling model.
# The epoch axis is derived from the recorded history rather than a hard-coded
# range(1, 11), so the plot stays correct if the number of epochs is changed.
epochs_nn1 = range(1, len(train_nn1_loss) + 1)

plt.figure(figsize=(10, 5))

# Plot Loss (left panel)
plt.subplot(1, 2, 1)
plt.plot(epochs_nn1, train_nn1_loss, label='Train Loss')
plt.plot(epochs_nn1, val_nn1_loss, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

# Plot accuracy (right panel)
plt.subplot(1, 2, 2)
plt.plot(epochs_nn1, train_nn1_acc, label='Train Accuracy')
plt.plot(epochs_nn1, val_nn1_acc, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()

plt.tight_layout()
plt.show()

**Conclusion** 

From the plot above: 

1. Both the training and validation loss are decreasing over the epochs, it indicates that the model is learning effectively. 

2. Both training and validation accuracy are increasing, it's a positive sign that the model is learning effectively

In [None]:
# step 5: predict the fraud probability of every test row
predictions = model_nn_oversampling.predict(x_test_scaled)
print(predictions)

# Turn the probabilities into hard 0/1 labels using a 0.5 threshold
above_threshold = predictions >= 0.5
binary_predictions = above_threshold.astype(int)
print("**** As binary Output ******")
print(binary_predictions)

> **NN Oversampling Confusion Matrix** 

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of the oversampling model on the test split
conf_matrix_nn1 = confusion_matrix(y_test, binary_predictions)

# Tick positions (cell centres) and the class names shown on both axes
tick_positions = [0.5, 1.5]
class_labels = ['Non-Fraud', 'Fraud']

# Draw the matrix as an annotated heatmap of integer counts
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_nn1, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix - NN Oversampling')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(tick_positions, class_labels)
plt.yticks(tick_positions, class_labels)
plt.show()

># **Model 2: NN with Undersampling - Using All features**

In [None]:
# step1: build the model
# Same architecture as the oversampling model: two ReLU hidden layers
# (64 and 32 units) and a single sigmoid output unit.
model_nn_undersampling = Sequential()
model_nn_undersampling.add(Dense(64, activation='relu', input_shape=(X_train_scaled_undersampled.shape[1],)))
model_nn_undersampling.add(Dense(32, activation='relu'))
model_nn_undersampling.add(Dense(1, activation='sigmoid'))

#Step2: compile the model
model_nn_undersampling.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#Step3: train the model
# Keras takes the validation_split fraction from the END of the arrays, before
# any shuffling, and the undersampler returns its rows grouped by class.
# Without an explicit shuffle the validation set would hold a single class,
# so the rows are permuted first (fixed seed for reproducibility).
shuffle_rng = np.random.default_rng(42)
shuffled_idx = shuffle_rng.permutation(len(y_train_undersampled))
x_fit_undersampled = np.asarray(X_train_scaled_undersampled)[shuffled_idx]
y_fit_undersampled = np.asarray(y_train_undersampled)[shuffled_idx]
training_nn_undersampling_history = model_nn_undersampling.fit(x_fit_undersampled, y_fit_undersampled, epochs=10, batch_size=32, validation_split=0.1)

#Step4: Evaluate the model
# Evaluation uses the untouched (non-resampled) scaled test split.
loss, accuracy = model_nn_undersampling.evaluate(x_test_scaled, y_test)
print(f"Test loss: {loss:.4f}, Test accuracy: {accuracy:.4f}")

In this model: the Test loss value is 0.1102, which is greater than that of the first model; this means its predictions are, on average, further from the actual outcomes.

The Accuracy of 0.9702 which is less than the first model

> **First Conclusion**

> *model_nn_oversampling is better than model_nn_undersampling*

In [None]:
# step 5: predict the fraud probability of every test row
predictions_nn_undersampling = model_nn_undersampling.predict(x_test_scaled)
print(predictions_nn_undersampling)

# Turn the probabilities into hard 0/1 labels using a 0.5 threshold
above_threshold_nn2 = predictions_nn_undersampling >= 0.5
binary_undersampling_predictions = above_threshold_nn2.astype(int)
print("**** As binary Output ******")
print(binary_undersampling_predictions)

> **Plot the loss to detect overfitting**

In [None]:
# Per-epoch metrics recorded while fitting the undersampling model
nn2_metrics = training_nn_undersampling_history.history

# Training / validation loss
train_nn2_loss, val_nn2_loss = nn2_metrics['loss'], nn2_metrics['val_loss']

# Training / validation accuracy
train_nn2_acc, val_nn2_acc = nn2_metrics['accuracy'], nn2_metrics['val_accuracy']

In [None]:
# Learning curves of the undersampling model.
# The epoch axis is derived from the recorded history rather than a hard-coded
# range(1, 11), so the plot stays correct if the number of epochs is changed.
epochs_nn2 = range(1, len(train_nn2_loss) + 1)

plt.figure(figsize=(10, 5))

# Plot Loss (left panel)
plt.subplot(1, 2, 1)
plt.plot(epochs_nn2, train_nn2_loss, label='Train Loss')
plt.plot(epochs_nn2, val_nn2_loss, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss - undersampling')
plt.legend()

# Plot accuracy (right panel)
plt.subplot(1, 2, 2)
plt.plot(epochs_nn2, train_nn2_acc, label='Train Accuracy')
plt.plot(epochs_nn2, val_nn2_acc, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy - undersampling')
plt.legend()

plt.tight_layout()
plt.show()

> **Second Conclusion** 

From the plot above: 

1. Both the training and validation loss are decreasing over the epochs, it indicates that the model is learning effectively. 

2. Both training and validation accuracy are increasing, it's a positive sign that the model is learning effectively

> **NN Undersampling Confusion Matrix** 


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of the undersampling model on the test split
conf_matrix_nn2 = confusion_matrix(y_test, binary_undersampling_predictions)

# Draw the matrix as an annotated heatmap of integer counts.
# The title names the model, like the oversampling matrix does, so the
# figures can be told apart when the notebook is skimmed.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_nn2, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - NN Undersampling')
plt.xticks([0.5, 1.5], ['Non-Fraud', 'Fraud'])
plt.yticks([0.5, 1.5], ['Non-Fraud', 'Fraud'])
plt.show()

# Random Forests

In this part we will test another ML model and compare it with the NN models 

Steps below are followed to build and apply Random Forests model:
**Note (data splitting):**

1. Splitting the data without scaling it because random forest algorithm depends on decision trees and relative comparisons between features. 
2. Feature Selection: using scikit-learn's SelectFromModel
3. Building and Training the Model
4. Evaluate the model


***Note*** no oversampling or undersampling is applied for the Random Forest model, on the assumption that this model can cope with the unbalanced data.


In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# A seeded 100-tree random forest is the estimator that SelectFromModel
# uses to decide which features to keep.
rf_selector = RandomForestClassifier(random_state=42, n_estimators=100)
selector = SelectFromModel(estimator=rf_selector)

# Fit on the unscaled training split only
selector.fit(X=x_train, y=y_train)

In [None]:
# Boolean mask over the training columns: True where the selector kept the feature
selected_mask = selector.get_support()
selected_columns = x_train.columns[selected_mask]
print(selected_columns)

# Reduce both splits to the selected features
x_train_rf_selected, x_test_rf_selected = (
    selector.transform(split) for split in (x_train, x_test)
)



In [None]:
from sklearn.metrics import classification_report

# Build the Random Forest model: 100 trees, depth capped at 10, seeded for reproducibility
random_forest = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

# Train the model on the selected features of the (unscaled, non-resampled) training split
random_forest.fit(x_train_rf_selected, y_train)


In [None]:
# Evaluate the RF model: predict on the selected test features and print the classification report
y_pred = random_forest.predict(x_test_rf_selected)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import accuracy_score

# Calculate accuracy of the Random Forest predictions on the test split.
# Four decimals are printed, matching the NN accuracy prints, because two
# decimals can round a score such as 0.9995 up to a misleading "1.00".
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of the Random Forest predictions on the test split
conf_matrix = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap of integer counts.
# The title names the model so this figure can be told apart from the
# neural-network confusion matrices earlier in the notebook.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Random Forest')
plt.xticks([0.5, 1.5], ['Non-Fraud', 'Fraud'])
plt.yticks([0.5, 1.5], ['Non-Fraud', 'Fraud'])
plt.show()