In [None]:
pip install --upgrade scikit-learn

In [None]:
pip install --upgrade imbalanced-learn

In [None]:
pip install xgboost

In [36]:
# Import required libraries
from collections import Counter

import imblearn
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
import xgboost as xgb

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the transactions CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer='financial_transactions.csv')

In [3]:
# Preview the first rows of the freshly loaded data
df.head()

In [None]:
# column and variable types
df.info()  # prints its report directly
# summary statistics; as the last expression, this is what the cell displays
df.describe()

In [None]:
# Number of missing entries in each column
df.isna().sum()

In [None]:
# check for any imbalance: row count per value of the isFraud target
df['isFraud'].value_counts()

In [None]:
# One-hot encode the categorical `type` column. drop_first=True omits the first
# category, which becomes the implicit baseline of the remaining dummy columns.
df = pd.get_dummies(df, columns=['type'], drop_first=True)
# End the cell with the expression instead of print() so the notebook renders the
# frame as a rich table rather than wrapped plain text.
df.head()

In [8]:
# Amount and balance columns that get standardised
numerical_columns = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]

In [9]:
# Standardise the selected numeric columns of df in place.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split, so test rows contribute to the scaling statistics. Consider
# fitting on the training rows only.
scaler = StandardScaler().fit(df[numerical_columns])
df[numerical_columns] = scaler.transform(df[numerical_columns])

In [None]:
# Inspect the frame again after encoding and scaling
df.head()

In [11]:
# Remove the two account-name columns and the isFlaggedFraud column
df = df.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'])

# Separate the target (isFraud) from the feature matrix
y = df['isFraud']
X = df.drop(columns='isFraud')

In [None]:
# Bar chart of rows per isFraud value, drawn with a logarithmic y-axis
class_ax = sns.countplot(data=df, x='isFraud')
class_ax.set_yscale('log')
plt.savefig('fraud_distribution.png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Pairwise correlations between the remaining columns of df
corr_matrix = df.corr()

# Annotated heatmap of the matrix, written to disk and then displayed
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True, fmt='.2f')
plt.savefig('Correlation.png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Box plot of transaction amount per fraud class.
# `amount` was standardised earlier in the notebook, so every value at or below
# the mean is <= 0 and cannot be drawn on a plain log axis. 'symlog' is linear
# around zero and logarithmic in the tails, so the whole distribution stays visible.
plt.figure(figsize=(10, 6))
sns.boxplot(x='isFraud', y='amount', data=df)
plt.yscale('symlog')
plt.savefig('Boxplot.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Scatter of oldbalanceOrg against amount, coloured by the isFraud label
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='oldbalanceOrg', y='amount', hue='isFraud')
plt.savefig('scatterplot.png', bbox_inches='tight', dpi=300)
plt.show()


In [16]:
# Split into 80% training and 20% testing.
# stratify=y preserves the fraud / non-fraud ratio in both partitions; with a
# heavily imbalanced target, a purely random split can leave the test set with a
# noticeably different share of positives.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Report the dimensions of the feature matrix and target for each partition
for split_label, split_X, split_y in (("Training", X_train, y_train), ("Test", X_test, y_test)):
    print(f"{split_label} Set Shape: {split_X.shape}, {split_y.shape}")

In [None]:
# Address the class imbalance by resampling the training split with SMOTE.
# The test split is deliberately left untouched.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling
print(f'Original class distribution: {Counter(y_train)}')
print(f'Resampled class distribution: {Counter(y_resampled)}')

In [None]:
# Use the real column names of the feature matrix so coefficient tables and plots
# are labelled meaningfully instead of with positional placeholders.
feature_names = list(X.columns)

In [None]:
# Feature scaling: fit on the resampled training features, then apply the same
# transformation to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

# Initialise the Lasso (L1-regularised linear) model
lasso = Lasso(alpha=0.01)

# Fit against y_resampled: X_train_scaled is derived from X_resampled, so the
# un-resampled y_train has a different number of rows and fit() would raise a
# ValueError for inconsistent sample counts.
lasso.fit(X_train_scaled, y_resampled)

# One coefficient per feature
lasso_coefficients = lasso.coef_

# Pair each coefficient with its feature name
feature_importances = pd.DataFrame({'Feature': feature_names, 'Coefficient': lasso_coefficients})

# Order rows by absolute coefficient, largest first
importance_order = feature_importances['Coefficient'].abs().sort_values(ascending=False).index
feature_importances = feature_importances.reindex(importance_order)

In [None]:
# Horizontal bar chart: one bar per feature, bar length is its Lasso coefficient
plt.figure(figsize=(8, 6))
plt.barh(y=feature_importances['Feature'], width=feature_importances['Coefficient'])
plt.savefig('feature_selection.png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Shuffled, seeded, stratified 5-fold splitter used for cross validation
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# List of (display name, unfitted estimator) pairs to compare
models = [
    (
        "Logistic Regression",
        LogisticRegression(max_iter=1000),
    ),
    (
        "Random Forest",
        RandomForestClassifier(random_state=42),
    ),
    (
        "XGBoost",
        xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    ),
]

def evaluate_model(y_true, y_pred, y_score=None):
    """Compute the standard binary-classification metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted class labels.
    y_score : array-like, optional
        Continuous scores for the positive class (e.g. ``predict_proba(X)[:, 1]``).
        When given, AUC is computed from these scores. When omitted, AUC falls
        back to the hard labels in ``y_pred`` (the previous behaviour), which
        evaluates only a single operating point of the ROC curve.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, auc)``.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    # Prefer ranking scores for ROC AUC; keep the label-based fallback for
    # existing two-argument callers.
    auc = roc_auc_score(y_true, y_pred if y_score is None else y_score)
    return accuracy, precision, recall, f1, auc

# Loop through each model: cross-validate, fit, then score on the held-out test set
for name, model in models:
    print(f"\nTraining and evaluating: {name}")

    # Cross-validate with SMOTE *inside* the pipeline so each fold oversamples only
    # its own training part. Running CV on the already-resampled data lets synthetic
    # neighbours of validation rows leak into training and inflates the F1 estimate.
    cv_pipeline = Pipeline([
        ("smote", SMOTE(random_state=42)),
        ("model", model),
    ])
    cross_val_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=cv, scoring='f1')
    print(f"{name} - Mean F1 Score from CV: {cross_val_scores.mean()}")

    # Fit the final model on the full resampled training set
    model.fit(X_resampled, y_resampled)

    # Hard labels for the threshold metrics, positive-class probabilities for AUC
    y_pred = model.predict(X_test)
    y_score = model.predict_proba(X_test)[:, 1]

    # Threshold-based metrics from the shared helper. ROC AUC is a ranking metric,
    # so it is recomputed from probabilities instead of hard predictions.
    accuracy, precision, recall, f1, _ = evaluate_model(y_test, y_pred)
    auc = roc_auc_score(y_test, y_score)

    # Print evaluation metrics
    print(f"Test Set Evaluation for {name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC: {auc:.4f}")

In [40]:
# Import necessary libraries
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
import numpy as np

In [41]:
# Candidate values sampled by the randomised search, keyed by XGBoost parameter name
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.3],
    reg_alpha=[0, 0.01, 0.1],
    reg_lambda=[1, 0.1, 0.01],
)


In [42]:
# Base XGBoost classifier whose hyperparameters are searched
xgb_clf = XGBClassifier(eval_metric='logloss', use_label_encoder=False)

# Randomised search: 10 sampled configurations, 5-fold CV, scored by F1,
# run on all available cores with a fixed seed
random_search = RandomizedSearchCV(
    xgb_clf,
    param_grid,
    n_iter=10,
    scoring='f1',
    n_jobs=-1,
    cv=5,
    verbose=1,
    random_state=42,
)

In [None]:
# Fit the RandomizedSearchCV on the resampled training data
random_search.fit(X_resampled, y_resampled)

# Get the best estimator found by the search
best_xgb = random_search.best_estimator_

# Hard labels for the threshold metrics, positive-class probabilities for AUC
y_pred = best_xgb.predict(X_test)
y_score = best_xgb.predict_proba(X_test)[:, 1]

# Reuse the shared helper for the threshold-based metrics. ROC AUC is a ranking
# metric, so it is computed from probabilities; this also matches how the ROC
# curves later in the notebook are computed.
accuracy, precision, recall, f1, _ = evaluate_model(y_test, y_pred)
auc = roc_auc_score(y_test, y_score)

# Print the results
print(f"Best Hyperparameters: {random_search.best_params_}")
print("Test Set Evaluation with Tuned XGBoost:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")


In [None]:
# Import necessary libraries
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# All three models are fitted on the same SMOTE-resampled training data, so the
# ROC curves are comparable with each other and with the metrics reported
# earlier in the notebook (which also come from models trained on resampled data).

# 1. Fit Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_resampled, y_resampled)

# 2. Fit Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_resampled, y_resampled)

# 3. Fit XGBoost with fixed hyperparameters.
# NOTE(review): these values are hard-coded and replace the estimator returned by
# the randomised search; confirm they still match random_search.best_params_.
best_xgb = xgb.XGBClassifier(
    use_label_encoder=False, eval_metric='logloss',
    subsample=0.8, reg_lambda=0.01, reg_alpha=0, n_estimators=300,
    max_depth=9, learning_rate=0.1, gamma=0, colsample_bytree=0.6
)
best_xgb.fit(X_resampled, y_resampled)


def plot_roc_curve(model, X_test, y_test, model_name):
    """Draw one model's ROC curve on the current matplotlib axes.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``
    X_test : features to score
    y_test : true labels for ``X_test``
    model_name : str
        Name shown in the legend together with the AUC.

    The function only adds a line to the active plot; it returns nothing and
    does not call ``plt.show()``.
    """
    # Second column of predict_proba is taken as the positive-class score
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Curve coordinates (the thresholds returned by roc_curve are not needed)
    fpr, tpr, _ = roc_curve(y_test, positive_scores)

    # Area under the curve, shown in the legend entry
    area = roc_auc_score(y_test, positive_scores)

    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {area:.4f})')

# One shared figure for the ROC curves of every model
plt.figure(figsize=(10, 8))

# (legend name, fitted model) pairs to draw
models = [
    ("Logistic Regression", log_reg),
    ("Random Forest", rf_model),
    ("XGBoost", best_xgb),
]

# Add each model's curve to the figure
for model_name, model in models:
    plot_roc_curve(model, X_test, y_test, model_name)

# Reference diagonal for random guessing
plt.plot(
    [0, 1],
    [0, 1],
    linestyle='--',
    color='navy',
    label='Random Guessing (AUC = 0.5000)',
)

# Axis labels, title and legend
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison for Logistic Regression, Random Forest, and XGBoost')
plt.legend(loc='lower right')
plt.show()
