In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


### The data frame used is already scaled and can be downloaded by running the code from the final juptyr notebook file.

In [2]:
df = pd.read_csv('Datasets/Base.csv')

# RANDOM FOREST MODEL

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Count the number of instances for each class
class_counts = df['fraud_bool'].value_counts()

# Determine the number of instances for the minority class (1s)
num_minority = class_counts.min()

# Separate majority and minority classes
df_majority = df[df['fraud_bool'] == 0]
df_minority = df[df['fraud_bool'] == 1]

# Randomly undersample the majority class
df_majority_undersampled = resample(df_majority, 
                                     replace=False,  # do not want to replace
                                     n_samples=num_minority,  # match minority class
                                     random_state=42)  # reproducible results

# Combine undersampled majority class with original minority class
df_balanced = pd.concat([df_majority_undersampled, df_minority])

# Shuffle the new DataFrame (optional)
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate features and target variable
X = df_balanced.drop('fraud_bool', axis=1)  # Features
y = df_balanced['fraud_bool']  # Target variable

# One-hot encode categorical variables
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model with balanced class weights
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80      3335
           1       0.80      0.79      0.79      3283

    accuracy                           0.80      6618
   macro avg       0.80      0.80      0.80      6618
weighted avg       0.80      0.80      0.80      6618



# RANDOM FOREST (SMOTE)

In [4]:

# Select features to use in anomaly detection
features = ['intended_balcon_amount', 'velocity_6h', 'velocity_24h', 'customer_age']

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[features])

# Select features and target
X = df[features]
y = df['fraud_bool']

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Train the model with class weight
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98    296961
           1       0.98      0.97      0.98    296422

    accuracy                           0.98    593383
   macro avg       0.98      0.98      0.98    593383
weighted avg       0.98      0.98      0.98    593383



# RANDOM FOREST 

In [7]:

# Step 2: Define Features and Target
X = df.drop(columns=["fraud_bool"])  # Features
y = df["fraud_bool"]  # Target

# Step 3: Check Unique Values in Target
print("Unique values in y:", y.unique())

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Step 5: One-Hot Encoding of Categorical Variables
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
X_train = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)

# Ensure both training and testing sets have the same columns after encoding
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Step 6: Scaling the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame if needed
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Step 7: Fit the Random Forest Model
rf_model = RandomForestClassifier(n_estimators=500, class_weight='balanced', random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Step 8: Evaluate the Model
y_pred = rf_model.predict(X_test_scaled)

# Print accuracy and classification report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Step 9: Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


Unique values in y: [0 1]
Accuracy: 0.98926
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    247320
           1       0.31      0.00      0.00      2680

    accuracy                           0.99    250000
   macro avg       0.65      0.50      0.50    250000
weighted avg       0.98      0.99      0.98    250000

Confusion Matrix:
 [[247311      9]
 [  2676      4]]


# LINEAR REGRESSION

In [8]:

# Step 2: Define Features and Target
X = df.drop(columns=["fraud_bool"])  # Features
y = df["fraud_bool"]  # Target

# Step 3: Check Unique Values in Target
print("Unique values in y:", y.unique())

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Step 5: One-Hot Encoding of Categorical Variables
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
X_train = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)

# Ensure both training and testing sets have the same columns after encoding
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Step 6: Scaling the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 7: Fit the Linear Regression Model
lin_model = LinearRegression()
lin_model.fit(X_train_scaled, y_train)

# Step 8: Predict
y_pred_lin = lin_model.predict(X_test_scaled)

# Note: Predictions will be continuous; you might need to threshold them for binary classification
threshold = 0.5
y_pred_binary = (y_pred_lin > threshold).astype(int)

# Evaluate
print("Accuracy (Linear Regression):", accuracy_score(y_test, y_pred_binary))
print("Classification Report (Linear Regression):\n", classification_report(y_test, y_pred_binary))

Unique values in y: [0 1]
Accuracy (Linear Regression): 0.98928
Classification Report (Linear Regression):
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    247320
           1       0.00      0.00      0.00      2680

    accuracy                           0.99    250000
   macro avg       0.49      0.50      0.50    250000
weighted avg       0.98      0.99      0.98    250000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# GRADIENT BOOSTING

In [9]:

# Step 2: Define Features and Target
X = df.drop(columns=["fraud_bool"])  # Adjust column name as needed
y = df["fraud_bool"]

# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Step 4: One-Hot Encoding of Categorical Variables
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
X_train = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)

# Ensure both training and testing sets have the same columns after encoding
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Step 5: Scaling the Data (optional)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 6: Create and Fit the Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(learning_rate=0.05, n_estimators=100, random_state=42)
gb_clf.fit(X_train_scaled, y_train)

# Step 7: Make Predictions
y_pred_train = gb_clf.predict(X_train_scaled)
y_pred_test = gb_clf.predict(X_test_scaled)

# Step 8: Evaluate the Model
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print('Train Accuracy (Gradient Boosting): {:.2f}'.format(train_accuracy))
print('Test Accuracy (Gradient Boosting): {:.2f}'.format(test_accuracy))
print("Classification Report (Test):\n", classification_report(y_test, y_pred_test))

Train Accuracy (Gradient Boosting): 0.99
Test Accuracy (Gradient Boosting): 0.99
Classification Report (Test):
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    247243
           1       0.62      0.02      0.04      2757

    accuracy                           0.99    250000
   macro avg       0.80      0.51      0.52    250000
weighted avg       0.99      0.99      0.98    250000



# GRADIENT BOOSTING (SMOTE)

In [10]:
X = df.drop(columns=["fraud_bool"])  # Adjust column name as needed
y = df["fraud_bool"]

# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Step 4: One-Hot Encoding of Categorical Variables
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
X_train = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)

# Ensure both training and testing sets have the same columns after encoding
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Step 5: Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Step 6: Scaling the Data (optional)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Step 7: Create and Fit the Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(learning_rate=0.05, n_estimators=100, random_state=42)
gb_clf.fit(X_train_scaled, y_train_resampled)

# Step 8: Make Predictions
y_pred_train = gb_clf.predict(X_train_scaled)
y_pred_test = gb_clf.predict(X_test_scaled)

# Step 9: Evaluate the Model
train_accuracy = accuracy_score(y_train_resampled, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print('Train Accuracy (Gradient Boosting): {:.2f}'.format(train_accuracy))
print('Test Accuracy (Gradient Boosting): {:.2f}'.format(test_accuracy))
print("Classification Report (Test):\n", classification_report(y_test, y_pred_test))

Train Accuracy (Gradient Boosting): 0.95
Test Accuracy (Gradient Boosting): 0.95
Classification Report (Test):
               precision    recall  f1-score   support

           0       0.99      0.95      0.97    247243
           1       0.08      0.36      0.13      2757

    accuracy                           0.95    250000
   macro avg       0.54      0.66      0.55    250000
weighted avg       0.98      0.95      0.96    250000

