In [None]:
# Smoke test: confirms the kernel is running and can execute a cell.
print('setup working!')

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import shapiro, kstest, chi2_contingency
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation warnings raised by later cells.
warnings.filterwarnings('ignore')
print("setup done!")

In [None]:
# Load the raw dataset; `df` stays untouched and all cleaning is done on the copy `df_clean`.
df = pd.read_csv('Fraud_Detection_Dataset.csv')
df_clean = df.copy()
# Preview the first 10 rows.
df_clean.head(10)

In [None]:
# Column dtypes and non-null counts (info() prints directly), followed by the column index.
df_clean.info()
df_clean.columns

In [None]:
# Summary statistics and shape.
# A cell only renders its LAST expression, so describe() used to be computed and
# thrown away; display() shows both results explicitly.
display(df_clean.describe(), df_clean.shape)

In [None]:
# Number of distinct values per column (helps separate identifiers, categoricals and continuous fields).
df_clean.nunique()

Data Cleaning

In [None]:
# Drop rows whose 'Fraudulent' label is missing so every remaining row has a valid
# target for model training.
# dropna() returns a NEW frame; the result must be assigned back. The previous call
# (inplace=False, result not stored) left df_clean unchanged.
df_clean = df_clean.dropna(subset=['Fraudulent'])


Drop Transaction_ID: it is only an identifier and carries no information about whether a transaction is fraudulent.

In [None]:
# Drop the Transaction_ID identifier column.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df_clean = df_clean.drop(columns=['Transaction_ID'], errors='ignore')

In [None]:
# Column groups reused by the imputation, EDA and assumption-check cells.
num_cols = [
    'Transaction_Amount',
    'Time_of_Transaction',
    'Previous_Fraudulent_Transactions',
    'Account_Age',
    'Number_of_Transactions_Last_24H',
]
cat_cols = [
    'Transaction_Type',
    'Device_Used',
    'Location',
    'Payment_Method',
]

for group_label, group_cols in (("Numerical", num_cols), ("Categorical", cat_cols)):
    print(f"{group_label} Columns: {group_cols}")

In [None]:
# Missing-value report: null count and share of rows per column, most affected first.
missing_values = df_clean.isna().sum().sort_values(ascending=False)
percentage_missing_values = (
    missing_values.div(len(df_clean)).mul(100).sort_values(ascending=False)
)
pd.concat(
    [missing_values, percentage_missing_values],
    axis=1,
    keys=['Total', 'Percentage'],
).T

In [None]:
# Skewness of the numerical columns; used to choose between mean and median imputation.
df_clean[num_cols].skew()

Assuming the data are missing completely at random (MCAR), simple imputation is appropriate: the mode, mean or median can be used for Fraudulent and Transaction_Amount as appropriate (the median is preferred when a numerical column is skewed), and the mode can be used for the categorical columns.



In [None]:
# Median imputation for the numerical columns that contain missing values.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on a column selection is chained assignment, which
# pandas 2.x deprecates and which leaves the parent frame unchanged under Copy-on-Write.
for col in ['Transaction_Amount', 'Time_of_Transaction']:
    df_clean[col] = df_clean[col].fillna(df_clean[col].median())

In [None]:
# Mode imputation for the categorical columns.
# Assign the result back rather than using chained fillna(inplace=True), which is
# deprecated in pandas 2.x and does not modify df_clean under Copy-on-Write.
for cat_col in cat_cols:
    df_clean[cat_col] = df_clean[cat_col].fillna(df_clean[cat_col].mode()[0])


In [None]:
# Re-run the missing-value report to confirm the imputation.
missing_values = df_clean.isna().sum().sort_values(ascending=False)
percentage_missing_values = (
    missing_values.div(len(df_clean)).mul(100).sort_values(ascending=False)
)
pd.concat(
    [missing_values, percentage_missing_values],
    axis=1,
    keys=['Total', 'Percentage'],
).T


In [None]:
# Count fully duplicated rows in the cleaned frame.
# The count must come from df_clean, not the raw df: the next cell removes
# duplicates from df_clean, and the raw frame holds different columns and values.
duplicates = df_clean.duplicated().sum()
print('Total Duplicates:', duplicates)

In [None]:
# Remove fully duplicated rows in place, keeping the first occurrence (pandas default),
# then verify that none remain.
df_clean.drop_duplicates(inplace=True)
print("Remaining duplicates:", df_clean.duplicated().sum())


In [None]:
# Shape after cleaning and the share of fraudulent rows.
# The mean of the label equals the fraud rate — assumes 'Fraudulent' is coded 0/1; TODO confirm.
print(df_clean.shape)
print(f'Fraudulent Rate: {df_clean["Fraudulent"].mean():.2%}')  

In [None]:
# Store the low-cardinality text columns with the pandas 'category' dtype.
for cat_col in ('Transaction_Type', 'Payment_Method', 'Device_Used', 'Location'):
    df_clean[cat_col] = df_clean[cat_col].astype('category')

In [None]:
# Inspect the cleaned frame.
# Only the last expression of a cell is rendered, so head() used to be discarded;
# display() shows both the preview and the shape.
display(df_clean.head(), df_clean.shape)

In [None]:
# Confirm the dtype changes made above.
print(df_clean.dtypes)

Exploratory Data Analysis

In [None]:
# Transactions per user: value_counts() returns one row per User_ID, most active first.
transactions_per_user = df_clean['User_ID'].value_counts()
print("Number of transactions per user:")
display(transactions_per_user)

In [None]:
# value_counts() is sorted in descending order, so head(5) gives the five most active users.
print("Top 5 users with the most transactions:")
display(transactions_per_user.head(5))  

In [None]:
# Quick preview before plotting.
df_clean.head()

In [None]:
# Visualize class imbalance
plt.figure(figsize=(8, 6))
sns.countplot(x='Fraudulent', data=df_clean, palette='Set2')
plt.title('Class Imbalance Visualization')
# countplot draws numeric categories in ascending label order, while value_counts()
# orders by frequency. sort_index() puts the annotations in bar order, so each
# count lands on its own bar whichever class is larger.
for i, count in enumerate(df_clean['Fraudulent'].value_counts().sort_index()):
    plt.text(i, count + 1, str(count), ha='center', va='bottom')

plt.xlabel('Fraudulent')
plt.ylabel('Count')
plt.show()


In [None]:
# Numerical features plus the target: the input for the correlation matrix in the next cell.
num_df = df_clean[num_cols + ['Fraudulent']]

In [None]:
# Pairwise correlation (DataFrame.corr default, i.e. Pearson) between the numerical
# features and the target, drawn as an annotated heatmap.
correlation_matrix = num_df.corr()
plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Distribution of Transaction_Amount per class (box plot), to see whether fraud
# cases differ in amount.
plt.figure(figsize=(10, 6))
sns.boxplot(x='Fraudulent', y='Transaction_Amount', data=df_clean)
plt.title('Relationship between Fraudulent and Transaction Amount')
plt.xlabel('Fraudulent')
plt.ylabel('Transaction Amount')
plt.show()

In [None]:
# Distribution of Account_Age per class (violin plot).
plt.figure(figsize=(8, 4))
sns.violinplot(x='Fraudulent', y='Account_Age', data=df_clean, palette='Set1')

plt.title('Relationship between Fraudulent and Account Age')
plt.xlabel('Fraudulent')
plt.ylabel('Account Age')
# Render explicitly, like the other plotting cells; otherwise the cell echoes the
# repr of the last Text object above the figure.
plt.show()

In [None]:
# Transaction counts per Time_of_Transaction value, split by class.
# NOTE(review): a count plot draws one bar group per distinct time value — presumably
# the column holds a small set of discrete values (e.g. hour of day); confirm.
plt.figure(figsize=(10, 6))
sns.countplot(x='Time_of_Transaction', hue='Fraudulent', data=df_clean, palette='Set1')
plt.title('Relationship between Time of Transaction and Fraudulent')
plt.xlabel('Time of Transaction')
plt.ylabel('Count')
plt.show() 

In [None]:
# Fraud rate (mean of the 0/1 label) per level of each categorical feature.
# This is a descriptive bar chart, not a chi-square test.
for col in cat_cols:
    plt.figure(figsize=(6,4))
    fraud_rate = df_clean.groupby(col)['Fraudulent'].mean().sort_values(ascending=False)
    # The columns are 'category' dtype, so seaborn would draw the bars in category
    # order and ignore the sort above; passing `order` keeps highest-rate-first.
    sns.barplot(x=fraud_rate.index, y=fraud_rate.values,
                order=list(fraud_rate.index), palette='coolwarm')
    plt.title(f'Fraud Rate by {col}')
    plt.ylabel('Proportion of Fraudulent Transactions')
    plt.xlabel(col)
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Persist the cleaned frame. CSV does not store dtypes, so the 'category' columns
# come back as plain strings when the file is reloaded.
df_clean.to_csv('df_clean.csv', index=False)

In [None]:
# Reload the saved file into a separate frame to verify the round trip.
df_cleaned = pd.read_csv('df_clean.csv')
df_cleaned.head()

In [None]:
# Dtypes / non-null counts (printed by info()) and shape of the reloaded frame.
df_cleaned.info()
df_cleaned.shape

In [None]:
# Work on a copy so df_cleaned keeps the original string labels.
df_encoded = df_cleaned.copy()

# Every object/category column will be label-encoded in the next cell.
# NOTE(review): this selects ALL text columns; if User_ID is stored as text it is
# encoded too — confirm that is intended.
categorical_cols = df_encoded.select_dtypes(include=['object', 'category']).columns

# Shared encoder instance used by the next cell.
le = LabelEncoder()



In [None]:
# Encode categorical columns IN PLACE (replacing the original values).
# One encoder is fitted and kept per column: refitting a single shared encoder
# would overwrite its classes_ on every iteration, leaving no way to translate
# the integer codes back to the original labels.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df_encoded[col] = encoder.fit_transform(df_encoded[col].astype(str))
    label_encoders[col] = encoder  # encoder.classes_[code] gives the original label
    print(f"Encoded column: {col}")

# confirm all columns are numeric
print("\n Data types after encoding:")
print(df_encoded.dtypes)

# verify number of columns
print(f"\n Final number of columns: {df_encoded.shape[1]}")

In [None]:
# Preview the encoded frame: the former text columns now hold integer codes.
df_encoded.head()

In [None]:
# Persist the encoded frame; the assumption checks and modelling sections reload it.
df_encoded.to_csv('df_encoded.csv', index=False)

Assumptions 

In [None]:
# Reload the encoded dataset so this section can run from the saved file.
df_encoded = pd.read_csv('df_encoded.csv')
# NOTE(review): read_csv already returns a fresh frame, so this copy looks redundant.
df_encoded = df_encoded.copy()
df_encoded.head()

In [None]:
# Feature matrix / target used by the assumption checks below.
# Only the target is dropped, so X still contains User_ID.
X = df_encoded.drop('Fraudulent', axis=1)
y = df_encoded['Fraudulent']


In [None]:
# Class Balance Check
# Compares the number of fraud (label 1) and non-fraud (label 0) rows and prints
# mitigation advice when fraud is under 10% of the non-fraud count.
print("\n" + "=" * 100)
print("ASSUMPTION 1: CLASS BALANCE")
print("=" * 100)

fraud_counts = df_encoded['Fraudulent'].value_counts()
# Label-based lookup: raises KeyError if either class is absent from the data.
fraud_ratio = fraud_counts[1] / fraud_counts[0]

print(f"\nClass Distribution:")
print(f"  Non-Fraud: {fraud_counts[0]:,} ({fraud_counts[0]/len(df_encoded)*100:.2f}%)")
print(f"  Fraud:     {fraud_counts[1]:,} ({fraud_counts[1]/len(df_encoded)*100:.2f}%)")
print(f"  Imbalance Ratio: 1:{1/fraud_ratio:.1f}")

# The threshold is on the fraud : non-fraud ratio, not on the fraud share of all rows.
if fraud_ratio < 0.1:
    print("\n WARNING: Severe class imbalance detected!")
    print("   Recommendation:")
    print("   - Use class_weight='balanced' in models")
    print("   - Consider SMOTE or undersampling")
    print("   - Use stratified cross-validation")
    print("   - Focus on Precision-Recall metrics over Accuracy")
else:
    print("\nClasses are reasonably balanced")


In [None]:
# ASSUMPTION 2: Feature Correlations & Multicollinearity
# ============================================================================
# Compute VIF
# variance_inflation_factor() regresses each column on all the others and does NOT
# add an intercept itself. Without a constant column the auxiliary regressions are
# forced through the origin, which inflates the VIF of any feature with a non-zero
# mean. A constant is therefore appended to the design matrix and excluded from
# the report.
X_vif = X.astype(float).assign(const=1.0)
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
# `const` is the last column, so indices 0..X.shape[1]-1 are the original features.
vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i) for i in range(X.shape[1])]

# Sort results for clarity
vif_data = vif_data.sort_values('VIF', ascending=False)
print("\n VIF Results:")
print(vif_data)


Features are not strongly correlated.

No features need to be removed for linear models.

Tree-based models like Random Forest can safely use all features.

In [None]:
# ASSUMPTION 3: OUTLIERS DETECTION
# ============================================================================
# Tukey's rule: a value is an outlier when it lies more than 1.5 * IQR outside
# the [Q1, Q3] range. Applied to every column of df_encoded.
print("\n" + "=" * 100)
print("ASSUMPTION 3: OUTLIERS DETECTION")
print("=" * 100)

outlier_summary = []
for col in df_encoded.columns:
    column_values = df_encoded[col]
    Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outside_fence = column_values.lt(lower_bound) | column_values.gt(upper_bound)
    outliers = df_encoded[outside_fence]
    outlier_pct = len(outliers) / len(df_encoded) * 100

    outlier_summary.append(dict(
        Feature=col,
        Outliers=len(outliers),
        Percentage=outlier_pct,
        Lower_Bound=lower_bound,
        Upper_Bound=upper_bound,
    ))

outlier_df = pd.DataFrame(outlier_summary)
print("\nOutlier Summary (IQR method):")
print(outlier_df.to_string(index=False))

# Columns where more than 5% of the rows fall outside the fences.
severe_outliers = outlier_df[outlier_df['Percentage'] > 5]
if len(severe_outliers) == 0:
    print("\n No severe outlier issues detected")
else:
    for message in (
        "\n  WARNING: Significant outliers detected!",
        "   Impact on models:",
        "   - Logistic Regression: Moderate (can affect coefficients)",
        "   - KNN: High (distance-based, sensitive to outliers)",
        "   - Tree-based: Low (splits handle outliers naturally)",
        "\n   Recommendations:",
        "   - Consider robust scaling for KNN and Logistic Regression",
        "   - Investigate if outliers are fraud cases (could be legitimate signal)",
    ):
        print(message)


Transaction Amount has a few outliers (~1%).

Could be very small or very large transactions.

Might be worth visualizing using boxplots.

Fraudulent class shows up as an “outlier” in counts because fraud is rare (~5%).

This is not a data error; it’s just reflecting class imbalance.

All other features don’t show severe outliers, so your data is generally clean and ready for modeling.

In [None]:

# ASSUMPTION 4: FEATURE DISTRIBUTIONS
# Runs a Shapiro-Wilk test on each numerical column and reports which ones are
# compatible with a normal distribution at the 5% level.

print("\n" + "=" * 100)
print("ASSUMPTION 4: FEATURE DISTRIBUTIONS (Normality)")
print("=" * 100)

print("\nShapiro-Wilk Test for Normality (sample of 5000):")
print("(p-value > 0.05 suggests normal distribution)")

normality_results = []
for col in num_cols:
    if col in df_encoded.columns:
        # Test a reproducible random sample of at most 5000 rows.
        sample = df_encoded[col].sample(min(5000, len(df_encoded)), random_state=42)
        statistic, p_value = shapiro(sample)
        is_normal = "Yes" if p_value > 0.05 else "No"
        normality_results.append({
            'Feature': col,
            'Statistic': statistic,
            'P-Value': p_value,
            'Normal': is_normal
        })

normality_df = pd.DataFrame(normality_results)
print(normality_df.to_string(index=False))

non_normal = normality_df[normality_df['Normal'] == 'No']
# This branch fires as soon as ONE feature fails the test, although the message says "most".
if len(non_normal) > 0:
    print("\n Most features are not normally distributed")
    print("   Impact on models:")
    print("   - Logistic Regression: Low (assumes linearity, not normality of features)")
    print("   - KNN: Low (distance-based, doesn't require normality)")
    print("   - Tree-based: None (no normality assumption)")
    print("\n   Note: Non-normality of features is common and often acceptable")
else:
    print("\n Features are approximately normally distributed")



Observation: Most features are not normally distributed, which is very common in financial and transactional data.

In [None]:

# ASSUMPTION 5: LINEAR RELATIONSHIP (for Logistic Regression)
# Rough screen only: the Pearson correlation between each continuous feature and
# the binary target is used as a proxy for a linear relationship with the
# log-odds. No Box-Tidwell test is run (that would require fitting a logistic
# model with x*log(x) interaction terms), so the output no longer claims one,
# and the unused log-transformed feature has been removed.

print("\n" + "=" * 100)
print("ASSUMPTION 5: LINEAR RELATIONSHIP WITH LOG-ODDS (Logistic Regression)")
print("=" * 100)

print("\nChecking linearity between continuous features and log-odds...")
print("(Proxy: Pearson correlation with the target; this is not a Box-Tidwell test)")

# For each continuous feature, record its correlation with the target
linear_check = []
for col in num_cols:
    if col in df_encoded.columns and col != 'Fraudulent':
        correlation = df_encoded[col].corr(df_encoded['Fraudulent'])

        linear_check.append({
            'Feature': col,
            'Correlation_with_Target': correlation,
            # |r| > 0.1 is an arbitrary screening threshold, not a statistical test.
            'Likely_Linear': 'Yes' if abs(correlation) > 0.1 else 'Weak'
        })

linear_df = pd.DataFrame(linear_check)
print(linear_df.to_string(index=False))

weak_linear = linear_df[linear_df['Likely_Linear'] == 'Weak']
if len(weak_linear) > 0:
    print(f"\n {len(weak_linear)} features show weak linear relationship with target")
    print("   This may affect Logistic Regression performance")
    print("   Consider polynomial features or non-linear models")
else:
    print("\n Features show reasonable linear relationships")


All continuous features show only a very weak linear (Pearson) correlation with the fraud label, which is used here as a rough proxy for linearity in the log-odds.

Weak linearity suggests logistic regression may not fully capture the relationship.

This is common in fraud detection, as fraudulent patterns are often non-linear and sparse.

In [None]:
# ASSUMPTION 6: INDEPENDENCE OF OBSERVATIONS
# Looks for exact duplicate rows and for users with many transactions, since
# repeated rows from the same user are unlikely to be independent.

print("\n" + "=" * 100)
print("ASSUMPTION 6: INDEPENDENCE OF OBSERVATIONS")
print("=" * 100)

print("\nChecking for duplicate transactions...")
duplicates = df_encoded.duplicated().sum()
print(f"Duplicate rows: {duplicates}")

print("\n Checking User_ID transaction patterns...")
user_txn_counts = df_encoded['User_ID'].value_counts()
print(f"Unique users: {len(user_txn_counts)}")
print(f"Avg transactions per user: {user_txn_counts.mean():.2f}")
print(f"Max transactions per user: {user_txn_counts.max()}")

# More than 10 transactions is the cut-off used here to flag a "repeat" user.
repeat_users = user_txn_counts[user_txn_counts > 10]
if len(repeat_users) > 0:
    print(f"\n  {len(repeat_users)} users have >10 transactions")
    print("   This violates independence assumption")
    print("   Impact on models:")
    print("   - Standard errors may be underestimated")
    print("   - Consider user-level aggregation or random effects models")
    # NOTE(review): keeping each user entirely in train or test needs a GROUPED split
    # (e.g. GroupKFold / GroupShuffleSplit on User_ID), not a stratified one; the
    # wording of the message below may be worth revisiting.
    print("   - Use stratified sampling by User_ID for train/test split")
else:
    print("\n Observations appear independent")



On average, each user has ~12 transactions.

Some users have as many as 26 transactions.

2826 users have more than 10 transactions → this is important because:

Logistic regression and other models assume observations are independent.

When a user has multiple transactions, their transactions may not be independent:

Fraud patterns could repeat per user.

The model might “see” the same user multiple times, inflating feature importance.

Non-independence can cause overfitting, especially if one user dominates the fraud class.

Tree-based models are less sensitive to this, but it’s still something to be aware of.




In [None]:
# ASSUMPTION 7: NO DATA LEAKAGE
# ============================================================================
# Flags 'Previous_Fraudulent_Transactions' as a leakage candidate and checks how
# strongly it correlates with the target.
print("\n" + "=" * 100)
print("ASSUMPTION 7: DATA LEAKAGE CHECK")
print("=" * 100)

print("\n Checking for potential data leakage...")

leakage_features = ['Previous_Fraudulent_Transactions']
print(f"\n  CRITICAL: '{leakage_features[0]}' may indicate data leakage!")
print("   This feature should be calculated from PAST transactions only")
print("   If it includes the current transaction, it's leakage")

# Check how strongly previous fraud is correlated with the target
leakage_corr = df_encoded['Previous_Fraudulent_Transactions'].corr(df_encoded['Fraudulent'])
print(f"\n   Correlation with target: {leakage_corr:.4f}")

# Compare the MAGNITUDE: a strong negative correlation is just as suspicious as a
# strong positive one and would have passed the previous signed comparison.
if abs(leakage_corr) > 0.3:
    print("     WARNING: Strong correlation suggests possible leakage")
    print("   Verify this feature is computed from historical data only")
else:
    print("    Correlation seems reasonable")

# Check temporal ordering if Time_of_Transaction exists
print("\nTemporal ordering check...")
if 'Time_of_Transaction' in df_encoded.columns:
    print("    Time feature available for temporal validation")
    print("   Ensure train/test split respects temporal order for production")
else:
    print("     No explicit time ordering - ensure no future data in features")



We're checking if any feature gives the model access to information it shouldn’t have when making predictions. The feature Previous_Fraudulent_Transactions might leak data if it includes the current transaction, because that would give the model a peek at the answer.

We also look at the time feature to make sure the train/test split respects chronological order, so the model only learns from past transactions. Overall, the correlation with the target is very low (0.0008), so it seems safe, but we need to confirm the feature only uses past information.

In [None]:
# Sample size adequacy
# Applies the "events per variable" rule of thumb: at least 10 minority-class
# (fraud) rows per predictor.
print("\n" + "=" * 100)
print("ASSUMPTION 8: SAMPLE SIZE ADEQUACY")
print("=" * 100)

n_samples = len(df_encoded)
# Every column except the target counts as a predictor here (User_ID included).
n_features = len(df_encoded.drop('Fraudulent', axis=1).columns)
# Summing the label counts the fraud rows — assumes 'Fraudulent' is coded 0/1; TODO confirm.
n_fraud = df_encoded['Fraudulent'].sum()
n_non_fraud = len(df_encoded) - n_fraud

print(f"\nSample size: {n_samples:,}")
print(f"Number of features: {n_features}")
print(f"Fraud cases: {n_fraud:,}")
print(f"Non-fraud cases: {n_non_fraud:,}")

# Rule of thumb: 10-20 events per predictor for logistic regression
min_events_needed = n_features * 10
print(f"\nMinimum fraud cases needed (10 per feature): {min_events_needed}")

if n_fraud >= min_events_needed:
    print(f" Adequate fraud samples: {n_fraud} >= {min_events_needed}")
else:
    print(f"  WARNING: Insufficient fraud samples: {n_fraud} < {min_events_needed}")
    print("   This may affect model reliability, especially Logistic Regression")

The sample size is adequate for model training: both the overall dataset and the minority (fraud) class contain enough observations. The main issue remains class imbalance, not a lack of data.

Machine Learning 

In [None]:
# Reload the encoded dataset so the modelling section starts from the saved file.
df_encoded = pd.read_csv('df_encoded.csv')
# NOTE(review): read_csv already returns a fresh frame, so this copy looks redundant.
df_encoded = df_encoded.copy()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')
import joblib
import os

In [None]:
#initial models 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# Baseline feature matrix: the encoded frame without the target and without the
# User_ID identifier.
X_baseline = df_encoded.drop(['Fraudulent', 'User_ID'], axis=1)
y_baseline = df_encoded['Fraudulent']
#baseline models
# 80/20 hold-out split, stratified so both parts keep the same fraud share.
X_train, X_test, y_train, y_test = train_test_split(
    X_baseline, y_baseline, test_size=0.2, random_state=42, stratify=y_baseline
)
def train_raw_models(X_train, y_train, X_test, y_test):
    """Fit the untuned baseline classifiers and report their hold-out performance.

    Each model is trained on (X_train, y_train) and scored on (X_test, y_test).
    The ROC AUC, classification report and confusion matrix are printed per model.

    Returns:
        dict mapping model name -> {"auc": ROC AUC on the test split}.
    """
    baseline_estimators = {
        "Logistic_Regression": LogisticRegression(max_iter=500),
        "Random_Forest": RandomForestClassifier(n_estimators=200, random_state=42),
        "Gradient_Boosting": GradientBoostingClassifier(n_estimators=200, random_state=42),
        "KNN": KNeighborsClassifier(n_neighbors=5),
        "XGBoost": XGBClassifier(
            n_estimators=300, max_depth=5, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8,
            n_jobs=-1, eval_metric="logloss", random_state=42,
        ),
        "LightGBM": LGBMClassifier(
            n_estimators=300, learning_rate=0.05,
            num_leaves=63, random_state=42,
        ),
    }
    class_names = ['Non-Fraud', 'Fraud']

    results = {}
    for label, estimator in baseline_estimators.items():
        print(f"\nTraining {label}...")
        estimator.fit(X_train, y_train)

        # Probability of the positive (fraud) class drives the AUC; the hard
        # predictions feed the report and the confusion matrix.
        fraud_scores = estimator.predict_proba(X_test)[:, 1]
        auc_value = roc_auc_score(y_test, fraud_scores)
        hard_preds = estimator.predict(X_test)

        print(f"AUC={auc_value:.4f}")
        print(classification_report(y_test, hard_preds, target_names=class_names))
        print("Confusion Matrix:\n", confusion_matrix(y_test, hard_preds))

        results[label] = {"auc": auc_value}
    return results

# Baseline run on the raw encoded features (before feature engineering);
# `results` maps each model name to its test AUC.
results = train_raw_models(X_train, y_train, X_test, y_test)


In [None]:
#Feature engineering

def create_advanced_features(df_encoded, reference_df=None, category_labels=None):
    """Create engineered features for the fraud detection models.

    Parameters
    ----------
    df_encoded : pandas.DataFrame
        Transactions to featurize. Must contain User_ID, Transaction_Amount,
        Time_of_Transaction, Account_Age, Number_of_Transactions_Last_24H,
        Previous_Fraudulent_Transactions, Device_Used and Payment_Method.
        'Fraudulent' is required only when ``reference_df`` is None.
    reference_df : pandas.DataFrame, optional
        Frame used to compute the per-user aggregates (it must contain
        'Fraudulent'). Pass the TRAINING split when featurizing validation/test
        data so that no test label enters the features. When None (default) the
        aggregates come from ``df_encoded`` itself and the label-based ones are
        computed leave-one-out.
    category_labels : dict, optional
        Maps a column name ('Device_Used', 'Payment_Method') to the class names
        in encoded order (e.g. ``LabelEncoder.classes_``). Needed for the risk
        scores when those columns hold integer codes: the risk tables are keyed
        by label name, so without it integer codes match nothing and score 0.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_encoded`` with the engineered columns appended; inf and NaN
        values are replaced by 0.

    Notes
    -----
    Previously user_fraud_count / user_fraud_rate included each row's OWN label,
    which leaks the target into the features. In the default mode a row's own
    label is now excluded (leave-one-out). Labels of other rows of the same user
    are still used, so featurize the train and test splits separately (with
    ``reference_df``) for an unbiased evaluation.
    """
    print("Creating advanced features for model training...")
    df_advanced = df_encoded.copy()

    leave_one_out = reference_df is None
    stats_source = df_advanced if leave_one_out else reference_df

    # User behavior analytics (std is NaN for single-transaction users -> 0)
    user_stats = stats_source.groupby('User_ID').agg({
        'Transaction_Amount': ['mean', 'std', 'min', 'max', 'count'],
        'Time_of_Transaction': ['mean', 'std'],
        'Fraudulent': ['sum', 'mean']
    }).fillna(0)

    user_stats.columns = [
        'user_avg_amount', 'user_std_amount', 'user_min_amount', 'user_max_amount', 'user_total_txns',
        'user_avg_time', 'user_std_time', 'user_fraud_count', 'user_fraud_rate_raw'
    ]
    df_advanced = df_advanced.merge(user_stats, on='User_ID', how='left')

    # Users absent from the reference frame have no history: treat it as empty.
    stat_cols = list(user_stats.columns)
    df_advanced[stat_cols] = df_advanced[stat_cols].fillna(0)

    if leave_one_out:
        # Remove the row's own label from its user's fraud statistics.
        other_txns = (df_advanced['user_total_txns'] - 1).clip(lower=0)
        df_advanced['user_fraud_count'] = df_advanced['user_fraud_count'] - df_advanced['Fraudulent']
        df_advanced['user_fraud_rate_raw'] = (
            df_advanced['user_fraud_count'] / other_txns.where(other_txns > 0)
        ).fillna(0)
        labelled_txns = other_txns
    else:
        labelled_txns = df_advanced['user_total_txns']

    # Fraud rate smoothing (Laplace): a user with no history gets 0.5.
    df_advanced['user_fraud_rate'] = (
        (df_advanced['user_fraud_count'] + 1) /
        (labelled_txns + 2)
    )

    # Transaction patterns (+1 in the denominators avoids division by zero)
    df_advanced['amount_to_avg_ratio'] = df_advanced['Transaction_Amount'] / (df_advanced['user_avg_amount'] + 1)
    df_advanced['amount_std_score'] = (df_advanced['Transaction_Amount'] - df_advanced['user_avg_amount']) / (df_advanced['user_std_amount'] + 1)

    # Clip extreme values
    df_advanced['amount_std_score'] = np.clip(df_advanced['amount_std_score'], -5, 5)

    # Time-based features
    # NOTE(review): is_weekend assumes Time_of_Transaction counts hours from a
    # reference point; if it is already an hour of day (0-23), `// 24` is always 0
    # and is_weekend is constant. Confirm the column's meaning.
    df_advanced['hour_of_day'] = df_advanced['Time_of_Transaction'] % 24
    df_advanced['is_night'] = ((df_advanced['hour_of_day'] >= 22) | (df_advanced['hour_of_day'] <= 6)).astype(int)
    df_advanced['is_weekend'] = ((df_advanced['Time_of_Transaction'] // 24) % 7 >= 5).astype(int)
    df_advanced['time_sin'] = np.sin(2 * np.pi * df_advanced['hour_of_day'] / 24)
    df_advanced['time_cos'] = np.cos(2 * np.pi * df_advanced['hour_of_day'] / 24)

    # Behavioral features
    df_advanced['txn_frequency'] = df_advanced['user_total_txns'] / (df_advanced['Account_Age'] + 1)
    df_advanced['recent_activity_ratio'] = df_advanced['Number_of_Transactions_Last_24H'] / (df_advanced['user_total_txns'] + 1)

    # Composite risk feature (clipped)
    df_advanced['composite_risk_1'] = (
        df_advanced['Previous_Fraudulent_Transactions'] *
        df_advanced['amount_std_score'] *
        df_advanced['is_night']
    )
    df_advanced['composite_risk_1'] = np.clip(df_advanced['composite_risk_1'], -5, 5)

    def _risk_score(column, risk_by_label):
        """Map a categorical column to its risk score; unknown values score 0."""
        values = df_advanced[column]
        classes = (category_labels or {}).get(column)
        if classes is not None:
            # Decode integer codes back to label names before the lookup.
            values = values.map(dict(enumerate(classes)))
        return values.map(risk_by_label).fillna(0)

    # Risk encoding for categorical variables
    device_risk = {'Unknown Device': 3, 'Mobile': 2, 'Tablet': 1, 'Desktop': 0}
    df_advanced['device_risk_score'] = _risk_score('Device_Used', device_risk)

    payment_risk = {'Invalid Method': 3, 'Credit Card': 2, 'Debit Card': 1, 'UPI': 0, 'Net Banking': 0}
    df_advanced['payment_risk_score'] = _risk_score('Payment_Method', payment_risk)

    # Tiered fraud risk score: number of risk flags raised (0-4)
    df_advanced['fraud_risk_tier'] = (
        (df_advanced['user_fraud_rate'] > 0.2).astype(int) +
        (df_advanced['recent_activity_ratio'] > 0.5).astype(int) +
        (df_advanced['device_risk_score'] > 1).astype(int) +
        (df_advanced['payment_risk_score'] > 1).astype(int)
    )
    # Map to categories (string-valued column)
    df_advanced['fraud_risk_tier'] = df_advanced['fraud_risk_tier'].map({0: 'Low', 1: 'Medium', 2: 'High', 3: 'High', 4: 'Critical'})

    # Handle infinite values and NaN
    df_advanced = df_advanced.replace([np.inf, -np.inf], 0)
    df_advanced = df_advanced.fillna(0)

    print(f"Created {len([col for col in df_advanced.columns if col not in df_encoded.columns])} advanced features")
    return df_advanced


In [None]:
# 1. Build the engineered feature set from the encoded frame.
df_advanced = create_advanced_features(df_encoded)  # make sure df_encoded exists

# 2. Save to CSV so it can be reused later
df_advanced.to_csv('df_advanced.csv', index=False)

# 3. (Optional) Reload later if needed
df_advanced = pd.read_csv('df_advanced.csv')
print(df_advanced.head())

In [None]:
# Inspect the engineered frame. Only the last expression of a cell is rendered,
# so head() is shown explicitly; info() prints on its own.
display(df_advanced.head())
df_advanced.info()
df_advanced.describe()


In [None]:
# 80/20 stratified split of the engineered features (target and User_ID removed).
# NOTE(review): X still contains the string-valued 'fraud_risk_tier' column and the
# label-derived user_fraud_* columns; the *_1 splits are not referenced by the later
# cells visible here — confirm whether this cell is still needed.
X = df_advanced.drop(['Fraudulent', 'User_ID'], axis=1)
y = df_advanced['Fraudulent']

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training samples: {X_train_1.shape}, Testing samples: {X_test_1.shape}")

In [None]:


from evaluation import evaluate_model  # import your toolbox

# Load dataset
# NOTE(review): on_bad_lines="skip" silently drops malformed rows; the file is
# written by this notebook, so none are expected — consider removing it.
df = pd.read_csv("df_advanced.csv", engine="python", on_bad_lines="skip")

# Columns that must not be used as predictors:
#  - User_ID is an identifier (the baseline split drops it as well);
#  - user_fraud_* are aggregates of the target computed on the full dataset
#    before the train/test split, so they leak label information.
LEAKY_COLUMNS = ["User_ID", "user_fraud_count", "user_fraud_rate_raw", "user_fraud_rate"]

# Numeric features only; this also drops the string-valued fraud_risk_tier column.
X = (
    df.drop(columns=["Fraudulent"])
      .drop(columns=LEAKY_COLUMNS, errors="ignore")
      .select_dtypes(include=["number"])
)
y = df["Fraudulent"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Define models. The *_SMOTE / *_NoSMOTE pairs are configured identically; only
# the resampling flag passed to evaluate_model differs.
models = {
    "LogisticRegression_SMOTE": LogisticRegression(max_iter=2000, class_weight="balanced", solver="lbfgs"),
    "LogisticRegression_NoSMOTE": LogisticRegression(max_iter=2000, class_weight="balanced", solver="lbfgs"),
    "RandomForest": RandomForestClassifier(n_estimators=200, max_depth=6, class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=200, max_depth=3, random_state=42),
    "KNN_SMOTE": KNeighborsClassifier(n_neighbors=5),
    "KNN_NoSMOTE": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1,
                             subsample=0.8, colsample_bytree=0.8, random_state=42,
                             use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": LGBMClassifier(n_estimators=200, learning_rate=0.1, random_state=42)
}

# Evaluate all models
results = {}
for name, model in models.items():
    # SMOTE is applied to every model except the explicit "NoSMOTE" variants
    use_smote = "NoSMOTE" not in name
    results[name] = evaluate_model(model, X_train, y_train, X_test, y_test,
                                   recall_target=0.8, use_smote=use_smote)

# Convert results to DataFrame for leaderboard
leaderboard = pd.DataFrame(results).T
print("\nLeaderboard:\n", leaderboard.sort_values(by="auc", ascending=False))


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import pandas as pd

from evaluation import evaluate_model  # your toolbox

# Define parameter grids
# One search space per model; RandomizedSearchCV samples combinations from these.
# The keys must match the keys of `base_models` below.
param_grids = {
    "LogisticRegression": {
        "C": [0.01, 0.1, 1, 10],
        "solver": ["lbfgs", "liblinear"],
        "class_weight": ["balanced"]
    },
    "RandomForest": {
        "n_estimators": [100, 200, 300],
        "max_depth": [4, 6, 8, None],
        "min_samples_split": [2, 5, 10],
        "class_weight": ["balanced"]
    },
    "GradientBoosting": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.05, 0.1, 0.2],
        "max_depth": [3, 4, 6]
    },
    "XGBoost": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.05, 0.1, 0.2],
        "max_depth": [3, 4, 6],
        "subsample": [0.8, 1.0],
        "colsample_bytree": [0.8, 1.0]
    },
    "LightGBM": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.05, 0.1, 0.2],
        "num_leaves": [31, 50, 100],
        "max_depth": [-1, 4, 6]
    }
}

# Define base models
# Untuned estimators with fixed seeds; the search fills in the hyper-parameters.
base_models = {
    "LogisticRegression": LogisticRegression(max_iter=2000, random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": LGBMClassifier(random_state=42)
}

# Run RandomizedSearchCV for each model
tuned_results = {}
for name, model in base_models.items():
    print(f"Tuning {name}...")
    rand_search = RandomizedSearchCV(
        model,
        param_distributions=param_grids[name],
        n_iter=10,  # sample 10 random combos
        cv=3,
        scoring="roc_auc",
        n_jobs=-1,
        random_state=42
    )
    rand_search.fit(X_train, y_train)
    print("Best params:", rand_search.best_params_)
    
    # Evaluate tuned model
    best_model = rand_search.best_estimator_
    tuned_results[name] = evaluate_model(best_model, X_train, y_train, X_test, y_test,
                                         recall_target=0.8, use_smote=True)

# Convert results to DataFrame for tuned leaderboard
tuned_leaderboard = pd.DataFrame(tuned_results).T
print("\nTuned Leaderboard:\n", tuned_leaderboard.sort_values(by="auc", ascending=False))


In [None]:
# After tuning GradientBoosting
best_gb = rand_search.best_estimator_

# After tuning LightGBM
best_lgbm = rand_search.best_estimator_

# After tuning XGBoost
best_xgb = rand_search.best_estimator_

# After tuning RandomForest
best_rf = rand_search.best_estimator_

# Logistic Regression baseline (keep the untuned one if tuned dropped performance)
best_lr = LogisticRegression(max_iter=2000, solver="lbfgs", class_weight="balanced", random_state=42)
best_lr.fit(X_train, y_train)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score

# Models compared in the ROC and precision-recall plots below.
# NOTE(review): `ensemble` is not assigned in any cell preceding this one, so this
# cell raises NameError on a fresh Restart & Run All unless it is defined
# elsewhere — confirm where the VotingClassifier is built and fitted.
models = {
    "GradientBoosting": best_gb,
    "LightGBM": best_lgbm,
    "XGBoost": best_xgb,
    "RandomForest": best_rf,
    "LogisticRegression": best_lr,
    "VotingClassifier": ensemble
}

# ROC curves
plt.figure(figsize=(10,7))
for name, model in models.items():
    # Column 1 of predict_proba is the probability of the positive (fraud) class.
    y_scores = model.predict_proba(X_test)[:,1]
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    auc = roc_auc_score(y_test, y_scores)
    plt.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")
# Diagonal: performance of a random classifier.
plt.plot([0,1],[0,1],'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (Recall)")
plt.title("ROC Curves for All Models")
plt.legend()
plt.grid(True)
plt.show()

# Precision-Recall curves (scores are recomputed for each model)
plt.figure(figsize=(10,7))
for name, model in models.items():
    y_scores = model.predict_proba(X_test)[:,1]
    precision, recall, _ = precision_recall_curve(y_test, y_scores)
    ap = average_precision_score(y_test, y_scores)
    plt.plot(recall, precision, label=f"{name} (AP={ap:.3f})")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curves for All Models (Tuned Models)")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

# Precision-recall comparison restricted to the three boosted models.
fig, ax = plt.subplots(figsize=(8, 6))

boosted_labels = ["GradientBoosting", "LightGBM", "XGBoost"]
boosted_models = [best_gb, best_lgbm, best_xgb]

for name, model in zip(boosted_labels, boosted_models):
    # Column 1 of predict_proba is used as the positive-class score.
    y_scores = model.predict_proba(X_test)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
    ax.plot(recall, precision, label=name)

ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curves (Tuned Models)")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score

# Fitted models to compare on the test split (display name -> estimator).
models = {
    "GradientBoosting": best_gb,
    "LightGBM": best_lgbm,
    "XGBoost": best_xgb,
    "RandomForest": best_rf,
    "LogisticRegression": best_lr,
}

# In the visible part of this notebook `ensemble` is only created in a later
# cell, so referencing it unconditionally raises NameError on a fresh
# Restart & Run All. Include it only when it already exists; re-run this cell
# after the ensemble cell to get its curves as well.
if "ensemble" in globals():
    models["VotingClassifier"] = ensemble
else:
    print("Note: `ensemble` is not defined yet; VotingClassifier curves are skipped.")

# Score each model once; both figures below reuse the same positive-class
# probabilities instead of calling predict_proba twice per model.
test_scores = {name: model.predict_proba(X_test)[:, 1] for name, model in models.items()}

# ROC curves
plt.figure(figsize=(10,7))
for name, y_scores in test_scores.items():
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    auc = roc_auc_score(y_test, y_scores)
    plt.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")
plt.plot([0,1],[0,1],'k--')  # chance-level diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (Recall)")
plt.title("ROC Curves for All Models")
plt.legend()
plt.grid(True)
plt.show()

# Precision-Recall curves
plt.figure(figsize=(10,7))
for name, y_scores in test_scores.items():
    precision, recall, _ = precision_recall_curve(y_test, y_scores)
    ap = average_precision_score(y_test, y_scores)
    plt.plot(recall, precision, label=f"{name} (AP={ap:.3f})")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curves for All Models")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import numpy as np
from sklearn.metrics import precision_recall_curve

def find_threshold_at_recall(model, X_test, y_test, target_recall=0.8):
    """Find the precision-recall operating point whose recall is closest to ``target_recall``.

    The model scores ``X_test`` with ``predict_proba`` (column 1 is used as the
    positive-class score) and the precision-recall curve is computed against
    ``y_test``. The curve point with the smallest ``|recall - target_recall|``
    is selected. Several consecutive thresholds can share the same recall; in
    that case the point with the highest precision among the ties is returned
    (a plain ``argmin`` would return the first tie, i.e. the lowest threshold).

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    X_test : features passed to ``model.predict_proba``.
    y_test : true labels passed to ``precision_recall_curve``.
    target_recall : float, default 0.8
        Recall value to approximate.

    Returns
    -------
    dict
        Keys ``"precision"``, ``"recall"`` and ``"threshold"`` for the selected
        point. ``"threshold"`` is ``1.0`` when the selected index has no entry
        in ``thresholds`` (the curve arrays are longer than ``thresholds``).
    """
    y_scores = model.predict_proba(X_test)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_scores)

    # All curve points that are equally close to the requested recall.
    distance = np.abs(recall - target_recall)
    tied = np.flatnonzero(distance == distance.min())

    # Among equally close points prefer the one with the best precision.
    idx = tied[np.argmax(precision[tied])]

    return {
        "precision": precision[idx],
        "recall": recall[idx],
        # `thresholds` is shorter than `precision`/`recall`; fall back to 1.0
        # when the selected point lies past its end.
        "threshold": thresholds[idx] if idx < len(thresholds) else 1.0
    }

# Evaluate every tuned model, plus the baseline logistic regression, at the
# same recall target and gather the resulting operating points by model name.
model_labels = ["GradientBoosting", "LightGBM", "XGBoost", "RandomForest", "LogisticRegression"]
fitted_models = [best_gb, best_lgbm, best_xgb, best_rf, best_lr]  # best_lr is the baseline LR

threshold_results = {}
for name, model in zip(model_labels, fitted_models):
    threshold_results[name] = find_threshold_at_recall(model, X_test, y_test, target_recall=0.8)

# Display results: after the transpose each model is one row of the table
import pandas as pd
threshold_df = pd.DataFrame(threshold_results).transpose()
print("\nThresholds at Recall ≈ 0.8:\n", threshold_df)


In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble (averages predicted probabilities) over the three
# boosted models.
base_estimators = [("gb", best_gb), ("lgbm", best_lgbm), ("xgb", best_xgb)]
ensemble = VotingClassifier(estimators=base_estimators, voting="soft")

ensemble.fit(X_train, y_train)

# Evaluate ensemble with the same settings passed for the other models
ensemble_results = evaluate_model(
    ensemble,
    X_train,
    y_train,
    X_test,
    y_test,
    recall_target=0.8,
    use_smote=True,
)
print("Ensemble results:", ensemble_results)


In [None]:
from lightgbm import LGBMClassifier
import joblib
import numpy as np
from sklearn.metrics import precision_recall_curve

# 1. Train champion model
champion = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.05,
    num_leaves=50,
    max_depth=4,
    random_state=42
)
champion.fit(X_train, y_train)

# 2. Find threshold at recall ≈ 0.8
# Reuse the helper defined earlier in the notebook instead of repeating its
# logic inline: the inline version indexed `thresholds[idx]` without a bounds
# check, although `thresholds` is shorter than the recall array, and the helper
# already handles that boundary.
champion_point = find_threshold_at_recall(champion, X_test, y_test, target_recall=0.8)
best_threshold = champion_point["threshold"]

print(f"Champion: LightGBM | Threshold ≈ {best_threshold:.3f}")
print(f"Precision: {champion_point['precision']:.3f}, Recall: {champion_point['recall']:.3f}")

# 3. Save model for deployment
# NOTE(review): only the fitted model is written; `best_threshold` is not
# persisted, so a consumer of the pickle has to obtain it separately.
joblib.dump(champion, "lightgbm_champion.pkl")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Target distribution: bar chart of the values in `y`
ax = sns.countplot(x=y)
ax.set_title("Fraud vs Non-Fraud Distribution")
plt.show()

# Percentage breakdown
# assumes y is coded 0/1 so that its sum is the number of fraud rows -- TODO confirm
n_fraud = y.sum()
fraud_rate = (n_fraud / len(y)) * 100
print(f"Fraud cases: {n_fraud} ({fraud_rate:.2f}% of dataset)")


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# ROC curve for the LightGBM champion on the test split
y_scores = champion.predict_proba(X_test)[:, 1]  # column 1 used as positive-class score
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
auc = roc_auc_score(y_test, y_scores)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"LightGBM (AUC = {auc:.3f})")
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate (Recall)")
ax.set_title("ROC Curve")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the champion's feature importances, labelled with
# the training-frame column names (bars appear in column order).
importances = champion.feature_importances_
features = X_train.columns

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(features, importances)
ax.set_title("LightGBM Feature Importance")
ax.set_xlabel("Importance Score")
plt.show()


In [None]:
import shap

# Explain the champion model's test-set predictions with SHAP's tree explainer.
explainer = shap.TreeExplainer(champion)
# NOTE(review): for a binary classifier the object returned by `shap_values`
# appears to depend on the installed shap version (one array per class vs. a
# single array) -- confirm which form is returned here, because it changes
# what `summary_plot` draws.
shap_values = explainer.shap_values(X_test)

# Summary plot of the SHAP values against the test features.
shap.summary_plot(shap_values, X_test)
