In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import time

In [None]:
class DataLoader():
    """Load CSV files that all live in one data directory.

    Args:
        data_path: Directory containing the CSV files. Defaults to the
            relative location this notebook was originally written against,
            so ``DataLoader()`` keeps working unchanged.
    """

    def __init__(self, data_path="../../ieee-fraud-detection/"):
        self._data_path = data_path

    def loadCsv(self, file_name):
        """Read ``file_name`` from the data directory into a DataFrame.

        Args:
            file_name: Name of the CSV file, relative to the data directory.

        Returns:
            pandas.DataFrame holding the parsed file.

        Raises:
            FileNotFoundError: If the file does not exist in the directory.
        """
        # The handle is closed by the context manager as soon as parsing ends.
        with open(os.path.join(self._data_path, file_name)) as f:
            return pd.read_csv(f)

In [None]:
def standardize_columns(df):
    """Normalise the column labels of ``df`` in place.

    Hyphens are turned into underscores and surrounding whitespace is
    trimmed. The same (mutated) frame is returned so calls can be nested.
    """
    hyphen_free = df.columns.str.replace('-', '_')
    df.columns = hyphen_free.str.strip()
    return df

In [None]:
# Read the four raw tables and normalise their column names in one pass.
# The unpacking order matches the order of the file names below.
dl = DataLoader()
(train_transaction,
 train_identity,
 test_transaction,
 test_identity) = (
    standardize_columns(dl.loadCsv(csv_name))
    for csv_name in (
        "train_transaction.csv",
        "train_identity.csv",
        "test_transaction.csv",
        "test_identity.csv",
    )
)

In [None]:
# Left-join the identity table onto the transactions by TransactionID, so
# every transaction row is kept even when it has no identity record.
train = train_transaction.merge(train_identity, how="left", on="TransactionID")
test = test_transaction.merge(test_identity, how="left", on="TransactionID")
print(f"Combined train shape: {train.shape}")
print(f"Combined test shape: {test.shape}")

In [None]:
# === Step 3: Preserve labels and identifiers ===
# Set the IDs and the target aside before the feature frames are reshaped.
original_train_ids = train_transaction['TransactionID'].copy()
original_test_ids = test_transaction['TransactionID'].copy()
y = train_transaction.loc[:, ['TransactionID', 'isFraud']].copy()

# The target must not travel through imputation/encoding with the features.
if 'isFraud' in train.columns:
    del train['isFraud']

# Marker column used later to split the stacked frame back apart.
train['__dataset__'] = 'train'
test['__dataset__'] = 'test'

In [None]:

# Columns the cleaning steps must never drop or re-encode: the row identifier
# and the train/test marker that is used to split the combined frame again.
protected_cols = ['TransactionID', '__dataset__']

In [None]:
# === Step 4: Concatenate train and test for unified processing ===
# Train rows come first; ignore_index=True gives the stacked frame a fresh
# 0..n-1 index instead of repeating each frame's own index.
combined = pd.concat([train, test], axis=0, ignore_index=True)
print(f"\nCombined shape: {combined.shape}")

In [None]:
# === Step 5: Identify categorical and numerical columns ===
# The split is purely dtype-based: object columns count as categorical
# (this includes the '__dataset__' marker), int64/float64 as numerical
# (this includes 'TransactionID'). Other dtypes land in neither list.
cat_cols = combined.select_dtypes(include='object').columns.tolist()
num_cols = combined.select_dtypes(include=['int64', 'float64']).columns.tolist()

print(f"\nNumber of categorical columns: {len(cat_cols)}")
print(f"Number of numerical columns: {len(num_cols)}")

In [None]:
# === Step 5b: Display Percentage of Missing Data ===
# Share of null cells per column (0-100), computed on train and test together.
missing_percent = combined.isnull().mean() * 100
# Turn the Series into a two-column frame so it can be sorted and printed.
missing_summary = missing_percent.reset_index()
missing_summary.columns = ['Feature', 'MissingPercent']
missing_summary = missing_summary.sort_values(by='MissingPercent', ascending=False)

print("\nMissing Data Percentage Summary:")
print(missing_summary)

In [None]:
# === Step 6: Drop columns with >70% missing in combined ===
# The cut-off equals the upper bound of the "high missing" imputation bucket
# in Step 7, so every surviving numeric column falls into one of its buckets.
# (The comment and message previously said 75% while the code used 70.)
MISSING_DROP_THRESHOLD = 70  # percent of null cells
missing_percent = combined.isnull().mean() * 100
protected_cols = ['TransactionID', '__dataset__']
high_missing_cols = [col for col in missing_percent[missing_percent > MISSING_DROP_THRESHOLD].index
                     if col not in protected_cols]
combined.drop(columns=high_missing_cols, inplace=True)
print(f"\nDropped {len(high_missing_cols)} columns with >{MISSING_DROP_THRESHOLD}% missing values")

In [None]:
# === Step 7: Impute Numerical Columns ===
# Bucket the numeric columns by their share of missing values; each bucket is
# imputed with a different strategy in the following cells.
num_cols = combined.select_dtypes(include=['int64', 'float64']).columns.tolist()
# The identifier is numeric but is not a feature, so it is never imputed.
num_cols = [col for col in num_cols if col != 'TransactionID']
missing_percent = combined[num_cols].isnull().mean() * 100

low_missing = missing_percent[(missing_percent > 0) & (missing_percent <= 15)].index
mid_missing = missing_percent[(missing_percent > 15) & (missing_percent <= 50)].index
high_missing = missing_percent[(missing_percent > 50) & (missing_percent <= 70)].index

# Labels state the actual bucket bounds (they previously read "<=75%" for a
# bucket that ends at 70%).
print(f"\nLow missing (>0% to 15%): {len(low_missing)}")
print(f"Mid missing (>15% to 50%): {len(mid_missing)}")
print(f"High missing (>50% to 70%): {len(high_missing)}")

In [None]:
# Mean Imputation
# Applied to the low-missing bucket. The mean is taken over `combined`,
# i.e. over train and test rows together.
for col in low_missing:
    combined[col] = combined[col].fillna(combined[col].mean())

# Median Imputation
# Applied to the mid-missing bucket; the median is also taken over `combined`.
for col in mid_missing:
    combined[col] = combined[col].fillna(combined[col].median())


In [None]:
# Iterative Imputation
# Only the high-missing columns are passed to the imputer, so each of them is
# modelled from the other high-missing columns, not from the full feature set.
if len(high_missing) > 0:
    print("\nRunning IterativeImputer on high-missing numerical columns...")
    # random_state is fixed so repeated runs give the same imputed values.
    iterative = IterativeImputer(max_iter=10, random_state=0)
    combined[high_missing] = iterative.fit_transform(combined[high_missing])

In [None]:
# === Step 8: Impute Categorical Columns with Mode ===
# `cat_cols` was collected before high-missing columns were dropped, hence
# the membership check against the current columns.
for col in cat_cols:
    if col in combined.columns:
        # mode() scans the whole column, so compute it once per column.
        modes = combined[col].mode()
        # An all-null column has no mode; fall back to a sentinel label.
        mode_val = modes[0] if not modes.empty else 'missing'
        combined[col] = combined[col].fillna(mode_val)

In [None]:
# === Step 9: Encode Categorical Columns ===
# Replace each surviving categorical column by its integer category codes.
# Protected columns (ID and the train/test marker) are left untouched.
for col in cat_cols:
    if col not in combined.columns or col in protected_cols:
        continue
    as_category = combined[col].astype('category')
    combined[col] = as_category.cat.codes

In [None]:
# `combined` still holds train and test rows at this point, so label it as such.
print(f"\nCleaned combined shape: {combined.shape}")

In [None]:
# === Step 10: Split Combined Data Back into Train and Test ===
# Sanity output: the marker column must still exist and contain both labels.
print("\n__dataset__ column in combined:", '__dataset__' in combined.columns)
print("Value counts for __dataset__:\n", combined['__dataset__'].value_counts())
# Select rows by marker, drop the marker, and restart each index at 0.
train_cleaned = combined[combined['__dataset__'] == 'train'].drop(columns='__dataset__').reset_index(drop=True)
test_cleaned = combined[combined['__dataset__'] == 'test'].drop(columns='__dataset__').reset_index(drop=True)

# Reattach target variable using TransactionID
print("train_cleaned columns:", train_cleaned.columns.tolist())
print("y columns:", y.columns.tolist())
assert 'TransactionID' in train_cleaned.columns, "TransactionID missing in train_cleaned"
assert 'TransactionID' in y.columns, "TransactionID missing in y"
# Left merge keeps every cleaned train row and adds the 'isFraud' column.
# NOTE(review): assumes `y` is still the label frame from Step 3; a later cell
# rebinds `y` to a Series, so re-running this cell out of order will fail.
train_cleaned = pd.merge(train_cleaned, y, on='TransactionID', how='left')

In [None]:
# === Step 11: Final Checks ===
# Total count of null cells left in each cleaned frame, followed by shapes.
print("\nRemaining missing values in train:", train_cleaned.isna().sum().sum())
print("Remaining missing values in test:", test_cleaned.isna().sum().sum())
print(f"\nCleaned train shape: {train_cleaned.shape}")
print(f"Cleaned test shape: {test_cleaned.shape}")

In [None]:
# === Step 12: Check that original TransactionIDs match ===
# Set comparison: verifies that no ID was lost or invented during cleaning.
# It does not detect duplicated rows or a changed row order.
assert set(original_train_ids) == set(train_cleaned['TransactionID']), "Mismatch in train TransactionIDs!"
assert set(original_test_ids) == set(test_cleaned['TransactionID']), "Mismatch in test TransactionIDs!"
print("\n TransactionID integrity check passed for both train and test.")

In [None]:
# === Step 14.1: Target Class Distribution ===
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of row counts per target class.
plt.figure(figsize=(6, 4))
sns.countplot(x='isFraud', data=train_cleaned)
plt.title('Distribution of Fraudulent vs Non-Fraudulent Transactions')
plt.xlabel('isFraud')
plt.ylabel('Count')
plt.grid(True)
plt.show()

# Mean of the 'isFraud' column, reported as a fraction and as a percentage.
# NOTE(review): this equals the fraud share only if the target is 0/1 — confirm.
fraud_rate = train_cleaned['isFraud'].mean()
print(f"Fraudulent transactions: {fraud_rate:.4f} ({fraud_rate * 100:.2f}%)")


In [None]:
# === Step 14.2: Summary Statistics ===
# `display` is provided by the IPython/Jupyter environment, not by an import.
print("\nSummary statistics for numerical features:")
display(train_cleaned.describe())

# The 20 columns with the most distinct values.
print("\nNumber of unique values per feature:")
display(train_cleaned.nunique().sort_values(ascending=False).head(20))


In [None]:
# === Step 14.3: Feature Correlation with isFraud ===
# Correlation of every numeric column with the target, ordered by absolute
# value; the target's self-correlation is removed first.
correlations = train_cleaned.corr(numeric_only=True)['isFraud'].drop('isFraud').sort_values(key=abs, ascending=False)
top_corr_features = correlations.head(20)

plt.figure(figsize=(10, 6))
top_corr_features.plot(kind='barh')
plt.title('Top 20 Features Correlated with isFraud')
plt.xlabel('Correlation')
plt.grid(True)
# Put the strongest correlation at the top of the horizontal bar chart.
plt.gca().invert_yaxis()
plt.show()


In [None]:
# === Step 14.4: Correlation Heatmap (Top Features) ===
# Pairwise correlations among the 20 top-correlated features plus the target.
top_features = top_corr_features.index.tolist()
plt.figure(figsize=(12, 10))
sns.heatmap(train_cleaned[top_features + ['isFraud']].corr(), annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Heatmap of Top Correlated Features with isFraud')
plt.show()


In [None]:
# === Step 14.5: Feature Distributions by isFraud ===
# Density plots for the five features most correlated with the target.
important_feats = top_corr_features.head(5).index.tolist()

for col in important_feats:
    plt.figure(figsize=(8, 4))
    # common_norm=False normalises each class's density on its own.
    sns.kdeplot(data=train_cleaned, x=col, hue='isFraud', fill=True, common_norm=False)
    plt.title(f'Distribution of {col} by isFraud')
    plt.grid(True)
    plt.show()


In [None]:
# === Step 14.7: Chi-Square Test for Categorical Features ===
from scipy.stats import chi2_contingency

# Heuristic for "categorical": an integer dtype with fewer than 50 distinct
# values. The target itself is excluded.
cat_cols_cleaned = [col for col in train_cleaned.columns if str(train_cleaned[col].dtype) in ['int8', 'int16', 'int32', 'int64']
                    and train_cleaned[col].nunique() < 50 and col != 'isFraud']

# Collected as (column name, p-value) pairs.
chi2_results = []

for col in cat_cols_cleaned:
    # Counts of each feature value against each target value.
    contingency_table = pd.crosstab(train_cleaned[col], train_cleaned['isFraud'])
    if contingency_table.shape[0] > 1:  # skip degenerate cases
        chi2, p, _, _ = chi2_contingency(contingency_table)
        chi2_results.append((col, p))

# Smallest p-value first.
chi2_results = sorted(chi2_results, key=lambda x: x[1])
print("Top categorical features by Chi-square p-value (lower is better):")
for col, p in chi2_results[:10]:
    print(f"{col}: p = {p:.4e}")


In [None]:
# Reconstruct mapping for understanding what encoded values represent.
# `train` (pre-imputation, pre-encoding) and `train_cleaned` are paired row by
# row on their index, so each encoded value is matched with the original
# label of the very same row. (Previously the de-duplicated labels were paired
# with the first few rows of `train_cleaned`, which are unrelated to them.)
# NOTE(review): relies on both frames sharing the same 0..n-1 row order, which
# holds when TransactionID is unique — confirm.
categorical_value_mappings = {}

for col in cat_cols_cleaned:
    if col not in train.columns:
        continue
    if train[col].dtype == 'object' or train[col].nunique() < 100:
        value_map = pd.DataFrame({col: train[col], 'encoded': train_cleaned[col]})
        # Rows whose original value was missing were filled by imputation, so
        # they carry no original label to report.
        value_map = value_map.dropna(subset=[col]).drop_duplicates().reset_index(drop=True)
        mapping = dict(zip(value_map['encoded'], value_map[col]))
        categorical_value_mappings[col] = mapping

# Print out a few mappings
for col, mapping in list(categorical_value_mappings.items())[:5]:
    print(f"\nMapping for {col}:")
    for k, v in sorted(mapping.items()):
        print(f"  {k} → '{v}'")


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import json

# === STEP 1: Encode categorical features and save mappings ===

# Detect categorical columns: object dtype, or fewer than 50 distinct values.
# Only columns that survived cleaning are considered. Without that filter the
# '__dataset__' marker and every column dropped for excessive missingness
# would be re-added to train_cleaned (but not to test_cleaned), so the two
# frames would no longer share the same feature columns.
categorical_cols = [col for col in train.columns
                    if col in train_cleaned.columns and col not in protected_cols
                    and (train[col].dtype == 'object' or (train[col].nunique() < 50 and col != 'isFraud'))]

label_encoders = {}
encoding_maps = {}
encoded_data = {}  # temp dict to hold encoded columns

# Encode all categorical columns in one pass.
# Values are cast to str first, so missing values become the label 'nan' and
# numeric values are ordered as text. The encoders are fitted on train only.
for col in categorical_cols:
    le = LabelEncoder()
    encoded_col = le.fit_transform(train[col].astype(str))
    encoded_data[col] = encoded_col
    label_encoders[col] = le
    # LabelEncoder assigns code i to classes_[i].
    encoding_maps[col] = {int(code): str(label) for code, label in enumerate(le.classes_)}

# Combine all encoded columns into a new DataFrame
encoded_df = pd.DataFrame(encoded_data, index=train.index)

# The column-wise concat below aligns on the index; a mismatch would silently
# introduce NaN rows, so fail loudly instead.
assert train_cleaned.index.equals(encoded_df.index), "train and train_cleaned rows are not aligned"

# Create a de-fragmented version of train_cleaned and update with encoded features
train_cleaned = train_cleaned.copy()
train_cleaned = pd.concat([train_cleaned.drop(columns=encoded_df.columns, errors='ignore'), encoded_df], axis=1)

# Optionally save mappings to JSON
with open("label_encoding_mappings.json", "w") as f:
    json.dump(encoding_maps, f, indent=2)

# === STEP 2: Chi-Square Test for Categorical Features ===
# Same procedure as the earlier chi-square cell, repeated because the columns
# of train_cleaned were replaced by the label-encoded versions above.

# Integer dtype with fewer than 50 distinct values, target excluded.
cat_cols_cleaned = [col for col in train_cleaned.columns
                    if str(train_cleaned[col].dtype) in ['int8', 'int16', 'int32', 'int64']
                    and train_cleaned[col].nunique() < 50 and col != 'isFraud']

# Collected as (column name, p-value) pairs.
chi2_results = []

for col in cat_cols_cleaned:
    contingency_table = pd.crosstab(train_cleaned[col], train_cleaned['isFraud'])
    # A single-row table has nothing to compare.
    if contingency_table.shape[0] > 1:
        chi2, p, _, _ = chi2_contingency(contingency_table)
        chi2_results.append((col, p))

# Smallest p-value first.
chi2_results = sorted(chi2_results, key=lambda x: x[1])

print("Top categorical features by Chi-square p-value (lower is better):")
for col, p in chi2_results[:10]:
    print(f"{col}: p = {p:.4e}")

# === STEP 3: Plot Fraud Rate by Top Categorical Features ===
# One bar chart per feature for the five smallest chi-square p-values.

for col, _ in chi2_results[:5]:  # adjust number as needed
    plt.figure(figsize=(8, 4))
    # Mean of the target within each encoded category.
    fraud_rate = train_cleaned.groupby(col)['isFraud'].mean()

    # Use original category labels if available
    if col in encoding_maps:
        # Fall back to the raw code when it has no entry in the mapping.
        x_labels = [encoding_maps[col].get(idx, str(idx)) for idx in fraud_rate.index]
    else:
        x_labels = fraud_rate.index.astype(str)

    sns.barplot(x=x_labels, y=fraud_rate.values)
    plt.title(f"Fraud Rate by {col}")
    plt.ylabel("Fraud Rate")
    plt.xlabel(col)
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
# ===Fraud rate by category===
# Same charts as the labelled fraud-rate plot, but with the encoded values
# on the x axis instead of the original category labels.

import matplotlib.pyplot as plt
import seaborn as sns

for col, _ in chi2_results[:5]:
    plt.figure(figsize=(8, 4))
    # Mean of the target within each encoded category.
    fraud_rate = train_cleaned.groupby(col)['isFraud'].mean()
    sns.barplot(x=fraud_rate.index.astype(str), y=fraud_rate.values)
    plt.title(f"Fraud Rate by {col}")
    plt.ylabel("Fraud Rate")
    plt.xlabel(col)
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.show()

In [None]:
# === Count vs Fraud rate ===
# For the three features with the smallest p-values: category counts as bars
# (left axis) with the per-category fraud rate overlaid as a line (right axis).

for col, _ in chi2_results[:3]:
    fig, ax1 = plt.subplots(figsize=(8, 4))

    # Second y axis sharing the same x axis.
    ax2 = ax1.twinx()
    sns.countplot(x=col, data=train_cleaned, ax=ax1, color='skyblue')
    fraud_rate = train_cleaned.groupby(col)['isFraud'].mean()
    # NOTE(review): the line is plotted against string labels while the bars
    # use seaborn's category positions — confirm the two line up as intended.
    ax2.plot(fraud_rate.index.astype(str), fraud_rate.values, color='red', marker='o')

    ax1.set_ylabel('Count')
    ax2.set_ylabel('Fraud Rate')
    plt.title(f"Count and Fraud Rate by {col}")
    plt.grid(True)
    plt.show()

In [None]:
# === Categorical Grouped Fraud Rates ===
# Maps each categorical column to its per-category fraud rate, highest first.
grouped_rates = {}

for col in cat_cols_cleaned:
    group_mean = train_cleaned.groupby(col)['isFraud'].mean()
    if group_mean.nunique() > 1:  # only keep meaningful differences
        grouped_rates[col] = group_mean.sort_values(ascending=False)

# Show a few examples
# These are the first three entries in insertion order, not a ranking.
for i, (col, rate_series) in enumerate(grouped_rates.items()):
    print(f"\nFraud Rate by {col}:")
    print(rate_series)
    if i >= 2:  # Limit output to top 3
        break


In [None]:
!pip install lightgbm

In [None]:
# ===  Feature Importance from LightGBM ===
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Drop non-predictive identifiers
X = train_cleaned.drop(columns=['TransactionID', 'isFraud'])
# NOTE: this rebinds `y`, which earlier held the (TransactionID, isFraud)
# label frame, to the target Series.
y = train_cleaned['isFraud']

# Stratified 80/20 split; the validation part is not used in this cell.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Plot feature importance
# importance_type='gain' ranks features by total split gain, not split count.
lgb.plot_importance(model, max_num_features=20, importance_type='gain', figsize=(10, 6))
plt.title('Top 20 Feature Importances (LightGBM)')
plt.show()


In [None]:
# === Auto-bin & Encode High Cardinality Features ===
# Columns with more than 100 distinct values get a companion '<col>_freq'
# column holding how often each value occurs in train.
# Already-generated '_freq' columns are skipped so that re-running this cell
# does not stack '_freq_freq' columns on top of them.
high_cardinality = [col for col in train_cleaned.columns
                    if train_cleaned[col].nunique() > 100
                    and col != 'TransactionID'
                    and not str(col).endswith('_freq')]

# Frequency encoding
for col in high_cardinality:
    freq_map = train_cleaned[col].value_counts().to_dict()
    train_cleaned[col + '_freq'] = train_cleaned[col].map(freq_map)
    # The counts come from train only. A column may be absent from test, and a
    # test value never seen in train has a train frequency of 0, not NaN.
    if col in test_cleaned.columns:
        test_cleaned[col + '_freq'] = test_cleaned[col].map(freq_map).fillna(0)

print(f"Encoded {len(high_cardinality)} high-cardinality features with frequency encoding.")


In [None]:
# === LightGBM with Cross-Validation ===
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

X = train_cleaned.drop(columns=['TransactionID', 'isFraud'])
y = train_cleaned['isFraud']
# Five shuffled, stratified folds with a fixed seed.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

auc_scores = []
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # The model seed changes with the fold number.
    model = lgb.LGBMClassifier(n_estimators=100, random_state=fold)
    model.fit(X_train, y_train)
    # Second column of predict_proba (index 1), used as the score.
    preds = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, preds)
    auc_scores.append(auc)
    print(f"Fold {fold + 1} AUC: {auc:.4f}")

# After the loop `model`, `X_val` and `y_val` belong to the last fold.
print(f"\nAverage AUC: {np.mean(auc_scores):.4f}")


In [None]:
# === SHAP Explanation ===
import shap

# NOTE(review): `model` and `X_val` are whatever the most recently executed
# cell left behind; when cells run top to bottom that is the last
# cross-validation fold — confirm this is the intended model.
explainer = shap.Explainer(model)
shap_values = explainer(X_val)

# Summary plot for top features
shap.summary_plot(shap_values, X_val, max_display=15)


In [None]:
# === Step 15.4 (Safe Version): Create a separate copy for feature engineering ===

# Avoid modifying the original cleaned datasets
train_feat = train_cleaned.copy()
test_feat = test_cleaned.copy()

interaction_features_train = []
interaction_features_test = []
interaction_feature_names = []

# === Handle SHAP value format (supports both old and new SHAP versions) ===
if hasattr(shap_values, 'values'):
    # Explanation object: the raw array lives in `.values`.
    base_shap_values = shap_values.values
elif isinstance(shap_values, list):
    # One array per class; index 1 is the positive class.
    base_shap_values = shap_values[1]
else:
    # Already a plain array. Indexing it with [1] would pick a single row.
    base_shap_values = shap_values

base_shap_values = np.asarray(base_shap_values)
if base_shap_values.ndim == 3:
    # (rows, features, classes): keep the positive class so the importance
    # vector below has exactly one entry per feature.
    base_shap_values = base_shap_values[:, :, 1]

# Recompute top SHAP features if needed
# Importance = mean absolute SHAP value per feature; keep the five largest,
# ordered from most to least important.
shap_importance_base = np.abs(base_shap_values).mean(axis=0)
top_indices = np.argsort(shap_importance_base)[-5:][::-1]
top_shap_features = X_val.columns[top_indices].tolist()

print("Top 5 features by SHAP importance (from original model):")
print(top_shap_features)

# === Generate interaction features (ADD, SUB, MUL, DIV) from top SHAP features ===
# Every unordered pair of top features yields four new columns.
for i in range(len(top_shap_features)):
    for j in range(i + 1, len(top_shap_features)):
        f1 = top_shap_features[i]
        f2 = top_shap_features[j]

        new_add = f"{f1}_plus_{f2}"
        new_sub = f"{f1}_minus_{f2}"
        new_mul = f"{f1}_times_{f2}"
        new_div = f"{f1}_div_{f2}"

        interaction_feature_names.extend([new_add, new_sub, new_mul, new_div])

        # Train interactions
        # The small constant keeps the division finite when f2 is exactly 0.
        train_inter = pd.DataFrame({
            new_add: train_feat[f1] + train_feat[f2],
            new_sub: train_feat[f1] - train_feat[f2],
            new_mul: train_feat[f1] * train_feat[f2],
            new_div: train_feat[f1] / (train_feat[f2] + 1e-5)
        })

        # Test interactions
        test_inter = pd.DataFrame({
            new_add: test_feat[f1] + test_feat[f2],
            new_sub: test_feat[f1] - test_feat[f2],
            new_mul: test_feat[f1] * test_feat[f2],
            new_div: test_feat[f1] / (test_feat[f2] + 1e-5)
        })

        interaction_features_train.append(train_inter)
        interaction_features_test.append(test_inter)

# Concatenate all engineered features at once (fast and efficient)
train_feat = pd.concat([train_feat] + interaction_features_train, axis=1)
test_feat = pd.concat([test_feat] + interaction_features_test, axis=1)

print(f"\n Added {len(interaction_feature_names)} interaction features to `train_feat` and `test_feat`.")


In [None]:
# ===SHAP Analysis With Interaction Features ===

# Drop non-predictive columns
X_feat = train_feat.drop(columns=['TransactionID', 'isFraud'], errors='ignore')
y_feat = train_feat['isFraud']

# Train a new LightGBM model using the feature-engineered data
model_feat = lgb.LGBMClassifier(n_estimators=100, random_state=42)
model_feat.fit(X_feat, y_feat)

# Create SHAP explainer and compute values
# SHAP values are computed on every training row, so this is the slow step.
explainer_feat = shap.TreeExplainer(model_feat)
shap_vals_feat = explainer_feat.shap_values(X_feat)

# Depending on the SHAP version a binary classifier yields either a list with
# one array per class, a 3-D array (rows, features, classes), or a single 2-D
# array. Reduce all of them to one 2-D array for the positive class.
if isinstance(shap_vals_feat, list) and len(shap_vals_feat) == 2:
    shap_to_plot = shap_vals_feat[1]  # Class 1: isFraud
elif getattr(shap_vals_feat, 'ndim', 2) == 3:
    shap_to_plot = shap_vals_feat[:, :, 1]  # Class 1: isFraud
else:
    shap_to_plot = shap_vals_feat

# === SHAP Summary Plot ===
print("\n SHAP Summary Plot With Interaction Features:")
shap.summary_plot(shap_to_plot, X_feat, max_display=20)

# === Top SHAP Features Report ===
# Mean absolute SHAP value per feature; ten largest, most important first.
shap_imp_feat = np.abs(shap_to_plot).mean(axis=0)
top_feat_idx = np.argsort(shap_imp_feat)[-10:][::-1]
top_feat_names = X_feat.columns[top_feat_idx].tolist()

print("\n Top SHAP features with interactions:")
print(top_feat_names)


In [None]:
# === Train LightGBM and Plot ROC Curve ===

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

# `lgb` is not imported in this cell; it must already be in the namespace.
X = train_cleaned.drop(columns=['TransactionID', 'isFraud'])
y = train_cleaned['isFraud']

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Second column of predict_proba (index 1), used as the ranking score.
y_scores = model.predict_proba(X_val)[:, 1]

fpr, tpr, thresholds = roc_curve(y_val, y_scores)
auc = roc_auc_score(y_val, y_scores)

plt.figure(figsize=(8, 6))
sns.lineplot(x=fpr, y=tpr, label=f"LightGBM AUC = {auc:.4f}")
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], '--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - LightGBM")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# === Step 16.2: Precision-Recall Curve ===
from sklearn.metrics import precision_recall_curve, average_precision_score

# Uses `y_val` and `y_scores` as left in the namespace by an earlier cell.
precision, recall, _ = precision_recall_curve(y_val, y_scores)
avg_precision = average_precision_score(y_val, y_scores)

plt.figure(figsize=(8, 6))
sns.lineplot(x=recall, y=precision, label=f"Avg Precision = {avg_precision:.4f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve - LightGBM")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# === Compare ROC Curves of LightGBM vs XGBoost ===
from xgboost import XGBClassifier

# `use_label_encoder` is no longer passed: the option was deprecated and then
# removed from XGBoost, where supplying it only triggers an "unused parameter"
# warning.
model_xgb = XGBClassifier(eval_metric='logloss', random_state=42)
# Trained and scored on the split left in the namespace by an earlier cell.
model_xgb.fit(X_train, y_train)
y_scores_xgb = model_xgb.predict_proba(X_val)[:, 1]

fpr_xgb, tpr_xgb, _ = roc_curve(y_val, y_scores_xgb)
auc_xgb = roc_auc_score(y_val, y_scores_xgb)

# `fpr`, `tpr` and `auc` for LightGBM are reused from the namespace.
plt.figure(figsize=(8, 6))
sns.lineplot(x=fpr, y=tpr, label=f"LightGBM AUC = {auc:.4f}")
sns.lineplot(x=fpr_xgb, y=tpr_xgb, label=f"XGBoost AUC = {auc_xgb:.4f}")
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], '--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# ===Cross-Validated AUC ===
from sklearn.model_selection import StratifiedKFold

# Uses `X` and `y` as left in the namespace by an earlier cell.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    # These assignments overwrite the hold-out split variables of earlier cells.
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # The model seed changes with the fold number.
    model_cv = lgb.LGBMClassifier(n_estimators=100, random_state=fold)
    model_cv.fit(X_train, y_train)
    y_pred = model_cv.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    auc_scores.append(auc)
    print(f"Fold {fold + 1} AUC: {auc:.4f}")

print(f"\nAverage Cross-Validated AUC: {np.mean(auc_scores):.4f}")


In [None]:
# === Train Model Using Only Top SHAP Features (Before Interaction Engineering) ===

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure top_shap_features exists — redefine if needed
# The bare name lookup raises NameError when the variable was never defined.
try:
    top_shap_features
except NameError:
    # Use original SHAP values from earlier session
    try:
        base_shap_values = shap_values.values
    except AttributeError:
        base_shap_values = shap_values[1]

    # Mean absolute SHAP value per feature; five largest, most important first.
    # NOTE(review): `X_val` must be the frame the SHAP values were computed
    # on for the indices to map to the right column names — confirm.
    shap_importance_base = np.abs(base_shap_values).mean(axis=0)
    top_indices = np.argsort(shap_importance_base)[-5:][::-1]
    top_shap_features = X_val.columns[top_indices].tolist()
    print("Extracted top SHAP features (pre-feature engineering):", top_shap_features)

# Select only these features from train_cleaned
X_base = train_cleaned[top_shap_features]
y_base = train_cleaned['isFraud']

# Train-test split
Xb_train, Xb_val, yb_train, yb_val = train_test_split(X_base, y_base, stratify=y_base, test_size=0.2, random_state=42)

# Train LightGBM
model_base = lgb.LGBMClassifier(n_estimators=100, random_state=42)
model_base.fit(Xb_train, yb_train)

# Predict
yb_scores = model_base.predict_proba(Xb_val)[:, 1]
auc_base = roc_auc_score(yb_val, yb_scores)
# Overwrites the `fpr`/`tpr` computed by earlier ROC cells.
fpr, tpr, _ = roc_curve(yb_val, yb_scores)

# Plot ROC
plt.figure(figsize=(8, 6))
sns.lineplot(x=fpr, y=tpr, label=f"Top SHAP-only AUC = {auc_base:.4f}")
plt.plot([0, 1], [0, 1], '--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - SHAP Feature Subset")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# === Compare ROC Curves Across Models Using All Features ===

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Prepare data
X_all = train_cleaned.drop(columns=['TransactionID', 'isFraud'])
y_all = train_cleaned['isFraud']

# Stratified 80/20 hold-out split shared by all four models.
X_train, X_val, y_train, y_val = train_test_split(X_all, y_all, stratify=y_all, test_size=0.2, random_state=42)

# 2. Define models
models = {
    "LightGBM": lgb.LGBMClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is no longer passed: it was deprecated and then
    # removed from XGBoost and only triggers an "unused parameter" warning.
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    # Unlike the tree models, lbfgs logistic regression is sensitive to feature
    # scale, so the features are standardised first. The scaler is fitted on
    # the training split only because it is part of the pipeline.
    "LogisticRegression": make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=1000, solver='lbfgs'),
    ),
}

# 3. Train & evaluate
plt.figure(figsize=(10, 7))
for name, model in models.items():
    model.fit(X_train, y_train)
    # Second column of predict_proba (index 1), used as the ranking score.
    probs = model.predict_proba(X_val)[:, 1]
    fpr, tpr, _ = roc_curve(y_val, probs)
    auc = roc_auc_score(y_val, probs)
    sns.lineplot(x=fpr, y=tpr, label=f"{name} (AUC = {auc:.4f})")

# 4. Plot random line for reference
plt.plot([0, 1], [0, 1], '--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison - Full Feature Set")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
pip install optuna


In [None]:
# === LightGBM + Optuna Hyperparameter Tuning + ROC Curve ===

import optuna
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

# Prepare data
X = train_cleaned.drop(columns=['TransactionID', 'isFraud'])
y = train_cleaned['isFraud']
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Settings shared by every trial AND by the final refit, so the refit model is
# configured exactly like the best trial.
fixed_params = {
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    # LightGBM applies `subsample` only when bagging is switched on through a
    # non-zero bagging frequency; without it the tuned value has no effect.
    'subsample_freq': 1,
    'n_estimators': 100,
    'random_state': 42,
}


# Objective function for Optuna
def objective(trial):
    """Train one LightGBM candidate and return its validation AUC.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC of the candidate on the hold-out split (higher is better).
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # log=True samples on a logarithmic scale.
    params = {
        **fixed_params,
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 16, 128),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 50, 300),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
    }

    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, preds)
    return auc


# Start Optuna study
# A seeded sampler makes the sequence of sampled hyperparameters reproducible.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=30)  # You can increase to 50+ for better results

# Best model
best_params = study.best_params
print(f"\n Best AUC: {study.best_value:.4f}")
print("Best parameters:")
print(best_params)

# Train final model with best params
# NOTE: the hyperparameters were selected on this same validation split, so the
# AUC reported below is an optimistic estimate.
best_model = lgb.LGBMClassifier(**{**fixed_params, **best_params})
best_model.fit(X_train, y_train)
y_scores = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, _ = roc_curve(y_val, y_scores)
auc = roc_auc_score(y_val, y_scores)

# Plot ROC curve
plt.figure(figsize=(8, 6))
sns.lineplot(x=fpr, y=tpr, label=f"Optimized LightGBM AUC = {auc:.4f}")
plt.plot([0, 1], [0, 1], '--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve After LightGBM Optimization (Optuna)")
plt.grid(True)
plt.legend()
plt.show()
