In [1]:
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler


df = pd.read_csv('../../click_fraud_dataset.csv')

In [2]:
# Drop unnecessary columns
drop_cols = ["click_id", "timestamp", "user_id", "referrer_url", "page_url", "ad_position", "bot_likelihood_score"]
df = df.drop(columns=drop_cols)

In [3]:
import geoip2.database

city_db = "../GeoLite2/GeoLite2-City.mmdb"
asn_db = "../GeoLite2/GeoLite2-ASN.mmdb"

reader_city = geoip2.database.Reader(city_db)
reader_asn = geoip2.database.Reader(asn_db)

def enrich_ip(ip):
    data = {
        "ip_country": None,
        "ip_city": None,
        "ip_asn": None,
        "ip_org": None,
        "ip_is_datacenter": 0
    }
    try:
        city = reader_city.city(ip)
        asn = reader_asn.asn(ip)

        data["ip_country"] = city.country.name
        data["ip_city"] = city.city.name
        data["ip_asn"] = asn.autonomous_system_number
        data["ip_org"] = asn.autonomous_system_organization

        # crude datacenter detection
        dc_keywords = ['Amazon', 'Google', 'OVH', 'Microsoft', 'DigitalOcean', 'Hetzner', 'Linode']
        if any(k in (asn.autonomous_system_organization or "") for k in dc_keywords):
            data["ip_is_datacenter"] = 1
    except Exception:
        pass
    return pd.Series(data)

# Example use:
ip_info = enrich_ip("203.94.72.111")
print(ip_info)


ip_country                           Sri Lanka
ip_city                                Colombo
ip_asn                                    9329
ip_org              Sri Lanka Telecom Internet
ip_is_datacenter                             0
dtype: object


In [4]:
df = df.join(df['ip_address'].apply(enrich_ip))
df.head()

Unnamed: 0,ip_address,device_type,browser,operating_system,click_duration,scroll_depth,mouse_movement,keystrokes_detected,click_frequency,time_since_last_click,device_ip_reputation,VPN_usage,proxy_usage,is_fraudulent,ip_country,ip_city,ip_asn,ip_org,ip_is_datacenter
0,141.36.49.37,Tablet,Safari,Android,0.29,60,111,8,7,72,Good,0,1,0,,,,,0.0
1,216.29.19.201,Desktop,Opera,iOS,0.64,25,452,29,9,201,Suspicious,0,0,0,United States,,174.0,COGENT-174,0.0
2,167.133.41.231,Tablet,Safari,Linux,0.42,36,431,18,9,326,Good,0,1,0,,,,,0.0
3,216.146.33.78,Tablet,Edge,macOS,4.29,29,472,37,4,33,Suspicious,0,0,0,United States,Ashburn,31898.0,ORACLE-BMC-31898,0.0
4,146.37.54.245,Desktop,Opera,Windows,2.46,94,50,2,7,97,Good,0,0,0,,,,,0.0


In [5]:
# --- Derived behavioral features and IP risk flag (Ensemble stacking) ---
for col in ['click_duration','click_frequency','scroll_depth','mouse_movement','keystrokes_detected','time_since_last_click','VPN_usage','proxy_usage','ip_is_datacenter']:
    if col in df.columns:
        df[col] = df[col].fillna(0)

epsilon = 1e-5

df['click_intensity'] = df['click_frequency'] / (df['click_duration'] + epsilon)

df['engagement_ratio'] = (df['scroll_depth'] + df['mouse_movement']) / (df['click_duration'] + 1)

df['inactivity_ratio'] = df['time_since_last_click'] / (df['click_duration'] + 1)

df['input_activity'] = df['keystrokes_detected'] + df['mouse_movement']

if 'device_ip_reputation' in df.columns and df['device_ip_reputation'].dtype == object:
    df['ip_risk_flag'] = ((df['device_ip_reputation'].isin(['Poor','Unknown'])) | (df['ip_is_datacenter'] == 1) | (df['VPN_usage'] == 1) | (df['proxy_usage'] == 1)).astype(int)
else:
    df['ip_risk_flag'] = ((df.get('ip_is_datacenter',0) == 1) | (df.get('VPN_usage',0) == 1) | (df.get('proxy_usage',0) == 1)).astype(int)

print('Derived features added for ensemble stacking')
df.head()

Derived features added for ensemble stacking


Unnamed: 0,ip_address,device_type,browser,operating_system,click_duration,scroll_depth,mouse_movement,keystrokes_detected,click_frequency,time_since_last_click,...,ip_country,ip_city,ip_asn,ip_org,ip_is_datacenter,click_intensity,engagement_ratio,inactivity_ratio,input_activity,ip_risk_flag
0,141.36.49.37,Tablet,Safari,Android,0.29,60,111,8,7,72,...,,,,,0.0,24.137099,132.55814,55.813953,119,1
1,216.29.19.201,Desktop,Opera,iOS,0.64,25,452,29,9,201,...,United States,,174.0,COGENT-174,0.0,14.06228,290.853659,122.560976,481,0
2,167.133.41.231,Tablet,Safari,Linux,0.42,36,431,18,9,326,...,,,,,0.0,21.428061,328.873239,229.577465,449,1
3,216.146.33.78,Tablet,Edge,macOS,4.29,29,472,37,4,33,...,United States,Ashburn,31898.0,ORACLE-BMC-31898,0.0,0.932399,94.706994,6.238185,509,0
4,146.37.54.245,Desktop,Opera,Windows,2.46,94,50,2,7,97,...,,,,,0.0,2.845517,41.618497,28.034682,52,0


In [6]:
from user_agents import parse

def parse_user_agent(ua_str):
    try:
        ua = parse(ua_str)
        return pd.Series({
            "ua_browser": ua.browser.family,
            "ua_os": ua.os.family,
            "ua_device": ua.device.family,
            "ua_is_mobile": ua.is_mobile,
            "ua_is_tablet": ua.is_tablet,
            "ua_is_pc": ua.is_pc,
            "ua_is_bot": ua.is_bot
        })
    except Exception:
        return pd.Series({
            "ua_browser": None,
            "ua_os": None,
            "ua_device": None,
            "ua_is_mobile": False,
            "ua_is_tablet": False,
            "ua_is_pc": False,
            "ua_is_bot": False
        })

# Example:
ua_info = parse_user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/117.0.0.0 Safari/537.36")
print(ua_info)


ua_browser       Chrome
ua_os           Windows
ua_device         Other
ua_is_mobile      False
ua_is_tablet      False
ua_is_pc           True
ua_is_bot         False
dtype: object


In [7]:
import ipaddress

def is_private_ip(ip):
    try:
        return ipaddress.ip_address(ip).is_private
    except:
        return False

# Enhanced fraud scoring using derived features
def fraud_score(row):
    score = 0

    # 1️⃣ IP-based
    if row.get("ip_is_datacenter") == 1:
        score += 3
    if is_private_ip(row.get("ip_address", "")):
        score += 2
    if row.get("VPN_usage") == 1 or row.get("proxy_usage") == 1:
        score += 2
    if row.get('ip_risk_flag', 0) == 1:
        score += 2

    # 2️⃣ Device & OS mismatch
    os = str(row.get("operating_system", "")).lower()
    browser = str(row.get("browser", "")).lower()
    device = str(row.get("device_type", "")).lower()

    if "windows" in os and "safari" in browser:
        score += 2
    if device == "mobile" and row.get("mouse_movement", 0) > 0:
        score += 1

    # 3️⃣ User-agent analysis (if parsed)
    if row.get("ua_is_bot", False):
        score += 4

    # 4️⃣ Behavior - derived features influence
    click_intensity = float(row.get("click_intensity", 0))
    if click_intensity > 10:  # threshold can be tuned
        score += 2
    elif click_intensity > 4:
        score += 1

    # Engagement ratio: low engagement (low scroll+movement relative to time) could be bot-like
    engagement_ratio = float(row.get("engagement_ratio", 0))
    if engagement_ratio < 0.5:
        score += 1

    # Inactivity bursts followed by short actions suggest automation
    inactivity_ratio = float(row.get("inactivity_ratio", 0))
    if inactivity_ratio > 100:  # tune this threshold to dataset
        score += 2
    elif inactivity_ratio > 30:
        score += 1

    # Input activity: higher input indicates human
    input_activity = float(row.get("input_activity", 0))
    if input_activity > 100:
        score -= 2  # reduce fraud score (good)
    elif input_activity > 30:
        score -= 1

    # 5️⃣ Quick clicks
    if row.get("click_duration", 10) < 1.0:
        score += 1

    # Ensure score is non-negative
    score = max(0, score)

    return score

# Apply to dataset
df["fraud_score"] = df.apply(fraud_score, axis=1)

# Decide labels
df["predicted_label"] = (df["fraud_score"] >= 4).astype(int)  # 1=fraudulent, 0=genuine

df[["ip_country", "device_type", "browser", "operating_system", "fraud_score", "predicted_label"]].head()

Unnamed: 0,ip_country,device_type,browser,operating_system,fraud_score,predicted_label
0,,Tablet,Safari,Android,6,1
1,United States,Desktop,Opera,iOS,3,0
2,,Tablet,Safari,Linux,7,1
3,United States,Tablet,Edge,macOS,0,0
4,,Desktop,Opera,Windows,0,0


In [9]:
# Convert all ASN values to string and fill NaN with "Unknown"
df["ip_asn"] = df["ip_asn"].astype(str).fillna("Unknown")

# Optional: Clean numeric-only ASNs (make them consistent like 'AS12345')
df["ip_asn"] = df["ip_asn"].apply(lambda x: f"AS{x}" if x.isdigit() else x)
df.head()


Unnamed: 0,ip_address,device_type,browser,operating_system,click_duration,scroll_depth,mouse_movement,keystrokes_detected,click_frequency,time_since_last_click,...,ip_asn,ip_org,ip_is_datacenter,click_intensity,engagement_ratio,inactivity_ratio,input_activity,ip_risk_flag,fraud_score,predicted_label
0,141.36.49.37,Tablet,Safari,Android,0.29,60,111,8,7,72,...,,,0.0,24.137099,132.55814,55.813953,119,1,6,1
1,216.29.19.201,Desktop,Opera,iOS,0.64,25,452,29,9,201,...,174.0,COGENT-174,0.0,14.06228,290.853659,122.560976,481,0,3,0
2,167.133.41.231,Tablet,Safari,Linux,0.42,36,431,18,9,326,...,,,0.0,21.428061,328.873239,229.577465,449,1,7,1
3,216.146.33.78,Tablet,Edge,macOS,4.29,29,472,37,4,33,...,31898.0,ORACLE-BMC-31898,0.0,0.932399,94.706994,6.238185,509,0,0,0
4,146.37.54.245,Desktop,Opera,Windows,2.46,94,50,2,7,97,...,,,0.0,2.845517,41.618497,28.034682,52,0,0,0


In [10]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns
label_encoders = {}
for col in ["device_type", "browser", "operating_system", "device_ip_reputation", "ip_country", "ip_city", "ip_asn", "ip_org"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df.head()

Unnamed: 0,ip_address,device_type,browser,operating_system,click_duration,scroll_depth,mouse_movement,keystrokes_detected,click_frequency,time_since_last_click,...,ip_asn,ip_org,ip_is_datacenter,click_intensity,engagement_ratio,inactivity_ratio,input_activity,ip_risk_flag,fraud_score,predicted_label
0,141.36.49.37,2,4,0,0.29,60,111,8,7,72,...,1227,1147,0.0,24.137099,132.55814,55.813953,119,1,6,1
1,216.29.19.201,0,3,3,0.64,25,452,29,9,201,...,251,179,0.0,14.06228,290.853659,122.560976,481,0,3,0
2,167.133.41.231,2,4,1,0.42,36,431,18,9,326,...,1227,1147,0.0,21.428061,328.873239,229.577465,449,1,7,1
3,216.146.33.78,2,1,4,4.29,29,472,37,4,33,...,615,689,0.0,0.932399,94.706994,6.238185,509,0,0,0
4,146.37.54.245,0,3,2,2.46,94,50,2,7,97,...,1227,1147,0.0,2.845517,41.618497,28.034682,52,0,0,0


In [11]:
# Drop ip_address after enrichment
df = df.drop(columns=['ip_address'])
df.head()

Unnamed: 0,device_type,browser,operating_system,click_duration,scroll_depth,mouse_movement,keystrokes_detected,click_frequency,time_since_last_click,device_ip_reputation,...,ip_asn,ip_org,ip_is_datacenter,click_intensity,engagement_ratio,inactivity_ratio,input_activity,ip_risk_flag,fraud_score,predicted_label
0,2,4,0,0.29,60,111,8,7,72,1,...,1227,1147,0.0,24.137099,132.55814,55.813953,119,1,6,1
1,0,3,3,0.64,25,452,29,9,201,2,...,251,179,0.0,14.06228,290.853659,122.560976,481,0,3,0
2,2,4,1,0.42,36,431,18,9,326,1,...,1227,1147,0.0,21.428061,328.873239,229.577465,449,1,7,1
3,2,1,4,4.29,29,472,37,4,33,2,...,615,689,0.0,0.932399,94.706994,6.238185,509,0,0,0
4,0,3,2,2.46,94,50,2,7,97,1,...,1227,1147,0.0,2.845517,41.618497,28.034682,52,0,0,0


In [13]:
from sklearn.preprocessing import StandardScaler

# Define numeric columns (added derived features)
numeric_cols = ['click_duration',
                'scroll_depth',
                'mouse_movement',
                'keystrokes_detected',
                'click_frequency',
                'time_since_last_click',
                'device_ip_reputation',
                'device_type',
                'browser',
                'operating_system',
                'ip_country',
                'ip_city',
                'ip_asn',
                'ip_org',
                'VPN_usage',
                'proxy_usage',
                'ip_is_datacenter',
                # Derived features
                'click_intensity',
                'engagement_ratio',
                'inactivity_ratio',
                'input_activity',
                'ip_risk_flag']

scaler = StandardScaler()
# Fill NaNs and scale
df[numeric_cols] = scaler.fit_transform(df[numeric_cols].fillna(0))

print("Scaling completed!")

Scaling completed!


In [14]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=["is_fraudulent", "predicted_label"])
y = df["is_fraudulent"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Data ready for model training!")
print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

Data ready for model training!
Train size: (4000, 23)
Test size: (1000, 23)


In [15]:
# Load all trained base models

with open('../Ensemble_Model/Base_Models/xgb_model.pkl', 'rb') as f:
    xgb_model = pickle.load(f)

with open('../Ensemble_Model/Base_Models/kmeans_model.pkl', 'rb') as f:
    kmeans_model = pickle.load(f)

autoencoder = load_model(
    '../Ensemble_Model/Base_Models/autoencoder_model.h5',
    custom_objects={"mse": tf.keras.losses.MeanSquaredError()},
    compile=True
)



In [18]:
# Generate base model predictions

# 1️⃣ XGBoost prediction probability
xgb_probs = xgb_model.predict_proba(X)[:, 1]
print("🔹 XGBoost predicted probabilities (first 10):")
print(xgb_probs[:10])
print()

# 2️⃣ Autoencoder reconstruction error
reconstructed = autoencoder.predict(X)
autoencoder_mse = np.mean(np.power(X.values - reconstructed, 2), axis=1)
print("🔹 Autoencoder reconstruction errors (first 10):")
print(autoencoder_mse[:10])
print()


# 3️⃣ KMeans distance to closest centroid
distances = kmeans_model.transform(X)
min_distances = np.min(distances, axis=1)
print("🔹 KMeans minimum distances to centroid (first 10):")
print(min_distances[:10])
print()

# --- Summary shapes ---
print("✅ Summary:")
print(f"XGBoost probs shape: {xgb_probs.shape}")
print(f"Autoencoder MSE shape: {autoencoder_mse.shape}")
print(f"KMeans distances shape: {min_distances.shape}")

🔹 XGBoost predicted probabilities (first 10):
[0.40991944 0.45637968 0.44487512 0.45475    0.49440458 0.39849377
 0.47004652 0.39340958 0.4759608  0.43185726]

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
🔹 Autoencoder reconstruction errors (first 10):
[0.10259487 0.12237255 0.0419256  0.16071701 0.16794523 0.18830463
 0.27964411 0.20718351 0.17578931 0.08698431]

🔹 KMeans minimum distances to centroid (first 10):
[4.59866986 4.88620915 5.04265305 5.17536084 4.58525566 4.95374502
 4.09789261 5.02043555 4.04892658 5.49522128]

✅ Summary:
XGBoost probs shape: (5000,)
Autoencoder MSE shape: (5000,)
KMeans distances shape: (5000,)


In [21]:
# --- 2️⃣ Create meta-feature dataframe ---
meta_features = pd.DataFrame({
    "xgb_prob": xgb_probs,
    "autoencoder_mse": autoencoder_mse,
    "kmeans_min_dist": min_distances
})

# --- 3️⃣ Split meta-features for meta-model training ---
X_train_meta, X_test_meta, y_train_meta, y_test_meta = train_test_split(
    meta_features, y, test_size=0.2, random_state=42
)

print("Data ready for model training!")
print("Train size:", X_train_meta.shape)
print("Test size:", X_test_meta.shape)


Data ready for model training!
Train size: (4000, 3)
Test size: (1000, 3)


#### Training RF for balance dataset and with CV

In [40]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# --- 1️⃣ Cross-validation utility (SMOTE inside CV) ---
def evaluate_cv_meta(estimator, X, y, cv=5):
    """
    Evaluate a meta-model using StratifiedKFold CV with SMOTE inside.
    """
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    scoring = ['accuracy', 'precision', 'recall', 'f1']

    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('clf', estimator)
    ])

    cv_results = cross_validate(
        pipeline, X, y,
        cv=skf,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False
    )

    print("\n📊 Cross-validation results:")
    for metric in scoring:
        key = f'test_{metric}'
        scores = cv_results[key]
        print(f"  {metric:<10}: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

    return cv_results

# --- 2️⃣ Candidate meta-models ---
meta_model_1 = {
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=5, random_state=42, class_weight='balanced')
}

# --- 3️⃣ Run evaluation ---
results = {}

for name, model in meta_model_1.items():
    print(f"\n===== {name} (with SMOTE inside CV) =====")
    cv_results = evaluate_cv_meta(model, X_train_meta, y_train_meta, cv=5)

    # Fit on full training data after CV
    final_pipe = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('clf', model)
    ])
    final_pipe.fit(X_train_meta, y_train_meta)

    # Evaluate on test set
    y_pred = final_pipe.predict(X_test_meta)
    acc = accuracy_score(y_test_meta, y_pred)
    results[name] = acc

    print("\n🧾 Test set classification report:")
    print(classification_report(y_test_meta, y_pred))
    print(f"✅ Test Accuracy: {acc:.4f}")

# --- 4️⃣ Compare results ---
print("\n=== Accuracy Comparison ===")
for name, acc in results.items():
    print(f"{name:25s}: {acc:.4f}")



===== Random Forest (with SMOTE inside CV) =====

📊 Cross-validation results:
  accuracy  : 0.479 ± 0.016
  precision : 0.261 ± 0.010
  recall    : 0.611 ± 0.055
  f1        : 0.365 ± 0.018





🧾 Test set classification report:
              precision    recall  f1-score   support

           0       0.77      0.40      0.53       743
           1       0.28      0.66      0.39       257

    accuracy                           0.47      1000
   macro avg       0.52      0.53      0.46      1000
weighted avg       0.65      0.47      0.49      1000

✅ Test Accuracy: 0.4660

=== Accuracy Comparison ===
Random Forest            : 0.4660


#### Training LR for balance dataset and with CV

In [41]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# --- 1️⃣ Cross-validation utility (SMOTE inside CV) ---
def evaluate_cv_meta(estimator, X, y, cv=5):
    """
    Evaluate a meta-model using StratifiedKFold CV with SMOTE inside.
    """
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    scoring = ['accuracy', 'precision', 'recall', 'f1']

    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('clf', estimator)
    ])

    cv_results = cross_validate(
        pipeline, X, y,
        cv=skf,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False
    )

    print("\n📊 Cross-validation results:")
    for metric in scoring:
        key = f'test_{metric}'
        scores = cv_results[key]
        print(f"  {metric:<10}: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

    return cv_results

# --- 2️⃣ Candidate meta-models ---
meta_model_2 = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42)
}

# --- 3️⃣ Run evaluation ---
results = {}

for name, model in meta_model_2.items():
    print(f"\n===== {name} (with SMOTE inside CV) =====")
    cv_results = evaluate_cv_meta(model, X_train_meta, y_train_meta, cv=5)

    # Fit on full training data after CV
    final_pipe = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('clf', model)
    ])
    final_pipe.fit(X_train_meta, y_train_meta)

    # Evaluate on test set
    y_pred = final_pipe.predict(X_test_meta)
    acc = accuracy_score(y_test_meta, y_pred)
    results[name] = acc

    print("\n🧾 Test set classification report:")
    print(classification_report(y_test_meta, y_pred))
    print(f"✅ Test Accuracy: {acc:.4f}")

# --- 4️⃣ Compare results ---
print("\n=== Accuracy Comparison ===")
for name, acc in results.items():
    print(f"{name:25s}: {acc:.4f}")



===== Logistic Regression (with SMOTE inside CV) =====

📊 Cross-validation results:
  accuracy  : 0.523 ± 0.012
  precision : 0.268 ± 0.018
  recall    : 0.545 ± 0.058
  f1        : 0.359 ± 0.029

🧾 Test set classification report:
              precision    recall  f1-score   support

           0       0.77      0.52      0.62       743
           1       0.28      0.55      0.37       257

    accuracy                           0.52      1000
   macro avg       0.52      0.53      0.49      1000
weighted avg       0.64      0.52      0.55      1000

✅ Test Accuracy: 0.5240

=== Accuracy Comparison ===
Logistic Regression      : 0.5240




#### Training Gradient Boosting and SVM for balance dataset and with CV

In [42]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# --- 1️⃣ Cross-validation utility (SMOTE inside CV) ---
def evaluate_cv_meta(estimator, X, y, cv=5):
    """
    Evaluate a meta-model using StratifiedKFold CV with SMOTE inside.
    """
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    scoring = ['accuracy', 'precision', 'recall', 'f1']

    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('clf', estimator)
    ])

    cv_results = cross_validate(
        pipeline, X, y,
        cv=skf,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False
    )

    print("\n📊 Cross-validation results:")
    for metric in scoring:
        key = f'test_{metric}'
        scores = cv_results[key]
        print(f"  {metric:<10}: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

    return cv_results

# --- 2️⃣ Candidate meta-models ---
meta_model_3 = {
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42),
    "SVM (RBF)": SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)
}

# --- 3️⃣ Run evaluation ---
results = {}

for name, model in meta_model_3.items():
    print(f"\n===== {name} (with SMOTE inside CV) =====")
    cv_results = evaluate_cv_meta(model, X_train_meta, y_train_meta, cv=5)

    # Fit on full training data after CV
    final_pipe = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('clf', model)
    ])
    final_pipe.fit(X_train_meta, y_train_meta)

    # Evaluate on test set
    y_pred = final_pipe.predict(X_test_meta)
    acc = accuracy_score(y_test_meta, y_pred)
    results[name] = acc

    print("\n🧾 Test set classification report:")
    print(classification_report(y_test_meta, y_pred))
    print(f"✅ Test Accuracy: {acc:.4f}")

# --- 4️⃣ Compare results ---
print("\n=== Accuracy Comparison ===")
for name, acc in results.items():
    print(f"{name:25s}: {acc:.4f}")



===== Gradient Boosting (with SMOTE inside CV) =====

📊 Cross-validation results:
  accuracy  : 0.534 ± 0.016
  precision : 0.262 ± 0.008
  recall    : 0.494 ± 0.033
  f1        : 0.342 ± 0.013





🧾 Test set classification report:
              precision    recall  f1-score   support

           0       0.76      0.54      0.63       743
           1       0.28      0.51      0.36       257

    accuracy                           0.54      1000
   macro avg       0.52      0.53      0.50      1000
weighted avg       0.64      0.54      0.56      1000

✅ Test Accuracy: 0.5350

===== SVM (RBF) (with SMOTE inside CV) =====

📊 Cross-validation results:
  accuracy  : 0.545 ± 0.048
  precision : 0.258 ± 0.005
  recall    : 0.448 ± 0.094
  f1        : 0.324 ± 0.022





🧾 Test set classification report:
              precision    recall  f1-score   support

           0       0.78      0.19      0.31       743
           1       0.26      0.84      0.40       257

    accuracy                           0.36      1000
   macro avg       0.52      0.52      0.35      1000
weighted avg       0.64      0.36      0.33      1000

✅ Test Accuracy: 0.3580

=== Accuracy Comparison ===
Gradient Boosting        : 0.5350
SVM (RBF)                : 0.3580


| Model                   | Accuracy (Test) | Recall (Fraud=1) | F1 (Fraud=1) | Notes                                                                                                                 |
| ----------------------- | --------------- | ---------------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| **Random Forest**       | 0.466           | 0.66             | 0.39         | High recall for fraud, moderate F1, overall accuracy low                                                              |
| **Logistic Regression** | 0.524           | 0.55             | 0.37         | Slightly better overall accuracy, lower fraud recall than RF                                                          |
| **Gradient Boosting**   | 0.535           | 0.51             | 0.36         | Similar to LR, but slightly worse recall                                                                              |
| **SVM (RBF)**           | 0.358           | 0.84             | 0.40         | **Highest fraud recall**, F1 slightly better than RF, very low overall accuracy (predicts most non-fraud incorrectly) |


In [43]:
import pickle

# If Gradient Boosting pipeline used, find it in results loops. For simplicity we'll train a final pipeline here
final_model = RandomForestClassifier(random_state=42)
final_pipe = ImbPipeline([('smote', SMOTE()), ('clf', final_model)])
final_pipe.fit(X_train_meta, y_train_meta)


# Save the previously trained Random Forest model
with open('final_rf_pipeline.pkl', 'wb') as f:
    pickle.dump(final_pipe, f)

print("✅ Random Forest model saved as 'final_rf_pipeline.pkl'")




✅ Random Forest model saved as 'final_rf_pipeline.pkl'


In [44]:
# Load all trained models

xgb_model = pickle.load(open('../Ensemble_Model/Base_Models/xgb_model.pkl', 'rb'))
kmeans_model = pickle.load(open('../Ensemble_Model/Base_Models/kmeans_model.pkl', 'rb'))
autoencoder_model = load_model(
    '../Ensemble_Model/Base_Models/autoencoder_model.h5',
    custom_objects={"mse": tf.keras.losses.MeanSquaredError()},
    compile=True
)
meta_model = pickle.load(open('../Ensemble_Model/final_rf_pipeline.pkl', 'rb'))



### Sample Input

In [46]:
# --- Example Input ---
example_input = {
    "device_type": "Desktop",
    "browser": "Firefox",
    "operating_system": "Windows",
    "click_duration": 3.2,
    "scroll_depth": 60,
    "mouse_movement": 120,
    "keystrokes_detected": 40,
    "click_frequency": 7,
    "time_since_last_click": 72,
    "device_ip_reputation": "Good",
    "VPN_usage": 0,
    "proxy_usage": 1,
    "ip_country": "United States",
    "ip_city": "Los Angeles",
    "ip_asn": "AS15169",
    "ip_org": "Google LLC",
    "ip_is_datacenter": 0,
    "fraud_score": 3
}

example_df = pd.DataFrame([example_input])

In [47]:
# --- Clean and handle categorical features ---
categorical_cols = [
    "device_type", "browser", "operating_system",
    "device_ip_reputation", "ip_country", "ip_city",
    "ip_asn", "ip_org"
]

# Handle unseen labels gracefully
for col in categorical_cols:
    if "Unknown" not in label_encoders[col].classes_:
        label_encoders[col].classes_ = np.append(label_encoders[col].classes_, "Unknown")

    value = example_df[col].iloc[0]
    if value not in label_encoders[col].classes_:
        example_df[col] = ["Unknown"]
    example_df[col] = label_encoders[col].transform(example_df[col])

    print(f"Encoded {col}: {example_df[col].iloc[0]}")

Encoded device_type: 0
Encoded browser: 2
Encoded operating_system: 2
Encoded device_ip_reputation: 1
Encoded ip_country: 117
Encoded ip_city: 725
Encoded ip_asn: 1228
Encoded ip_org: 1148


In [51]:
# --- Numeric & Derived Features ---
numeric_cols = [
    "device_type", "browser", "operating_system",
    "click_duration", "scroll_depth", "mouse_movement",
    "keystrokes_detected", "click_frequency",
    "time_since_last_click", "device_ip_reputation",
    "VPN_usage", "proxy_usage",
    "ip_country", "ip_city", "ip_asn", "ip_org", "ip_is_datacenter",
    "click_intensity", "engagement_ratio", "inactivity_ratio",
    "input_activity", "ip_risk_flag", "fraud_score"
]

epsilon = 1e-5
example_df['click_duration'] = example_df['click_duration'].fillna(0)
example_df['click_frequency'] = example_df['click_frequency'].fillna(0)
example_df['scroll_depth'] = example_df['scroll_depth'].fillna(0)
example_df['mouse_movement'] = example_df['mouse_movement'].fillna(0)
example_df['keystrokes_detected'] = example_df['keystrokes_detected'].fillna(0)
example_df['time_since_last_click'] = example_df['time_since_last_click'].fillna(0)

example_df['click_intensity'] = example_df['click_frequency'] / (example_df['click_duration'] + epsilon)
example_df['engagement_ratio'] = (example_df['scroll_depth'] + example_df['mouse_movement']) / (example_df['click_duration'] + 1)
example_df['inactivity_ratio'] = example_df['time_since_last_click'] / (example_df['click_duration'] + 1)
example_df['input_activity'] = example_df['keystrokes_detected'] + example_df['mouse_movement']

if example_df['device_ip_reputation'].dtype == object:
    example_df['ip_risk_flag'] = (
        (example_df['device_ip_reputation'].isin(['Poor', 'Unknown'])) |
        (example_df['ip_is_datacenter'] == 1) |
        (example_df['VPN_usage'] == 1) |
        (example_df['proxy_usage'] == 1)
    ).astype(int)
else:
    example_df['ip_risk_flag'] = (
        (example_df.get('ip_is_datacenter', 0) == 1) |
        (example_df.get('VPN_usage', 0) == 1) |
        (example_df.get('proxy_usage', 0) == 1)
    ).astype(int)

# --- Scale numeric features ---
example_df[numeric_cols] = scaler.transform(example_df[numeric_cols].fillna(0))
example_df = example_df[numeric_cols]  # reorder columns

# --- Generate base model outputs ---
# 1️⃣ XGBoost probability
xgb_prob = xgb_model.predict_proba(example_df)[:, 1][0]

# 2️⃣ Autoencoder reconstruction error
reconstructed = autoencoder_model.predict(example_df)
autoencoder_mse = np.mean(np.power(example_df.values - reconstructed, 2), axis=1)[0]

# 3️⃣ KMeans distance to nearest cluster
distances = kmeans_model.transform(example_df)
kmeans_min_dist = np.min(distances, axis=1)[0]

# --- Combine into meta-feature vector ---
meta_features_input = pd.DataFrame([{
    "xgb_prob": xgb_prob,
    "autoencoder_mse": autoencoder_mse,
    "kmeans_min_dist": kmeans_min_dist
}])

# --- Predict using saved Random Forest meta-model ---
predicted_label = meta_model.predict(meta_features_input)[0]
predicted_prob = (
    meta_model.predict_proba(meta_features_input)[0, 1]
    if hasattr(meta_model, "predict_proba")
    else None
)

# --- Output Results ---
print("\n🧠 Meta-Feature Vector:")
print(meta_features_input)
print("\n🎯 Predicted Class:", "Fraudulent" if predicted_label == 1 else "Legitimate")
if predicted_prob is not None:
    print(f"🔹 Predicted Probability of Fraud: {predicted_prob:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step

🧠 Meta-Feature Vector:
   xgb_prob  autoencoder_mse  kmeans_min_dist
0  0.484639    153761.433184      3696.543541

🎯 Predicted Class: Legitimate
🔹 Predicted Probability of Fraud: 0.3600
