In [40]:
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler


df = pd.read_csv('../../click_fraud_dataset.csv')

In [41]:
# Drop unnecessary columns
drop_cols = ["click_id", "timestamp", "user_id", "referrer_url", "page_url", "ad_position", "bot_likelihood_score"]
df = df.drop(columns=drop_cols)

In [42]:
import geoip2.database

city_db = "../GeoLite2/GeoLite2-City.mmdb"
asn_db = "../GeoLite2/GeoLite2-ASN.mmdb"

reader_city = geoip2.database.Reader(city_db)
reader_asn = geoip2.database.Reader(asn_db)

def enrich_ip(ip):
    data = {
        "ip_country": None,
        "ip_city": None,
        "ip_asn": None,
        "ip_org": None,
        "ip_is_datacenter": 0
    }
    try:
        city = reader_city.city(ip)
        asn = reader_asn.asn(ip)

        data["ip_country"] = city.country.name
        data["ip_city"] = city.city.name
        data["ip_asn"] = asn.autonomous_system_number
        data["ip_org"] = asn.autonomous_system_organization

        # crude datacenter detection
        dc_keywords = ['Amazon', 'Google', 'OVH', 'Microsoft', 'DigitalOcean', 'Hetzner', 'Linode']
        if any(k in (asn.autonomous_system_organization or "") for k in dc_keywords):
            data["ip_is_datacenter"] = 1
    except Exception:
        pass
    return pd.Series(data)


In [43]:
# Example use:
ip_info = enrich_ip("203.94.72.111")
print(ip_info)

ip_country                           Sri Lanka
ip_city                                Colombo
ip_asn                                    9329
ip_org              Sri Lanka Telecom Internet
ip_is_datacenter                             0
dtype: object


In [44]:
df = df.join(df['ip_address'].apply(enrich_ip))
df.head()

Unnamed: 0,ip_address,device_type,browser,operating_system,click_duration,scroll_depth,mouse_movement,keystrokes_detected,click_frequency,time_since_last_click,device_ip_reputation,VPN_usage,proxy_usage,is_fraudulent,ip_country,ip_city,ip_asn,ip_org,ip_is_datacenter
0,141.36.49.37,Tablet,Safari,Android,0.29,60,111,8,7,72,Good,0,1,0,,,,,0.0
1,216.29.19.201,Desktop,Opera,iOS,0.64,25,452,29,9,201,Suspicious,0,0,0,United States,,174.0,COGENT-174,0.0
2,167.133.41.231,Tablet,Safari,Linux,0.42,36,431,18,9,326,Good,0,1,0,,,,,0.0
3,216.146.33.78,Tablet,Edge,macOS,4.29,29,472,37,4,33,Suspicious,0,0,0,United States,Ashburn,31898.0,ORACLE-BMC-31898,0.0
4,146.37.54.245,Desktop,Opera,Windows,2.46,94,50,2,7,97,Good,0,0,0,,,,,0.0


In [45]:
from user_agents import parse

def parse_user_agent(ua_str):
    try:
        ua = parse(ua_str)
        return pd.Series({
            "ua_browser": ua.browser.family,
            "ua_os": ua.os.family,
            "ua_device": ua.device.family,
            "ua_is_mobile": ua.is_mobile,
            "ua_is_tablet": ua.is_tablet,
            "ua_is_pc": ua.is_pc,
            "ua_is_bot": ua.is_bot
        })
    except Exception:
        return pd.Series({
            "ua_browser": None,
            "ua_os": None,
            "ua_device": None,
            "ua_is_mobile": False,
            "ua_is_tablet": False,
            "ua_is_pc": False,
            "ua_is_bot": False
        })

# Example:
ua_info = parse_user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/117.0.0.0 Safari/537.36")
print(ua_info)


ua_browser       Chrome
ua_os           Windows
ua_device         Other
ua_is_mobile      False
ua_is_tablet      False
ua_is_pc           True
ua_is_bot         False
dtype: object


In [46]:
import ipaddress

def is_private_ip(ip):
    try:
        return ipaddress.ip_address(ip).is_private
    except:
        return False

def fraud_score(row):
    score = 0

    # 1️⃣ IP-based
    if row.get("ip_is_datacenter") == 1:
        score += 3
    if is_private_ip(row.get("ip_address", "")):
        score += 2
    if row.get("VPN_usage") == 1 or row.get("proxy_usage") == 1:
        score += 2

    # 2️⃣ Device & OS mismatch
    os = str(row.get("operating_system", "")).lower()
    browser = str(row.get("browser", "")).lower()
    device = str(row.get("device_type", "")).lower()

    if "windows" in os and "safari" in browser:
        score += 2
    if device == "mobile" and row.get("mouse_movement", 0) > 0:
        score += 1

    # 3️⃣ User-agent analysis (if parsed)
    if row.get("ua_is_bot", False):
        score += 4

    # 4️⃣ Behavior
    if row.get("click_duration", 10) < 1.0:
        score += 1

    return score

# Apply to dataset
df["fraud_score"] = df.apply(fraud_score, axis=1)

# Decide labels
df["predicted_label"] = (df["fraud_score"] >= 4).astype(int)  # 1=fraudulent, 0=genuine

df[["ip_address", "ip_country", "device_type", "browser", "operating_system", "fraud_score", "predicted_label"]].head()


Unnamed: 0,ip_address,ip_country,device_type,browser,operating_system,fraud_score,predicted_label
0,141.36.49.37,,Tablet,Safari,Android,3,0
1,216.29.19.201,United States,Desktop,Opera,iOS,1,0
2,167.133.41.231,,Tablet,Safari,Linux,3,0
3,216.146.33.78,United States,Tablet,Edge,macOS,0,0
4,146.37.54.245,,Desktop,Opera,Windows,0,0


In [47]:
# Convert all ASN values to string and fill NaN with "Unknown"
df["ip_asn"] = df["ip_asn"].astype(str).fillna("Unknown")

# Optional: Clean numeric-only ASNs (make them consistent like 'AS12345')
df["ip_asn"] = df["ip_asn"].apply(lambda x: f"AS{x}" if x.isdigit() else x)
df.head()


Unnamed: 0,ip_address,device_type,browser,operating_system,click_duration,scroll_depth,mouse_movement,keystrokes_detected,click_frequency,time_since_last_click,...,VPN_usage,proxy_usage,is_fraudulent,ip_country,ip_city,ip_asn,ip_org,ip_is_datacenter,fraud_score,predicted_label
0,141.36.49.37,Tablet,Safari,Android,0.29,60,111,8,7,72,...,0,1,0,,,,,0.0,3,0
1,216.29.19.201,Desktop,Opera,iOS,0.64,25,452,29,9,201,...,0,0,0,United States,,174.0,COGENT-174,0.0,1,0
2,167.133.41.231,Tablet,Safari,Linux,0.42,36,431,18,9,326,...,0,1,0,,,,,0.0,3,0
3,216.146.33.78,Tablet,Edge,macOS,4.29,29,472,37,4,33,...,0,0,0,United States,Ashburn,31898.0,ORACLE-BMC-31898,0.0,0,0
4,146.37.54.245,Desktop,Opera,Windows,2.46,94,50,2,7,97,...,0,0,0,,,,,0.0,0,0


In [48]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns
label_encoders = {}
for col in ["device_type", "browser", "operating_system", "device_ip_reputation", "ip_country", "ip_city", "ip_asn", "ip_org"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df.head()

Unnamed: 0,ip_address,device_type,browser,operating_system,click_duration,scroll_depth,mouse_movement,keystrokes_detected,click_frequency,time_since_last_click,...,VPN_usage,proxy_usage,is_fraudulent,ip_country,ip_city,ip_asn,ip_org,ip_is_datacenter,fraud_score,predicted_label
0,141.36.49.37,2,4,0,0.29,60,111,8,7,72,...,0,1,0,122,1417,1227,1147,0.0,3,0
1,216.29.19.201,0,3,3,0.64,25,452,29,9,201,...,0,0,0,117,1416,251,179,0.0,1,0
2,167.133.41.231,2,4,1,0.42,36,431,18,9,326,...,0,1,0,122,1417,1227,1147,0.0,3,0
3,216.146.33.78,2,1,4,4.29,29,472,37,4,33,...,0,0,0,117,61,615,689,0.0,0,0
4,146.37.54.245,0,3,2,2.46,94,50,2,7,97,...,0,0,0,122,1417,1227,1147,0.0,0,0


In [49]:
# Drop ip_address after enrichment
df = df.drop(columns=['ip_address'])
df.head()

Unnamed: 0,device_type,browser,operating_system,click_duration,scroll_depth,mouse_movement,keystrokes_detected,click_frequency,time_since_last_click,device_ip_reputation,VPN_usage,proxy_usage,is_fraudulent,ip_country,ip_city,ip_asn,ip_org,ip_is_datacenter,fraud_score,predicted_label
0,2,4,0,0.29,60,111,8,7,72,1,0,1,0,122,1417,1227,1147,0.0,3,0
1,0,3,3,0.64,25,452,29,9,201,2,0,0,0,117,1416,251,179,0.0,1,0
2,2,4,1,0.42,36,431,18,9,326,1,0,1,0,122,1417,1227,1147,0.0,3,0
3,2,1,4,4.29,29,472,37,4,33,2,0,0,0,117,61,615,689,0.0,0,0
4,0,3,2,2.46,94,50,2,7,97,1,0,0,0,122,1417,1227,1147,0.0,0,0


In [50]:
df.shape

(5000, 20)

In [51]:
from sklearn.preprocessing import StandardScaler

# Define numeric columns
numeric_cols = ['click_duration',
                'scroll_depth',
                'mouse_movement',
                'keystrokes_detected',
                'click_frequency',
                'time_since_last_click',
                'device_ip_reputation',
                'device_type',
                'browser',
                'operating_system',
                'ip_country',
                'ip_city',
                'ip_asn',
                'ip_org',
                'VPN_usage',
                'proxy_usage',
                'ip_is_datacenter']

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

print("Scaling completed!")


Scaling completed!


In [52]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=["is_fraudulent", "predicted_label"])

# Option A
y = df["is_fraudulent"]

# Option B
# y = df["predicted_label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Data ready for model training!")
print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

Data ready for model training!
Train size: (4000, 18)
Test size: (1000, 18)


In [53]:
# Load all trained base models

with open('../Ensemble_Model/Base_Models/xgb_model.pkl', 'rb') as f:
    xgb_model = pickle.load(f)

with open('../Ensemble_Model/Base_Models/kmeans_model.pkl', 'rb') as f:
    kmeans_model = pickle.load(f)

autoencoder = load_model(
    '../Ensemble_Model/Base_Models/autoencoder_model.h5',
    custom_objects={"mse": tf.keras.losses.MeanSquaredError()},
    compile=True
)



In [54]:
# Generate base model predictions

# 1️⃣ XGBoost prediction probability
xgb_probs = xgb_model.predict_proba(X)[:, 1]

# 2️⃣ Autoencoder reconstruction error
reconstructed = autoencoder.predict(X)
autoencoder_mse = np.mean(np.power(X.values - reconstructed, 2), axis=1)

# 3️⃣ KMeans distance to closest centroid
distances = kmeans_model.transform(X)
min_distances = np.min(distances, axis=1)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [61]:
# --- 2️⃣ Create meta-feature dataframe ---
meta_features = pd.DataFrame({
    "xgb_prob": xgb_probs,
    "autoencoder_mse": autoencoder_mse,
    "kmeans_min_dist": min_distances
})

# --- 3️⃣ Split meta-features for meta-model training ---
X_train_meta, X_test_meta, y_train_meta, y_test_meta = train_test_split(
    meta_features, y, test_size=0.2, random_state=42
)

In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [62]:
# Define candidate meta-models
meta_models_1 = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# Train and evaluate each model
results = {}

for name, model in meta_models_1.items():
    model.fit(X_train_meta, y_train_meta)
    y_pred = model.predict(X_test_meta)
    acc = accuracy_score(y_test_meta, y_pred)
    results[name] = acc
    print(f"\n{name} Results:")
    print(classification_report(y_test_meta, y_pred))
    print(f"Accuracy: {acc:.4f}")

# Compare models
print("\nAccuracy Comparison:")
for name, acc in results.items():
    print(f"{name:25s}: {acc:.4f}")


Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.83      0.96      0.89       743
           1       0.81      0.45      0.58       257

    accuracy                           0.83      1000
   macro avg       0.82      0.71      0.74      1000
weighted avg       0.83      0.83      0.81      1000

Accuracy: 0.8310

Random Forest Results:
              precision    recall  f1-score   support

           0       0.84      0.93      0.89       743
           1       0.72      0.50      0.59       257

    accuracy                           0.82      1000
   macro avg       0.78      0.72      0.74      1000
weighted avg       0.81      0.82      0.81      1000

Accuracy: 0.8220

Accuracy Comparison:
Logistic Regression      : 0.8310
Random Forest            : 0.8220


In [63]:
# Define candidate meta-models
meta_models_2 = {
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "SVM (RBF)": SVC(kernel='rbf', probability=True, random_state=42)
}

# Train and evaluate each model
results = {}

for name, model in meta_models_2.items():
    model.fit(X_train_meta, y_train_meta)
    y_pred = model.predict(X_test_meta)
    acc = accuracy_score(y_test_meta, y_pred)
    results[name] = acc
    print(f"\n{name} Results:")
    print(classification_report(y_test_meta, y_pred))
    print(f"Accuracy: {acc:.4f}")

# Compare models
print("\nAccuracy Comparison:")
for name, acc in results.items():
    print(f"{name:25s}: {acc:.4f}")


Gradient Boosting Results:
              precision    recall  f1-score   support

           0       0.85      0.94      0.89       743
           1       0.74      0.52      0.61       257

    accuracy                           0.83      1000
   macro avg       0.80      0.73      0.75      1000
weighted avg       0.82      0.83      0.82      1000

Accuracy: 0.8300

SVM (RBF) Results:
              precision    recall  f1-score   support

           0       0.78      0.99      0.87       743
           1       0.92      0.18      0.31       257

    accuracy                           0.79      1000
   macro avg       0.85      0.59      0.59      1000
weighted avg       0.82      0.79      0.73      1000

Accuracy: 0.7860

Accuracy Comparison:
Gradient Boosting        : 0.8300
SVM (RBF)                : 0.7860


In [64]:
# Save Logistic Regression model
import pickle

with open('../Ensemble_Model/stacking_meta_model.pkl', 'wb') as f:
    pickle.dump(meta_models_2["Gradient Boosting"], f)

print("\nStacking Ensemble Model Saved Successfully.")


Stacking Ensemble Model Saved Successfully.


In [66]:
# Load all trained models

xgb_model = pickle.load(open('../Ensemble_Model/Base_Models/xgb_model.pkl', 'rb'))
kmeans_model = pickle.load(open('../Ensemble_Model/Base_Models/kmeans_model.pkl', 'rb'))
autoencoder_model = load_model(
    '../Ensemble_Model/Base_Models/autoencoder_model.h5',
    custom_objects={"mse": tf.keras.losses.MeanSquaredError()},
    compile=True
)
meta_model = pickle.load(open('../Ensemble_Model/stacking_meta_model.pkl', 'rb'))



### Sample Input

In [67]:
# --- 1️⃣ Prepare sample input ---
sample_input = {
    "device_type": 2,
    "browser": 3,
    "operating_system": 1,
    "click_duration": 4.5,
    "scroll_depth": 300,
    "mouse_movement": 1200,
    "keystrokes_detected": 25,
    "click_frequency": 1.2,
    "time_since_last_click": 0.8,
    "device_ip_reputation": 0.6,
    "VPN_usage": 0,
    "proxy_usage": 0,
    "ip_country": 5,
    "ip_city": 8,
    "ip_asn": 2,
    "ip_org": 1,
    "ip_is_datacenter": 0,
    "fraud_score": 0.4,
}

example_df = pd.DataFrame([sample_input])

# --- 2️⃣ Preprocess same as training ---
numeric_cols = list(sample_input.keys())

# ❗ Use the same scaler used in training (DO NOT FIT AGAIN)
example_df[numeric_cols] = scaler.fit_transform(example_df[numeric_cols])

# --- 3️⃣ Generate base model outputs ---

# a) Autoencoder: compute reconstruction error
reconstructed = autoencoder.predict(example_df)
autoencoder_mse = np.mean(np.power(example_df - reconstructed, 2), axis=1)

# b) KMeans: get cluster distance (or label)
min_dist = np.min(kmeans_model.transform(example_df), axis=1)

# c) XGBoost base prediction
xgb_prob = xgb_model.predict_proba(example_df)[:, 1]  # probability of fraud

# --- 4️⃣ Combine base model features into one meta-feature vector ---
meta_features = pd.DataFrame({
    "xgb_prob": xgb_prob,
    "autoencoder_mse": autoencoder_mse,
    "kmeans_min_dist": min_dist
})

# --- 5️⃣ Make final stacking prediction ---
final_pred = meta_model.predict(meta_features)
final_prob = meta_model.predict_proba(meta_features)[:, 1]

print(f"✅ Final Ensemble Prediction: {'Fraudulent' if final_pred[0] == 1 else 'Genuine'}")
print(f"🔹 Fraud Probability: {final_prob[0]:.4f}")
print(f"🔸 Autoencoder Error: {autoencoder_mse[0]:.6f}")
print(f"🔸 KMeans Min Dist: {min_dist[0]:.4f}")
print(f"🔸 XGB Probability: {xgb_prob[0]:.4f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
✅ Final Ensemble Prediction: Fraudulent
🔹 Fraud Probability: 0.7581
🔸 Autoencoder Error: 0.097274
🔸 KMeans Min Dist: 0.9258
🔸 XGB Probability: 0.7775
