In [1]:
import pandas as pd
import os

file_paths = [
    r"C:\Users\Aadya Nair\OneDrive\Documents\Projects\Dataset\03-01-2018.csv",
    r"C:\Users\Aadya Nair\OneDrive\Documents\Projects\Dataset\02-28-2018.csv",
    r"C:\Users\Aadya Nair\OneDrive\Documents\Projects\Dataset\02-21-2018.csv",
    r"C:\Users\Aadya Nair\OneDrive\Documents\Projects\Dataset\02-16-2018.csv",
    r"C:\Users\Aadya Nair\OneDrive\Documents\Projects\Dataset\03-02-2018.csv",
    r"C:\Users\Aadya Nair\OneDrive\Documents\Projects\Dataset\02-14-2018.csv"
]

df_list = [pd.read_csv(fp,low_memory=False) for fp in file_paths]
df = pd.concat(df_list, ignore_index=True)

print(f"Combined shape: {df.shape}")

Combined shape: (5138529, 80)


In [2]:
df_sample = df.sample(n=100_000, random_state=42).copy()

from sklearn.preprocessing import LabelEncoder

# Extract and encode the target labels
y_raw = df_sample['Label']
le = LabelEncoder()
y = le.fit_transform(y_raw)


In [3]:
# Convert to numeric
df_converted = df_sample.apply(pd.to_numeric, errors='coerce')

# Drop columns with >50% missing values
threshold = 0.5
df_reduced = df_converted.loc[:, df_converted.isnull().mean() < threshold]

# Fill remaining NaNs with column means
df_cleaned = df_reduced.copy()
for col in df_cleaned.columns:
    if df_cleaned[col].isnull().any():
        df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].mean())

print(f"Cleaned shape: {df_cleaned.shape}")

Cleaned shape: (100000, 78)


In [4]:
import numpy as np

# Check for NaNs
print("NaNs:", df_cleaned.isna().sum().sum())

# Check for Infs
print("Infs:",np.isinf(df_cleaned.values).sum())

NaNs: 0
Infs: 652


Since the count is zero we need to clean them up

##### Clean the data

In [5]:
# Replace inf/-inf with NaN
df_cleaned.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill remaining NaNs with column means
df_cleaned = df_cleaned.apply(lambda col: col.fillna(col.mean()) if col.dtype != 'object' else col)

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_cleaned.select_dtypes(include=[np.number]))


### Interaction Feature

In [7]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

input_dim = X_scaled.shape[1]
encoding_dim = 32  # You can tune this

# Define layers
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='linear')(encoded)

# Build model
autoencoder = Model(inputs=input_layer, outputs=decoded)
encoder = Model(inputs=input_layer, outputs=encoded)

# Compile and train
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=20, batch_size=256, shuffle=True)

# Get compressed features
X_encoded = encoder.predict(X_scaled)


Epoch 1/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.4201
Epoch 2/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.1395
Epoch 3/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.0771
Epoch 4/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0486
Epoch 5/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0378
Epoch 6/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.0291
Epoch 7/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0244
Epoch 8/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.0237
Epoch 9/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0193
Epoch 10/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - lo

In [8]:
import pandas as pd
import numpy as np

# Select numeric columns
numeric_cols = df_cleaned.select_dtypes(include=[np.number]).columns

# Compute correlation matrix
corr_matrix = df_cleaned[numeric_cols].corr().abs()

# Get upper triangle of correlation matrix
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find pairs with correlation > 0.7 (tune this threshold as needed)
high_corr_pairs = [(col1, col2) for col1 in upper_tri.columns for col2 in upper_tri.index if upper_tri.loc[col2, col1] > 0.7]

# Create interaction features
for col1, col2 in high_corr_pairs:
    new_col_name = f"{col1}_x_{col2}"
    df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
    
df_cleaned = df_cleaned.copy()

print(f"✅ Created {len(high_corr_pairs)} interaction features.")


  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name

✅ Created 137 interaction features.


In [9]:
df_cleaned = df_cleaned.copy()


### Train and Extract Embedding

In [10]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

# Define input dimension
input_dim = X_scaled.shape[1]
encoding_dim = 32  # Bottleneck size

# Encoder
input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(64, activation='relu')(encoded)
bottleneck = Dense(encoding_dim, activation='relu')(encoded)

# Decoder
decoded = Dense(64, activation='relu')(bottleneck)
decoded = Dense(128, activation='relu')(decoded)
output_layer = Dense(input_dim, activation='linear')(decoded)

# Build models
autoencoder = Model(inputs=input_layer, outputs=output_layer)
encoder = Model(inputs=input_layer, outputs=bottleneck)

# Compile and train
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=20, batch_size=256, shuffle=True, verbose=1)

# Extract embeddings
X_embeddings = encoder.predict(X_scaled)

# Add embeddings to DataFrame
for i in range(encoding_dim):
    df_cleaned[f'embed_{i}'] = X_embeddings[:, i]

print(f"✅ Added {encoding_dim} autoencoder embeddings to your dataset.")

Epoch 1/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - loss: 0.2446
Epoch 2/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 0.0709
Epoch 3/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 0.0503
Epoch 4/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 0.0870
Epoch 5/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 0.0306
Epoch 6/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 0.0382
Epoch 7/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 0.0684
Epoch 8/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 0.0417
Epoch 9/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 0.0469
Epoch 10/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - lo

In [13]:
%pip install imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.14.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# 1. Prepare Final Feature Matrix
X_final = df_cleaned.copy()
y_final = df_sample['Label'] 
assert len(X_final) == len(y_final), "Mismatch in feature and label lengths!"

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_final_encoded = le.fit_transform(y_final)

# 2. Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final_encoded, test_size=0.2, stratify=y_final_encoded, random_state = 42)

# 3. Class Balancing with SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42, k_neighbors=1)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# 4. Define Models
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    "HGB": HistGradientBoostingClassifier(),
    "TabNet": TabNetClassifier(verbose=0, device_name='cuda' if torch.cuda.is_available() else 'cpu')
}

# 5. Train and Evaluate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.preprocessing import label_binarize

results = {}

for name, model in models.items():
    print(f"\n 🔍 Training {name}...")

    if name == "TabNet":
        model.fit(X_train_balanced.values, y_train_balanced, eval_set=[(X_test.values, y_test)], patience=10)
        y_pred = model.predict(X_test.values)
        y_proba = model.predict_proba(X_test.values)
    else:
        model.fit(X_train_balanced, y_train_balanced)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)

        # Pad y_proba if some classes are missing
        expected_classes = np.unique(y_train_balanced)
        if y_proba.shape[1] != len(expected_classes):
            missing = len(expected_classes) - y_proba.shape[1]
            pad = np.zeros((y_proba.shape[0], missing))
            y_proba = np.hstack([y_proba, pad])


    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    roc = roc_auc_score(label_binarize(y_test, classes=np.unique(y_test)), y_proba, average='weighted', multi_class='ovr')

    results[name] = {"Accuracy": acc, "F1 Score": f1, "ROC-AUC": roc}
    print(classification_report(y_test, y_pred, target_names=le.classes_))

# 6. Display Results
for model_name, scores in results.items():
    print(f"\n 📊 {model_name} Results:")
    for metric, value in scores.items():
        print(f"{metric}: {value:.4f}")

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
