In [27]:
import pandas as pd
import os

file_paths = [
    r"C:\Users\Aadya Nair\OneDrive\Documents\Projects\Dataset\03-01-2018.csv",
    r"C:\Users\Aadya Nair\OneDrive\Documents\Projects\Dataset\02-28-2018.csv",
    r"C:\Users\Aadya Nair\OneDrive\Documents\Projects\Dataset\02-21-2018.csv",
    r"C:\Users\Aadya Nair\OneDrive\Documents\Projects\Dataset\02-16-2018.csv",
    r"C:\Users\Aadya Nair\OneDrive\Documents\Projects\Dataset\03-02-2018.csv",
    r"C:\Users\Aadya Nair\OneDrive\Documents\Projects\Dataset\02-14-2018.csv"
]

df_list = [pd.read_csv(fp,low_memory=False) for fp in file_paths]
df = pd.concat(df_list, ignore_index=True)

print(f"Combined shape: {df.shape}")

Combined shape: (5138529, 80)


In [28]:
df_sample = df.sample(n=100_000, random_state=42).copy()

from sklearn.preprocessing import LabelEncoder

# Extract and encode the target labels
y_raw = df_sample['Label']
le = LabelEncoder()
y = le.fit_transform(y_raw)


In [29]:
# Convert to numeric
df_converted = df_sample.apply(pd.to_numeric, errors='coerce')

# Drop columns with >50% missing values
threshold = 0.5
df_reduced = df_converted.loc[:, df_converted.isnull().mean() < threshold]

# Fill remaining NaNs with column means
df_cleaned = df_reduced.copy()
for col in df_cleaned.columns:
    if df_cleaned[col].isnull().any():
        df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].mean())

print(f"Cleaned shape: {df_cleaned.shape}")

Cleaned shape: (100000, 78)


In [30]:
import numpy as np

# Check for NaNs
print("NaNs:", df_cleaned.isna().sum().sum())

# Check for Infs
print("Infs:",np.isinf(df_cleaned.values).sum())

NaNs: 0
Infs: 652


Since the count is zero we need to clean them up

##### Clean the data

In [31]:
# Replace inf/-inf with NaN
df_cleaned.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill remaining NaNs with column means
df_cleaned = df_cleaned.apply(lambda col: col.fillna(col.mean()) if col.dtype != 'object' else col)

In [32]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_cleaned.select_dtypes(include=[np.number]))


### Interaction Feature

In [33]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

input_dim = X_scaled.shape[1]
encoding_dim = 32  # You can tune this

# Define layers
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='linear')(encoded)

# Build model
autoencoder = Model(inputs=input_layer, outputs=decoded)
encoder = Model(inputs=input_layer, outputs=encoded)

# Compile and train
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=20, batch_size=256, shuffle=True)

# Get compressed features
X_encoded = encoder.predict(X_scaled)


Epoch 1/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.4558
Epoch 2/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.1501
Epoch 3/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0825
Epoch 4/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.0465
Epoch 5/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0335
Epoch 6/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.0265
Epoch 7/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0241
Epoch 8/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0220
Epoch 9/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0215
Epoch 10/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - lo

In [34]:
import pandas as pd
import numpy as np

# Select numeric columns
numeric_cols = df_cleaned.select_dtypes(include=[np.number]).columns

# Compute correlation matrix
corr_matrix = df_cleaned[numeric_cols].corr().abs()

# Get upper triangle of correlation matrix
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find pairs with correlation > 0.7 (tune this threshold as needed)
high_corr_pairs = [(col1, col2) for col1 in upper_tri.columns for col2 in upper_tri.index if upper_tri.loc[col2, col1] > 0.7]

# Create interaction features
for col1, col2 in high_corr_pairs:
    new_col_name = f"{col1}_x_{col2}"
    df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
    
df_cleaned = df_cleaned.copy()

print(f"✅ Created {len(high_corr_pairs)} interaction features.")


  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name] = df_cleaned[col1] * df_cleaned[col2]
  df_cleaned[new_col_name

✅ Created 137 interaction features.


In [35]:
df_cleaned = df_cleaned.copy()


### Train and Extract Embedding

In [36]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

# Define input dimension
input_dim = X_scaled.shape[1]
encoding_dim = 32  # Bottleneck size

# Encoder
input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(64, activation='relu')(encoded)
bottleneck = Dense(encoding_dim, activation='relu')(encoded)

# Decoder
decoded = Dense(64, activation='relu')(bottleneck)
decoded = Dense(128, activation='relu')(decoded)
output_layer = Dense(input_dim, activation='linear')(decoded)

# Build models
autoencoder = Model(inputs=input_layer, outputs=output_layer)
encoder = Model(inputs=input_layer, outputs=bottleneck)

# Compile and train
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=20, batch_size=256, shuffle=True, verbose=1)

# Extract embeddings
X_embeddings = encoder.predict(X_scaled)

# Add embeddings to DataFrame
for i in range(encoding_dim):
    df_cleaned[f'embed_{i}'] = X_embeddings[:, i]

print(f"✅ Added {encoding_dim} autoencoder embeddings to your dataset.")

Epoch 1/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - loss: 0.2197
Epoch 2/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 0.0515
Epoch 3/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - loss: 0.0325
Epoch 4/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 0.0400
Epoch 5/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 0.0565
Epoch 6/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - loss: 0.0442
Epoch 7/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - loss: 0.0415
Epoch 8/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - loss: 0.0457
Epoch 9/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 0.0467
Epoch 10/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - lo