# 04 – Data Preprocessing

This notebook prepares the fused data for modelling.  We encode categorical variables, scale numeric features, address class imbalance and split the data into training and test sets.

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

# Load hybrid fused data
processed_dir = os.path.join(os.path.pardir, 'data', 'processed')
full_df = pd.read_csv(os.path.join(processed_dir, 'hybrid_fusion.csv'))

# Define target and features
target = 'clk'
features = full_df.drop(columns=[target])
y = full_df[target]

# Identify categorical and numeric columns
categorical_cols = features.select_dtypes(include=['object']).columns
numeric_cols = features.select_dtypes(include=[np.number]).columns

# Preprocess: one‑hot encode categoricals, scale numerics
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numeric_cols)
    ])

# Address class imbalance using SMOTE after train–test split
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, stratify=y, random_state=42)

print('Before SMOTE class distribution:', y_train.value_counts(normalize=True))

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print('After SMOTE class distribution:', y_train_balanced.value_counts(normalize=True))

# Save preprocessed splits
X_train_balanced.to_csv(os.path.join(processed_dir, 'X_train.csv'), index=False)
X_test.to_csv(os.path.join(processed_dir, 'X_test.csv'), index=False)
y_train_balanced.to_csv(os.path.join(processed_dir, 'y_train.csv'), index=False)
y_test.to_csv(os.path.join(processed_dir, 'y_test.csv'), index=False)

# Save preprocessor for use in modelling
import joblib
joblib.dump(preprocessor, os.path.join(processed_dir, 'preprocessor.pkl'))

print('Preprocessing complete and files saved.')


Before SMOTE class distribution: clk
0    0.9531
1    0.0469
Name: proportion, dtype: float64


ValueError: could not convert string to float: '2017-05-13 00:51:49'