# 04 – Data Preprocessing (PySpark)

Here we prepare the fused data for machine learning. Using Spark’s ML library, we index and one‑hot encode the categorical variables, assemble all features into a single vector, standardise that vector, and perform a train/test split. We store the fitted pipeline and the data splits for subsequent modelling.

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
import os

# Start (or reuse) the Spark session used throughout this notebook.
spark = SparkSession.builder.appName('CTR_Preprocessing').getOrCreate()

# Inputs and outputs of this notebook all live under ../data/processed.
processed_dir = os.path.join('..', 'data', 'processed')
fused_path = os.path.join(processed_dir, 'hybrid_fusion.parquet')

# The fused dataset is expected as Parquet from a previous notebook.
# The existence check is made against the local filesystem.
full_df = None
if os.path.exists(fused_path):
    full_df = spark.read.parquet(fused_path)

if full_df is None:
    # Nothing to preprocess: produce hybrid_fusion.parquet first, or point
    # fused_path at wherever it was written.
    raise FileNotFoundError('Hybrid fused dataset not found.  Please run the EDA notebook to generate it.')

# Identify target and feature columns
target_col = 'clk'

# Rows without a label cannot be used for training or evaluation.
data = full_df.dropna(subset=[target_col])

# Every column except the target is a candidate feature.
feature_cols = [c for c in data.columns if c != target_col]

# Type names (as reported by DataFrame.dtypes) that VectorAssembler accepts:
# numeric types, booleans and ML vectors.  Decimals carry precision/scale in
# their name (e.g. 'decimal(10,2)'), so they are matched by prefix below.
_ASSEMBLABLE_DTYPES = frozenset(
    {'tinyint', 'smallint', 'int', 'bigint', 'float', 'double', 'boolean', 'vector'}
)


def _is_assemblable(dtype):
    """Return True when a column of Spark type name *dtype* can be fed to VectorAssembler."""
    return dtype in _ASSEMBLABLE_DTYPES or dtype.startswith('decimal')


# String columns are treated as categorical; assemblable columns go into the
# feature vector as-is.
categorical_cols = [c for c, dtype in data.dtypes if c != target_col and dtype == 'string']
numeric_cols = [c for c, dtype in data.dtypes if c != target_col and _is_assemblable(dtype)]

# Anything else (timestamps, dates, arrays, structs, binary, ...) would make
# VectorAssembler fail, so leave it out and say so instead of crashing later.
unsupported_cols = [
    (c, dtype)
    for c, dtype in data.dtypes
    if c != target_col and dtype != 'string' and not _is_assemblable(dtype)
]
if unsupported_cols:
    print(f'Skipping columns with unsupported types: {unsupported_cols}')

# One StringIndexer -> OneHotEncoder pair per categorical column.
# handleInvalid='keep' makes the indexer tolerate values it did not see when it was fitted.
indexers = []
encoders = []
for col_name in categorical_cols:
    indexed_name = f'{col_name}_indexed'
    indexers.append(StringIndexer(inputCol=col_name, outputCol=indexed_name, handleInvalid='keep'))
    encoders.append(OneHotEncoder(inputCol=indexed_name, outputCol=f'{col_name}_ohe'))

# Feature vector layout: all one-hot blocks first, then the numeric columns.
# NOTE(review): VectorAssembler is left at its default handleInvalid, so null
# values in numeric_cols are not handled here - confirm upstream cleaning.
assembler_inputs = [enc.getOutputCol() for enc in encoders] + numeric_cols
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol='raw_features')

# The scaler receives the whole assembled vector (one-hot slots included),
# not only the numeric columns, and runs with its default settings.
scaler = StandardScaler(inputCol='raw_features', outputCol='features')

# Stage order: index -> encode -> assemble -> scale.
pipeline = Pipeline(stages=[*indexers, *encoders, assembler, scaler])

# Split the raw rows BEFORE fitting anything.  The pipeline learns category
# vocabularies (StringIndexer) and scaling statistics (StandardScaler); fitting
# it on the full dataset would leak information from the test rows into the
# preprocessing applied to the training rows.
train_raw, test_raw = data.randomSplit([0.8, 0.2], seed=42)

# Fit on the training split only.  Categories that occur only in the test
# split are handled by the indexers' handleInvalid='keep'.
pipeline_model = pipeline.fit(train_raw)

# Apply the fitted pipeline to each split, keeping only what the models need.
output_cols = ['features', target_col]
train_df = pipeline_model.transform(train_raw).select(*output_cols)
test_df = pipeline_model.transform(test_raw).select(*output_cols)

# Full transformed dataset, kept under the same name and path as before.
processed_df = pipeline_model.transform(data).select(*output_cols)

# Save processed data and splits
outputs = (
    (processed_df, 'processed_for_model.parquet'),
    (train_df, 'train_df.parquet'),
    (test_df, 'test_df.parquet'),
)
for frame, file_name in outputs:
    frame.write.mode('overwrite').parquet(os.path.join(processed_dir, file_name))

# Save the fitted pipeline so the same transformation can be re-applied later.
pipeline_model.write().overwrite().save(os.path.join(processed_dir, 'preprocessing_pipeline'))

print('Preprocessing complete – data splits and pipeline saved.')
