# 📓  05_data_transformation.ipynb (Scaling & Encoding)

In [1]:
# Imports & Settings
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Show up to 100 columns when a DataFrame is rendered, so wide frames are not truncated
pd.options.display.max_columns = 100


In [2]:
# Read the feature-engineered fraud dataset from the processed-data folder
features_csv_path = '../data/processed/fraud_data_final_features.csv'
df = pd.read_csv(features_csv_path)

# Report (rows, columns), then preview the first five records as the cell output
print(f"ðŸ“Š Dataset shape: {df.shape}")
df.head(5)


ðŸ“Š Dataset shape: (129146, 22)


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,source,browser,sex,age,class,purchase_hour,purchase_day,is_high_risk_country,time_since_signup_hours,purchase_day_of_week,new_customer_1hr,new_customer_24hr,new_customer_7days,high_value_purchase,very_high_value_purchase,device_frequency,unique_users_per_device,suspicious_device
0,247547,2015-06-28 03:00:34,2015-08-09 03:57:29,47,SEO,Safari,F,30,0,3,Sunday,0,1008.948611,6,0,0,0,0,0,1,1,0
1,220737,2015-01-28 14:21:11,2015-02-11 20:28:28,15,SEO,Chrome,F,34,0,20,Wednesday,0,342.121389,2,0,0,0,0,0,1,1,0
2,390400,2015-03-19 20:49:09,2015-04-11 23:41:23,44,Ads,IE,M,29,0,23,Saturday,0,554.870556,5,0,0,0,0,0,2,2,0
3,69592,2015-02-24 06:11:57,2015-05-23 16:40:14,55,Direct,Chrome,F,30,0,16,Saturday,0,2122.471389,5,0,0,0,1,0,1,1,0
4,174987,2015-07-07 12:58:11,2015-11-03 04:04:30,51,SEO,Chrome,F,37,0,4,Tuesday,0,2847.105278,1,0,0,0,1,0,1,1,0


In [4]:
# Separate the label from the model inputs
target = 'class'

# Columns excluded from X: the label itself, the user identifier and the two
# raw timestamp columns.
non_feature_cols = [target, 'user_id', 'signup_time', 'purchase_time']

# Label vector
y = df[target]

# Feature frame; errors='ignore' lets the drop succeed even when one of the
# listed columns is absent from df.
X = df.drop(columns=non_feature_cols, errors='ignore')

print(f"ðŸŽ¯ Features shape: {X.shape}")
print(f"ðŸŽ¯ Target distribution:\n{y.value_counts(normalize=True)}")


ðŸŽ¯ Features shape: (129146, 18)
ðŸŽ¯ Target distribution:
class
0    0.905007
1    0.094993
Name: proportion, dtype: float64


In [5]:
# Group the model inputs by how the preprocessor will treat them

# Continuous / count-like columns -> standardised by the scaler
numerical_features = [
    'purchase_value', 'age',
    'purchase_hour', 'purchase_day_of_week',
    'time_since_signup_hours',
    'device_frequency', 'unique_users_per_device',
]

# String-valued columns -> one-hot encoded
categorical_features = ['source', 'browser', 'sex']

# 0/1 indicator columns -> passed through untouched (no scaling)
binary_features = [
    'new_customer_1hr', 'new_customer_24hr', 'new_customer_7days',
    'high_value_purchase', 'very_high_value_purchase',
    'suspicious_device',
    'is_high_risk_country',
]

print("ðŸ“Œ Feature groups defined")


ðŸ“Œ Feature groups defined


In [6]:
# Stratified hold-out split: 80% train / 20% test

# stratify=y keeps the class proportions of the target the same in both
# partitions; the fixed random_state makes the split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# The mean of the 0/1 label is the fraud rate of each partition
print("âœ… Train-test split completed")
print(f"Train fraud rate: {y_train.mean():.4%}")
print(f"Test fraud rate:  {y_test.mean():.4%}")


âœ… Train-test split completed
Train fraud rate: 9.4990%
Test fraud rate:  9.5006%


In [7]:
# Build the preprocessing pipelines

# Numeric columns: standardise using statistics learned when the pipeline is fitted
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns: dense one-hot encoding, dropping the first level of each
# feature so it acts as the reference category (avoids the dummy variable trap).
# NOTE(review): combined with drop='first', handle_unknown='ignore' encodes a
# category unseen at fit time as all zeros, i.e. the same row as the dropped
# reference level — confirm this is acceptable for scoring new data.
onehot_encoder = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    sparse_output=False,
)
categorical_transformer = Pipeline([('onehot', onehot_encoder)])

# Route each feature group to its transformer. Output columns follow the order
# of this list: scaled numerics, then one-hot columns, then the binary flags.
# remainder='drop' is the default, spelled out here: any column of X that is
# not in one of the three groups is discarded.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
        ('bin', 'passthrough', binary_features),
    ],
    remainder='drop',
)

print("âœ… Preprocessing pipeline created")


âœ… Preprocessing pipeline created


In [8]:
# Fit the preprocessor and transform both partitions
print("ðŸ”§ Applying transformations...")

# Learn scaling statistics and category levels from the training partition only...
X_train_processed = preprocessor.fit_transform(X_train)
# ...then reuse the fitted preprocessor on the test partition (no refit, no leakage)
X_test_processed = preprocessor.transform(X_test)

print("âœ… Transformation completed")
print(f"Processed train shape: {X_train_processed.shape}")
print(f"Processed test shape:  {X_test_processed.shape}")


ðŸ”§ Applying transformations...
âœ… Transformation completed
Processed train shape: (103316, 21)
Processed test shape:  (25830, 21)


In [9]:
# Recover the column names of the transformed matrices (VERY IMPORTANT)

# Scaled numeric columns keep their original names
num_features = numerical_features

# The fitted one-hot encoder reports the names of the dummy columns it produced
fitted_onehot = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_features = fitted_onehot.get_feature_names_out(categorical_features)

# Concatenate in the same order the ColumnTransformer lists its transformers:
# numeric -> one-hot -> binary passthrough
final_feature_names = [*num_features, *cat_features, *binary_features]

print(f"ðŸŽ¯ Total final features: {len(final_feature_names)}")
final_feature_names


ðŸŽ¯ Total final features: 21


['purchase_value',
 'age',
 'purchase_hour',
 'purchase_day_of_week',
 'time_since_signup_hours',
 'device_frequency',
 'unique_users_per_device',
 'source_Direct',
 'source_SEO',
 'browser_FireFox',
 'browser_IE',
 'browser_Opera',
 'browser_Safari',
 'sex_M',
 'new_customer_1hr',
 'new_customer_24hr',
 'new_customer_7days',
 'high_value_purchase',
 'very_high_value_purchase',
 'suspicious_device',
 'is_high_risk_country']

In [10]:
# Sanity Checks (Critical)

# 1) Width check: every column of the transformed matrix must have a name.
assert X_train_processed.shape[1] == len(final_feature_names), \
    "Feature count mismatch!"

# 2) Order check: a matching count alone does not prove alignment, so compare
#    the hand-built names with the names the fitted ColumnTransformer reports.
#    Its names carry a "<transformer>__" prefix (e.g. "num__age"); strip it
#    before comparing.
reported_feature_names = [
    name.split('__', 1)[-1] for name in preprocessor.get_feature_names_out()
]
assert list(final_feature_names) == reported_feature_names, \
    "Feature name order mismatch!"

print("âœ… Feature name alignment confirmed")


âœ… Feature name alignment confirmed


In [11]:
# Persist the transformed matrices, the labels and the column names for modeling
print("ðŸ’¾ Saving transformed datasets...")

# Target path -> array; written in insertion order as .npy files
arrays_to_save = {
    '../data/processed/X_train.npy': X_train_processed,
    '../data/processed/X_test.npy': X_test_processed,
    '../data/processed/y_train.npy': y_train.values,
    '../data/processed/y_test.npy': y_test.values,
}
for npy_path, array in arrays_to_save.items():
    np.save(npy_path, array)

# Column names of the saved matrices, one per row, without the index column
feature_name_series = pd.Series(final_feature_names)
feature_name_series.to_csv('../data/processed/final_feature_names.csv', index=False)

print("âœ… All transformed data saved")


ðŸ’¾ Saving transformed datasets...
âœ… All transformed data saved
