In [None]:
# 04 – Feature Engineering

This notebook applies the transformations chosen during EDA to produce a final modeling dataset:
- Brand extraction & cleanup  
- Log‐transforms of skewed features  
- Outlier winsorization  
- Categorical encoding  
- PCA for size/power components  
- Interaction terms  
- Final dataset preview & export  


In [5]:
import os
import numpy as np
import pandas as pd
import warnings
from scipy.stats import mstats
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

warnings.filterwarnings("ignore")

# Paths
RAW_CSV    = "../data/raw/CarPrice_Assignment.csv"
OUTPUT_CSV = "../data/processed/car_price_features.csv"

# Ensure output directory exists
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)

# 1) Load raw data
df = pd.read_csv(RAW_CSV)
print("Raw data shape:", df.shape)

# 2) Brand extraction & grouping
df["brand"] = (
    df["CarName"]
      .str.split().str[0]
      .str.lower()
      .replace({
         "maxda":"mazda",
         "porcshce":"porsche",
         "toyouta":"toyota",
         "vokswagen":"volkswagen",
         "vw":"volkswagen"
      })
)
vc = df["brand"].value_counts()
common = vc[vc >= 10].index
df["brand"] = df["brand"].where(df["brand"].isin(common), "other")
df.drop("CarName", axis=1, inplace=True)

# 3) Log-transform skewed features (including target)
skewed_cols = ["price", "enginesize", "horsepower", "compressionratio"]
for col in skewed_cols:
    df["log_" + col] = np.log(df[col])

# 4) Winsorize outliers on key numerics
winsor_cols = ["price", "enginesize", "horsepower", "curbweight"]
for col in winsor_cols:
    df[col] = mstats.winsorize(df[col], limits=[0.01,0.01])

# 5) Categorical encoding
one_hot_cols = ["brand", "fueltype", "aspiration", "carbody", "drivewheel", "enginelocation"]
df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)
le = LabelEncoder()
for col in df.select_dtypes(include="object").columns:
    df[col] = le.fit_transform(df[col])

# 6) PCA on size/performance features
size_feats = ["wheelbase","carlength","carwidth","carheight","curbweight","enginesize","horsepower"]
scaler     = StandardScaler()
X_scaled   = scaler.fit_transform(df[size_feats])
pca        = PCA(n_components=2, random_state=42)
pcs        = pca.fit_transform(X_scaled)
df["size_pc"], df["power_pc"] = pcs[:,0], pcs[:,1]
print("PCA explained variance ratios:", pca.explained_variance_ratio_)

# 7) Interaction terms
df["hp_x_size"]    = df["horsepower"] * df["enginesize"]
df["weight_x_mpg"] = df["curbweight"] * df["citympg"]

# 8) Final preview & export
print("Final feature matrix shape:", df.shape)
display(df.head())
df.to_csv(OUTPUT_CSV, index=False)
print("Features saved to", OUTPUT_CSV)


Raw data shape: (205, 26)
PCA explained variance ratios: [0.68926905 0.19560028]
Final feature matrix shape: (205, 46)


Unnamed: 0,car_ID,symboling,doornumber,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,...,carbody_hatchback,carbody_sedan,carbody_wagon,drivewheel_fwd,drivewheel_rwd,enginelocation_rear,size_pc,power_pc,hp_x_size,weight_x_mpg
0,1,3,1,88.6,168.8,64.1,48.8,2548,0,2,...,False,False,False,False,True,False,-1.469742,-2.201764,14430,53508
1,2,3,1,88.6,168.8,64.1,48.8,2548,0,2,...,False,False,False,False,True,False,-1.469742,-2.201764,14430,53508
2,3,1,1,94.5,171.2,65.5,52.4,2823,5,3,...,True,False,False,False,True,False,0.374351,-1.562812,23408,53637
3,4,2,0,99.8,176.6,66.2,54.3,2337,3,2,...,False,True,False,True,False,False,-0.115033,0.456311,11118,56088
4,5,2,0,99.4,176.6,66.4,54.3,2824,3,1,...,False,True,False,False,False,False,0.690276,-0.030998,15640,50832


Features saved to ../data/processed/car_price_features.csv
