# Feature Engineering

## Handling Categorical Features

In [None]:
from pathlib import Path
import pandas as pd

In [None]:
cleaned_path = Path("../datasets/cleaned.csv")

cars = pd.read_csv(cleaned_path)

cars.head()

In [None]:
cars.describe(include='object')

### name

- We will use target encoding with smoothing to avoid overfitting
- In the future, we will fit the encoder inside the cross-validation loop

In [None]:
cnt_name = cars['name'].value_counts()
cnt_name

In [None]:
from category_encoders import TargetEncoder


encoder = TargetEncoder(
    cols=["name"],
    smoothing=10,
    handle_unknown="value",
    handle_missing="value"
)

In [None]:
# Target-encode 'name' against the raw selling_price (the log1p transform of
# the target is only applied later in this notebook).
# NOTE(review): the encoder is fitted on every row here, so each row's encoded
# value has seen its own target. As the markdown note above says, this should
# move inside the CV loop before the feature is used to evaluate a model.
cars = encoder.fit_transform(cars, cars['selling_price'])
cars.head()

In [None]:
# change the name of the column 'name' to 'name_effect' for the cars dataframe
cars.rename(columns={'name': 'name_effect'}, inplace=True)
cars.head()

In [None]:
cars['name_effect'].hist(bins=200)

### fuel

We will one-hot encode it

In [None]:
cnt_fuel = cars['fuel'].value_counts()
cnt_fuel

replace 'CNG' and 'LPG' with 'other'

In [None]:
# Fold the 'CNG' and 'LPG' levels into a single 'other' level, then show the
# resulting level counts.
merged_fuels = ['CNG', 'LPG']
cars['fuel'] = cars['fuel'].replace(merged_fuels, 'other')
cars['fuel'].value_counts()

In [None]:
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder(sparse_output=False)

In [None]:
# One-hot encode 'fuel': fit the shared encoder on the column, wrap the
# encoded array in a frame aligned to cars' index, and append the new columns.
fuel_encoded = one_hot_encoder.fit_transform(cars[['fuel']])
fuel_columns = one_hot_encoder.get_feature_names_out()
fuel_output = pd.DataFrame(fuel_encoded, columns=fuel_columns, index=cars.index)

cars = pd.concat([cars, fuel_output], axis=1)
cars.head()

In [None]:
cars.drop(columns=['fuel'], inplace=True)
cars.head()

### seller_type

We will one-hot encode it

In [None]:
cnt_seller_type = cars['seller_type'].value_counts()
cnt_seller_type

In [None]:
# One-hot encode 'seller_type': refit the shared encoder on this column, wrap
# the encoded array in a frame aligned to cars' index, and append the columns.
seller_type_encoded = one_hot_encoder.fit_transform(cars[['seller_type']])
seller_type_columns = one_hot_encoder.get_feature_names_out()
seller_type_output = pd.DataFrame(
    seller_type_encoded,
    columns=seller_type_columns,
    index=cars.index,
)

cars = pd.concat([cars, seller_type_output], axis=1)
cars.head()

In [None]:
cars.drop(columns=['seller_type'], inplace=True)
cars.head()

### transmission

We will also use one-hot encoding

In [None]:
cnt_transmission = cars['transmission'].value_counts()
cnt_transmission

In [None]:
# One-hot encode 'transmission': refit the shared encoder on this column, wrap
# the encoded array in a frame aligned to cars' index, and append the columns.
transmission_encoded = one_hot_encoder.fit_transform(cars[['transmission']])
transmission_columns = one_hot_encoder.get_feature_names_out()
transmission_output = pd.DataFrame(
    transmission_encoded,
    columns=transmission_columns,
    index=cars.index,
)

cars = pd.concat([cars, transmission_output], axis=1)
cars.head()

In [None]:
cars.drop(columns=['transmission'], inplace=True)
cars.head()

### owner

Let's see if the order is important

In [None]:
cnt_owner = cars['owner'].value_counts()
cnt_owner

Replace 'Third Owner' and 'Fourth & Above Owner' with 'Third & Above Owner', and drop the rows with 'Test Drive Car'

In [None]:
# Collapse the third-and-later ownership levels into one bucket.
merged_levels = ['Third Owner', 'Fourth & Above Owner']
cars['owner'] = cars['owner'].replace(merged_levels, 'Third & Above Owner')

# Discard the test-drive cars and renumber the remaining rows from 0.
is_test_drive = cars['owner'] == 'Test Drive Car'
cars = cars[~is_test_drive].reset_index(drop=True)

cars['owner'].value_counts()

In [None]:
first_owner_level = cars[cars['owner'] == "First Owner"]
second_owner_level = cars[cars['owner'] == "Second Owner"]
third_owner_level = cars[cars['owner'] == "Third & Above Owner"]

def print_mean_pct(name, df):
    """Print the average selling price of one ownership group.

    Args:
        name: Label of the ownership group, used in the printed message.
        df: Rows of that group; must have a 'selling_price' column unless
            it has no rows at all.

    Prints "No data available" when the frame has no rows or when the mean
    of 'selling_price' is NaN; otherwise prints the mean with two decimals.
    """
    # A frame without rows is treated like a NaN mean so that both cases
    # share the single "no data" message below.
    average = df['selling_price'].mean() if len(df) else float('nan')

    if pd.isna(average):
        print(f"With owner: {name} => No data available")
        return

    print(f"With owner: {name} => {average:.2f} is the average of prices")

# Report the average selling price for each ownership level. The last label
# matches the merged "Third & Above Owner" level that third_owner_level holds
# (it was previously printed as "Third Owner").
print_mean_pct("First Owner", first_owner_level)
print_mean_pct("Second Owner", second_owner_level)
print_mean_pct("Third & Above Owner", third_owner_level)

We can see that the average price is ordered: Third & Above < Second < First

In [None]:
from sklearn.preprocessing import OrdinalEncoder


ordinal_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
    categories=[["Third & Above Owner", "Second Owner", "First Owner"]]
)

In [None]:
cars[['owner']] = ordinal_encoder.fit_transform(cars[['owner']])

cars.head()

In [None]:
cars.shape

## Handling Numerical Features

In [None]:
cars.info()

### year

In [None]:
cars['year'].hist(bins=50)

In [None]:
# Age of each car in years, measured against a fixed reference year so the
# result does not depend on when the notebook is run.
REFERENCE_YEAR = 2026
cars['age'] = REFERENCE_YEAR - cars['year']
cars.head()

In [None]:
cars['age'].hist(bins=50)

In [None]:
import numpy as np

cars['age'] = np.log1p(cars['age'])

In [None]:
cars['age'].hist(bins=50)

In [None]:
cars.drop(columns=['year'], inplace=True)

In [None]:
cars.head()

### engine

In [None]:
cars['engine'].hist(bins=50)

In [None]:
# Replace engine with its natural log.
# NOTE(review): this uses np.log, whereas age, mileage, max_power and
# selling_price use np.log1p in this notebook; a zero here would become -inf.
# Confirm the cleaned data only has strictly positive engine values.
cars['engine'] = np.log(cars['engine'])

### mileage

In [None]:
cars['mileage'].hist(bins=50)

In [None]:
cars['mileage'] = np.log1p(cars['mileage'])

### max_power

In [None]:
cars['max_power'].hist(bins=50)

In [None]:
cars['max_power'] = np.log1p(cars['max_power'])

### km_driven

In [None]:
cars['km_driven'].hist(bins=50)

In [None]:
# Replace km_driven with log1p(km_driven), the same transform this notebook
# applies to age, mileage, max_power and selling_price. Unlike a plain log,
# log1p maps a zero reading to 0 instead of -inf, which would otherwise leak
# into the correlations, the derived features and the saved CSV.
cars['km_driven'] = np.log1p(cars['km_driven'])

### seats

In [None]:
cars['seats'].hist(bins=50)

In [None]:
summary_table = cars.groupby('seats')['selling_price'].agg(['mean', 'median', 'std', 'count'])
summary_table

In [None]:
# Bucket the seat count into three named groups. Rows that satisfy none of
# the three comparisons receive the 'missing' label.
seat_count = cars['seats']
conditions = [seat_count < 5, seat_count == 5, seat_count > 5]
choices = ['less_than_five', 'five', 'more_than_five']

# The default is a string, like the choices, so np.select never has to
# promote between string and numeric dtypes.
cars['seats'] = np.select(conditions, choices, default='missing')

# Store the labels as a categorical column before encoding.
cars['seats'] = cars['seats'].astype('category')

cars['seats'].value_counts()

In [None]:
# One-hot encode the bucketed 'seats' column: refit the shared encoder, wrap
# the encoded array in a frame aligned to cars' index, and append the columns.
seats_encoded = one_hot_encoder.fit_transform(cars[['seats']])
seats_columns = one_hot_encoder.get_feature_names_out()
seats_output = pd.DataFrame(seats_encoded, columns=seats_columns, index=cars.index)

cars = pd.concat([cars, seats_output], axis=1)
cars.head()

In [None]:
cars.drop(columns=['seats'], inplace=True)
cars.tail()

### selling_price

In [None]:
cars['selling_price'] = np.log1p(cars['selling_price'])

We will drop the rows whose selling price is an outlier

In [None]:
# Drop the rows whose selling_price lies outside the 1.5 * IQR fences.
n_before = len(cars)
price = cars['selling_price']

# Quartiles and the fences derived from them
Q1, Q3 = price.quantile(0.25), price.quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

# True for the rows to keep (between is inclusive at both fences)
mask = price.between(lower, upper)
n_out = (~mask).sum()
print(f'Removing {n_out} outlier rows ({n_out/n_before:.2%})')

# Rebind cars to the kept rows, renumbered from 0
cars = cars.loc[mask].reset_index(drop=True)

In [None]:
cars.head()

## Trying New Combinations of Features

In [None]:
corr_matrix = cars.corr()

In [None]:
corr_matrix['selling_price'].sort_values(ascending=False)

In [None]:
cars_cp = cars.copy()

cars_cp['km_driven_by_age'] = cars_cp['km_driven'] * cars_cp['age']

corr_matrix = cars_cp.corr()
corr_matrix['selling_price'].sort_values(ascending=False)

In [None]:
corr_matrix['age'].sort_values(ascending=False)

In [None]:
cars_cp["mileage_per_year"] = cars_cp["mileage"] / (cars_cp["age"] + 1)
corr_matrix = cars_cp.corr()
corr_matrix['selling_price'].sort_values(ascending=False)

In [None]:
corr_matrix['engine'].sort_values(ascending=False)

In [None]:
cars_cp['engine_max_power'] = cars_cp['engine'] * cars_cp['max_power']

corr_matrix = cars_cp.corr()
corr_matrix['selling_price'].sort_values(ascending=False)

In [None]:
corr_matrix['mileage'].sort_values(ascending=False)

In [None]:
cars_cp["high_mileage"] = (cars_cp["mileage"] > cars_cp["mileage"].median()).astype(int)
corr_matrix = cars_cp.corr()
corr_matrix['selling_price'].sort_values(ascending=False)

In [None]:
cars_cp['engine_mileage_interaction'] = cars_cp['engine'] / cars_cp['mileage']

corr_matrix = cars_cp.corr()
corr_matrix['selling_price'].sort_values(ascending=False)

In [None]:
cars_cp['owner_per_year'] = cars_cp['owner'] / cars_cp['age']
corr_matrix = cars_cp.corr()
corr_matrix['selling_price'].sort_values(ascending=False)

### We will add the following features:
- cars_cp['engine_mileage_interaction'] = cars_cp['engine'] / cars_cp['mileage']
- cars_cp["mileage_per_year"] = cars_cp["mileage"] / (cars_cp["age"] + 1)
- cars_cp['km_driven_by_age'] = cars_cp['km_driven'] * cars_cp['age']
- cars_cp['owner_per_year'] = cars_cp['owner'] / cars_cp['age']

In [None]:
# Add the four selected interaction features in one step, in the same column
# order as before. Every right-hand side reads columns that already exist on
# cars, so the new columns do not depend on one another.
# NOTE(review): 'mileage' and 'age' are used as denominators; a zero in
# either would produce inf/NaN in the saved data — confirm neither can be 0.
cars = cars.assign(
    engine_mileage_interaction=cars['engine'] / cars['mileage'],
    mileage_per_year=cars['mileage'] / (cars['age'] + 1),
    km_driven_by_age=cars['km_driven'] * cars['age'],
    owner_per_year=cars['owner'] / cars['age'],
)

In [None]:
cars.head()

In [None]:
cars.info()

In [None]:
preprocessed_path = Path("../datasets/preprocessed.csv")

cars.to_csv(preprocessed_path, index=False)