# Feature Engineering

## Handling Categorical Features

In [None]:
from pathlib import Path
import pandas as pd

In [None]:
# Relative path: resolved against the current working directory at run time.
cleaned_path = Path("../datasets/cleaned.csv")

# Load the CSV into the `cars` DataFrame that the following cells transform.
cars = pd.read_csv(cleaned_path)

# Preview the first rows (shown as the cell output).
cars.head()

In [None]:
# Summary statistics restricted to the object-dtype columns.
cars.describe(include='object')

### name

- We will use target encoding with smoothing to avoid overfitting.
- In the future, we will fit the encoder inside the cross-validation loop, so that each fold's encoding is learned only from its training rows.

In [None]:
# Number of rows per distinct `name` value, most frequent first.
cnt_name = cars['name'].value_counts()
# Bare expression so the notebook displays the counts.
cnt_name

In [None]:
from category_encoders import TargetEncoder


# Target encoder applied to the `name` column only (`cols=["name"]`).
# - `smoothing=10`: blends each category's target mean with the overall mean;
#   larger values mean stronger regularisation.
# - `handle_unknown="value"` / `handle_missing="value"`: unseen or missing
#   categories are encoded with the overall target mean instead of raising.
encoder = TargetEncoder(
    cols=["name"],
    smoothing=10,
    handle_unknown="value",
    handle_missing="value"
)

In [None]:
# Fit the target encoder against `selling_price` and transform `cars` in one
# step; only the `name` column is encoded.
# NOTE(review): the encoder is fitted on the same rows it transforms, so each
# encoded value is influenced by that row's own `selling_price`. Fit on
# training folds only before using this for model evaluation.
cars = encoder.fit_transform(cars, cars['selling_price'])
cars.head()

In [None]:
# After target encoding, `name` holds a numeric effect rather than a label,
# so expose it under the clearer column name `name_effect`.
cars = cars.rename(columns={'name': 'name_effect'})
cars.head()

### fuel

We will one-hot encode it

In [None]:
# Number of rows per distinct `fuel` value, most frequent first.
cnt_fuel = cars['fuel'].value_counts()
# Bare expression so the notebook displays the counts.
cnt_fuel

In [None]:
from sklearn.preprocessing import OneHotEncoder

# `sparse_output=False` makes the encoder return a dense NumPy array, which
# can be wrapped directly in a DataFrame.
# NOTE(review): this single instance is re-fitted for every column below, so
# after each `fit_transform` it only remembers the most recent column.
one_hot_encoder = OneHotEncoder(sparse_output=False)

In [None]:
# One-hot encode `fuel`: fit the encoder on that single column, then wrap the
# resulting array in a DataFrame that shares the index of `cars` so the two
# can be concatenated column-wise.
fuel_matrix = one_hot_encoder.fit_transform(cars[['fuel']])
fuel_columns = one_hot_encoder.get_feature_names_out()
fuel_output = pd.DataFrame(fuel_matrix, columns=fuel_columns, index=cars.index)

# Append the indicator columns to the right of the existing features.
cars = pd.concat([cars, fuel_output], axis=1)
cars.head()

In [None]:
# The raw `fuel` labels are redundant once their indicator columns exist.
cars = cars.drop(columns=['fuel'])
cars.head()

### seller_type

We will one-hot encode it

In [None]:
# Number of rows per distinct `seller_type` value, most frequent first.
cnt_seller_type = cars['seller_type'].value_counts()
# Bare expression so the notebook displays the counts.
cnt_seller_type

In [None]:
# One-hot encode `seller_type`: fit the encoder on that single column, then
# wrap the resulting array in a DataFrame that shares the index of `cars` so
# the two can be concatenated column-wise.
seller_type_matrix = one_hot_encoder.fit_transform(cars[['seller_type']])
seller_type_columns = one_hot_encoder.get_feature_names_out()
seller_type_output = pd.DataFrame(
    seller_type_matrix,
    columns=seller_type_columns,
    index=cars.index,
)

# Append the indicator columns to the right of the existing features.
cars = pd.concat([cars, seller_type_output], axis=1)
cars.head()

In [None]:
# The raw `seller_type` labels are redundant once their indicator columns exist.
cars = cars.drop(columns=['seller_type'])
cars.head()

### transmission

We will also use one-hot encoding.

In [None]:
# Number of rows per distinct `transmission` value, most frequent first.
cnt_transmission = cars['transmission'].value_counts()
# Bare expression so the notebook displays the counts.
cnt_transmission

In [None]:
# One-hot encode `transmission`: fit the encoder on that single column, then
# wrap the resulting array in a DataFrame that shares the index of `cars` so
# the two can be concatenated column-wise.
transmission_matrix = one_hot_encoder.fit_transform(cars[['transmission']])
transmission_columns = one_hot_encoder.get_feature_names_out()
transmission_output = pd.DataFrame(
    transmission_matrix,
    columns=transmission_columns,
    index=cars.index,
)

# Append the indicator columns to the right of the existing features.
cars = pd.concat([cars, transmission_output], axis=1)
cars.head()

In [None]:
# The raw `transmission` labels are redundant once their indicator columns exist.
cars = cars.drop(columns=['transmission'])
cars.head()

### owner

Let's check whether the order of the owner levels is important.

In [None]:
# Number of rows per distinct `owner` value, most frequent first.
cnt_owner = cars['owner'].value_counts()
# Bare expression so the notebook displays the counts.
cnt_owner

In [None]:
# Split `cars` into one subset per ownership level. Rows whose `owner` is not
# one of the four named levels form the "test car" group.
named_owner_levels = (
    "First Owner",
    "Second Owner",
    "Third Owner",
    "Fourth & Above Owner",
)
owner_column = cars['owner']

test_car_level = cars[~owner_column.isin(named_owner_levels)]
first_owner_level = cars[owner_column == "First Owner"]
second_owner_level = cars[owner_column == "Second Owner"]
third_owner_level = cars[owner_column == "Third Owner"]
fourth_and_above_owner_level = cars[owner_column == "Fourth & Above Owner"]

def print_mean_pct(name, df):
    """Print the average ``selling_price`` of ``df``, labelled with ``name``.

    Args:
        name: Label of the owner group, used only in the printed message.
        df: DataFrame holding the rows of that group. When it has rows, it
            must contain a ``selling_price`` column.

    Prints a "No data available" line when ``df`` has no rows or when the
    mean is NaN; otherwise prints the mean with two decimals.
    """
    # Only touch the column when there are rows, so an empty frame without a
    # `selling_price` column is still reported as "no data".
    average = df['selling_price'].mean() if len(df) else None

    if average is None or pd.isna(average):
        print(f"With owner: {name} => No data available")
        return

    print(f"With owner: {name} => {average:.2f} is the average of prices")

# Report the average selling price of every owner group, in the same order
# as the subsets were built.
for owner_label, owner_rows in (
    ("test car", test_car_level),
    ("First Owner", first_owner_level),
    ("Second Owner", second_owner_level),
    ("Third Owner", third_owner_level),
    ("Fourth & above Owner", fourth_and_above_owner_level),
):
    print_mean_pct(owner_label, owner_rows)


We can see that, in terms of average selling price: Fourth & Above Owner < Third Owner < Second Owner < First Owner < Test Drive Car

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Owner levels listed in the order their integer codes should follow: the
# encoder assigns 0 to the first entry and 4 to the last.
owner_levels_ascending = [
    "Fourth & Above Owner",
    "Third Owner",
    "Second Owner",
    "First Owner",
    "Test Drive Car",
]

# Any value outside the list is encoded as -1 instead of raising an error.
ordinal_encoder = OrdinalEncoder(
    categories=[owner_levels_ascending],
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

In [None]:
# Fit the ordinal encoder on `owner` and overwrite the column with the
# resulting codes. The double brackets keep the input 2-D, as the encoder
# expects, and match the 2-D output on assignment.
cars[['owner']] = ordinal_encoder.fit_transform(cars[['owner']])

cars.head()

In [None]:
# (rows, columns) of `cars` after the encodings above.
cars.shape