# Understand & Visualize the training set

In [None]:
from pathlib import Path
import pandas as pd

In [None]:
# Relative path to the training CSV.
train_path = Path("../datasets/train.csv")

# Load the training data into a DataFrame.
cars = pd.read_csv(train_path)

# Preview the first rows.
cars.head()

In [None]:
cars.info()

## Categorical Features

In [None]:
cars.describe(include=['object']).T

### name

In [None]:
# Put the resolved parent directory on the module search path so that
# `import utils` below can be resolved (presumably utils lives one level up — confirm).
from pathlib import Path
import sys

sys.path.append(str(Path("..").resolve()))

import utils


In [None]:
utils.draw_bar_graph(cars, "name")

In [None]:
cnt_name = cars['name'].value_counts()

In [None]:
cnt_name[cnt_name >= 10]

In [None]:
# Draw a box plot of selling_price for the most popular car names,
# i.e. names that occur at least MIN_NAME_FREQ times in the training set.
MIN_NAME_FREQ = 25
most_popular_names = cnt_name[cnt_name >= MIN_NAME_FREQ].index
subset = cars[cars['name'].isin(most_popular_names)].copy()
# Temporarily attach each row's name frequency so rows can be ordered
# most-frequent first, then drop the helper column again.
subset['name_freq'] = subset['name'].map(cnt_name)
subset = subset.sort_values(by='name_freq', ascending=False).drop(columns='name_freq')
utils.draw_box_plot(subset, "name", "selling_price")

Now let's take the log of the target

In [None]:
import numpy as np

In [None]:
# Log-transform the target with log1p and plot the transformed column,
# so the box plot actually shows the log-scaled prices per car name.
subset['log_selling_price'] = np.log1p(subset['selling_price'])
utils.draw_box_plot(subset, "name", "log_selling_price")

For the 'name' feature:
- We have 1890 unique values
- min frequency = 1, max frequency = 100
- 134 values have a frequency >= 10
- 26 values have a frequency >= 25
- the name feature has a large effect on the target
- we will need to group rare car names into an "other" category
- we will need to use target encoding with the median as the statistic
- any value whose frequency is less than 25 -> other

### fuel

In [None]:
utils.draw_bar_graph(cars, "fuel")

In [None]:
cnt_fuel = cars['fuel'].value_counts()
cnt_fuel

In [None]:
# Order rows so the most frequent fuel types come first, then plot price per fuel.
subset = (
    cars
    .assign(fuel_freq=lambda frame: frame['fuel'].map(cnt_fuel))
    .sort_values(by='fuel_freq', ascending=False)
    .drop(columns='fuel_freq')
)
utils.draw_box_plot(subset, "fuel", "selling_price")

For fuel feature:
- We have four cats: Diesel, Petrol, CNG, LPG
- the most frequent: Diesel, Petrol
- LPG fuel cars' prices are less than other types of fuels on average
- We will need to one hot encode this feature

### seller_type

In [None]:
utils.draw_bar_graph(cars, "seller_type")

In [None]:
cnt_seller_type = cars['seller_type'].value_counts()
cnt_seller_type

In [None]:
# Order rows so the most frequent seller types come first, then plot price per seller type.
subset = (
    cars
    .assign(seller_type_freq=lambda frame: frame['seller_type'].map(cnt_seller_type))
    .sort_values(by='seller_type_freq', ascending=False)
    .drop(columns='seller_type_freq')
)
utils.draw_box_plot(subset, "seller_type", "selling_price")

For seller_type feature:
- We have three cats: Individual, Dealer, Trustmark Dealer
- most freq: Individual
- We will need to one hot encode this feature 

### transmission

In [None]:
utils.draw_bar_graph(cars, "transmission")

In [None]:
cnt_transmission = cars['transmission'].value_counts()
cnt_transmission

In [None]:
# Order rows so the most frequent transmission type comes first, then plot price per type.
subset = (
    cars
    .assign(transmission_freq=lambda frame: frame['transmission'].map(cnt_transmission))
    .sort_values(by='transmission_freq', ascending=False)
    .drop(columns='transmission_freq')
)
utils.draw_box_plot(subset, "transmission", "selling_price")

For transmission feature:
- takes two cats: Manual, Automatic
- Manual is the most freq
- automatic cars seem to be more expensive, so we will use an ordinal encoder for this feature

### owner

In [None]:
utils.draw_bar_graph(cars, "owner")

In [None]:
cnt_owner = cars['owner'].value_counts()
cnt_owner

In [None]:
# Order rows so the most frequent owner categories come first, then plot price per category.
subset = (
    cars
    .assign(owner_freq=lambda frame: frame['owner'].map(cnt_owner))
    .sort_values(by='owner_freq', ascending=False)
    .drop(columns='owner_freq')
)
utils.draw_box_plot(subset, "owner", "selling_price")

For owner feature:
- alright, we have 5 cats: First Owner, Second Owner, Third Owner, Fourth & Above Owner, Test Drive Car
- most freq: First Owner
- Only 4 rows have the value Test Drive Car, so we will delete those rows later
- We will use ordinal encoding for this feature

### mileage

In [None]:
utils.draw_bar_graph(cars, "mileage")

In [None]:
cnt_mileage = cars['mileage'].value_counts()
cnt_mileage

In [None]:
# Order rows so the most frequent raw mileage strings come first, then plot price per value.
subset = (
    cars
    .assign(mileage_freq=lambda frame: frame['mileage'].map(cnt_mileage))
    .sort_values(by='mileage_freq', ascending=False)
    .drop(columns='mileage_freq')
)
utils.draw_box_plot(subset, "mileage", "selling_price")

In [None]:
# The unit is the last whitespace-separated token of each mileage string.
mileage_tokens = subset['mileage'].str.split()
subset['mileage_units'] = mileage_tokens.str.get(-1)
subset['mileage_units'].value_counts()

In [None]:
# Copy the extracted unit onto `cars`. The assignment is aligned on the row index,
# so the different row order of `subset` (it was sorted by frequency) does not matter.
cars['mileage_unit'] = subset['mileage_units']

In [None]:
# Strip the unit suffix from cars['mileage']: take the first run of digits/dots
# (e.g. "24.0" from "24.0 kmpl") and parse it as a number.
# Entries with no numeric part (including missing values) end up as NaN.
mileage_digits = cars['mileage'].astype(str).str.extract(r'([\d\.]+)', expand=False)
cars['mileage'] = pd.to_numeric(mileage_digits, errors='coerce')

In [None]:
cars.head()

In [None]:
# Convert mileages reported in km/kg to km per litre (kmpl).
# km/kg * (kg per litre) = km per litre, so each value is MULTIPLIED by the
# fuel's density; dividing would yield a dimensionally wrong, inflated figure.
# Approximate fuel densities in kg per litre — NOTE(review): confirm the CNG figure.
PETROL_KG_PER_L = 0.74
DIESEL_KG_PER_L = 0.832
LPG_KG_PER_L = 0.54
CNG_KG_PER_L = 0.128

# Rows whose mileage was given in km/kg, split by fuel type.
mask_petrol = (cars['mileage_unit'] == "km/kg") & (cars['fuel'] == "Petrol")
mask_diesel = (cars['mileage_unit'] == "km/kg") & (cars['fuel'] == "Diesel")
mask_cng = (cars['mileage_unit'] == "km/kg") & (cars['fuel'] == "CNG")
mask_lpg = (cars['mileage_unit'] == "km/kg") & (cars['fuel'] == "LPG")

cars.loc[mask_petrol, 'mileage'] *= PETROL_KG_PER_L
cars.loc[mask_diesel, 'mileage'] *= DIESEL_KG_PER_L
cars.loc[mask_lpg, 'mileage'] *= LPG_KG_PER_L
cars.loc[mask_cng, 'mileage'] *= CNG_KG_PER_L


In [None]:
# The helper unit column is no longer needed once the mileage values are converted.
cars = cars.drop(columns=['mileage_unit'])

In [None]:
cars.tail()

In [None]:
cars.describe()

For mileage feature:
- Has many cats
- Can be converted to float feature (numeric) by deleting the unit
- We will study it as a numeric feature
- there are two kinds of units associated with this feature: kmpl and km/kg; we will need to convert km/kg to kmpl based on the fuel type

### engine

In [None]:
utils.draw_bar_graph(cars, "engine")

In [None]:
cnt_engine = cars['engine'].value_counts()
cnt_engine

In [None]:
# Order rows so the most frequent raw engine strings come first, then plot price per value.
subset = (
    cars
    .assign(engine_freq=lambda frame: frame['engine'].map(cnt_engine))
    .sort_values(by='engine_freq', ascending=False)
    .drop(columns='engine_freq')
)
utils.draw_box_plot(subset, "engine", "selling_price")

In [None]:
# The unit is the last whitespace-separated token of each engine string.
engine_tokens = subset['engine'].str.split()
subset['engine_units'] = engine_tokens.str.get(-1)
subset['engine_units'].value_counts()

In [None]:
# Keep only the first whitespace-separated token of each engine string
# (the part before the unit). The values are still strings after this step.
cars['engine'] = cars['engine'].str.split().str[0]
cars.head()

In [None]:
cars.info()

In [None]:
# Parse the engine tokens as numbers; errors='coerce' turns anything unparseable into NaN.
cars['engine'] = pd.to_numeric(cars['engine'], errors='coerce')
# Check the resulting dtype and non-null count.
cars.info()

For engine feature:
- has many cats
- we will delete the 'CC' unit and convert this feature to numeric

### max_power

In [None]:
utils.draw_bar_graph(cars, "max_power")

In [None]:
cnt_max_power = cars['max_power'].value_counts()
cnt_max_power

In [None]:
# Order rows so the most frequent raw max_power strings come first, then plot price per value.
subset = (
    cars
    .assign(max_power_freq=lambda frame: frame['max_power'].map(cnt_max_power))
    .sort_values(by='max_power_freq', ascending=False)
    .drop(columns='max_power_freq')
)
utils.draw_box_plot(subset, "max_power", "selling_price")

In [None]:
# Split once from the right; the final piece of each max_power string is its unit.
max_power_pieces = subset['max_power'].str.rsplit(n=1)
subset['max_power_units'] = max_power_pieces.str.get(-1)
subset['max_power_units'].value_counts()

In [None]:
# Keep only the first whitespace-separated token of each max_power string
# (the part before the unit). The values are still strings after this step.
cars['max_power'] = cars['max_power'].str.split().str[0]
cars.head()

In [None]:
# Parse the max_power tokens as numbers; errors='coerce' turns anything unparseable
# (e.g. an entry whose first token is not a number) into NaN.
cars['max_power'] = pd.to_numeric(cars['max_power'], errors='coerce')
# Check the resulting dtype and non-null count.
cars.info()

For max_power feature:
- has many cats
- has a single unit (bhp), so it can be converted to numeric

### torque

In [None]:
utils.draw_bar_graph(cars, "torque")

In [None]:
cnt_torque = cars['torque'].value_counts()
cnt_torque

In [None]:
# Order rows so the most frequent raw torque strings come first, then plot price per value.
subset = (
    cars
    .assign(torque_freq=lambda frame: frame['torque'].map(cnt_torque))
    .sort_values(by='torque_freq', ascending=False)
    .drop(columns='torque_freq')
)
utils.draw_box_plot(subset, "torque", "selling_price")

For torque feature:
- it has many categories and appears to use several different units
- we will drop it, because it is not very important and its values come in many different formats

## Numeric Features

In [None]:
cars.describe()

In [None]:
corr_matrix = cars.corr(numeric_only=True)
corr_matrix 

### selling_price

In [None]:
cars['selling_price'].hist(bins=50, figsize=(15, 12))

In [None]:
corr_matrix['selling_price'].sort_values(ascending=False)

For the label:
- max_power, engine, year, km_driven are the most correlated with it
- we will need to make log transformation to it because it's right skewed

### year

In [None]:
cars['year'].hist(bins=50, figsize=(15, 12))

In [None]:
corr_matrix['year'].sort_values(ascending=False)

For year feature:
- it's left skewed
- we will need to reflect with some constant number c=3000, then use log transformation for the y* = c - y, because y* will be right skewed

### max_power

In [None]:
cars['max_power'].hist(bins=50, figsize=(15, 12))

In [None]:
corr_matrix['max_power'].sort_values(ascending=False)

For max_power feature:
- we will need to take the log because it's right skewed

### engine

In [None]:
cars['engine'].hist(bins=50, figsize=(15, 12))

In [None]:
corr_matrix['engine'].sort_values(ascending=False)

For engine feature:
- we will need to standardize this feature

### mileage

In [None]:
cars['mileage'].hist(bins=50, figsize=(15, 12))

In [None]:
# Histogram of log1p(mileage); np.log1p on a Series already returns a Series.
log_mileage = np.log1p(cars['mileage'])
log_mileage.hist(figsize=(15, 12), bins=50)

In [None]:
corr_matrix['mileage'].sort_values(ascending=False)

For the mileage feature:
- we will use log transformation
- we will try some feature combination between it and the engine feature

### km_driven

In [None]:
cars['km_driven'].hist(bins=50, figsize=(15, 12))

In [None]:
# Histogram of the square root of km_driven. The variable is named after the
# transform actually applied (sqrt, not log) to avoid confusion downstream.
sqrt_km_driven = np.sqrt(cars['km_driven'])
sqrt_km_driven.hist(bins=50, figsize=(15, 12))

In [None]:
corr_matrix['km_driven'].sort_values(ascending=False)

For km_driven feature:
- we will use the sqrt transformation
- we will try some feature combinations between it and the year feature

### seats

In [None]:
cars['seats'].hist(bins=50, figsize=(15, 12))

for seats feature:
- takes those numbers: 4, 5, 6, 7, 8, 9, 10
- the most freq is 5
- we will need to group them then one hot encode them

In [None]:
cars.info()