# Understand & Visualize the training set

In [None]:
from pathlib import Path
import pandas as pd

In [None]:
# Path to the training CSV, relative to the notebook's working directory.
train_path = Path("../datasets/train.csv")

# Read the whole file into a DataFrame that the rest of the notebook explores.
cars = pd.read_csv(train_path)

# Preview the first rows as the cell output.
cars.head()

In [None]:
# Print each column's dtype and non-null count, which shows where values are missing.
cars.info()

## Categorical Features

In [None]:
# Summary statistics restricted to the object-typed (string) columns,
# transposed so that each feature is shown on its own row.
categorical_summary = cars.describe(include=['object'])
categorical_summary.transpose()

### name

In [None]:
# Make the project-local `utils` module importable from this notebook.
from pathlib import Path
import sys

# Absolute path of the parent of the current working directory; it is added to
# the import search path so that `import utils` below can resolve.
project_root = str(Path("..").resolve())

# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

import utils


In [None]:
# Bar chart for the `name` column via the project helper (defined in utils, not
# visible here — presumably it plots per-category counts; confirm in utils).
utils.draw_bar_graph(cars, "name")

In [None]:
# Number of rows per distinct car name; reused below for filtering and ordering.
cnt_name = cars['name'].value_counts()

In [None]:
# Show only the car names that occur in at least 10 rows.
cnt_name.loc[cnt_name.ge(10)]

In [None]:
# Box plot of selling price for the most popular car names (frequency >= 25).
# Rows are sorted by descending name frequency before being handed to the
# plotting helper; the temporary frequency column is dropped again afterwards.
most_popular_names = cnt_name.loc[cnt_name >= 25].index
subset = (
    cars.loc[cars['name'].isin(most_popular_names)]
    .assign(name_freq=lambda frame: frame['name'].map(cnt_name))
    .sort_values(by='name_freq', ascending=False)
    .drop(columns='name_freq')
)
utils.draw_box_plot(subset, "name", "selling_price")

Now let's take the log of the target

In [None]:
import numpy as np

In [None]:
# Add a log-transformed copy of the target: log1p(x) = log(1 + x).
subset['log_selling_price'] = np.log1p(subset['selling_price'])
# Plot the transformed column (not the raw price) so this cell actually shows
# the per-name distribution of the log target.
utils.draw_box_plot(subset, "name", "log_selling_price")

For the 'name' feature:
- We have 1890 unique values
- Frequencies range from min=1 to max=100
- 134 values have a frequency >= 10
- 26 values have a frequency >= 25
- The name feature has a large effect size on the target
- We will need to group rare car names into an "other" category
- We will need to use target encoding with the median as the statistic
- Any value whose frequency is less than 25 -> other

### fuel

In [None]:
# Bar chart for the `fuel` column via the project helper (defined in utils, not
# visible here — presumably it plots per-category counts; confirm in utils).
utils.draw_bar_graph(cars, "fuel")

In [None]:
# Number of rows per distinct `fuel` value; displayed as the cell output.
cnt_fuel = cars['fuel'].value_counts()
cnt_fuel

In [None]:
# Sort a copy of the data by how common each row's fuel type is (most frequent
# first) using a temporary frequency column, then pass it to the box-plot helper
# with `fuel` against `selling_price`.
subset = (
    cars.assign(fuel_freq=cars['fuel'].map(cnt_fuel))
    .sort_values(by='fuel_freq', ascending=False)
    .drop(columns='fuel_freq')
)
utils.draw_box_plot(subset, "fuel", "selling_price")

For fuel feature:
- We have four cats: Diesel, Petrol, CNG, LPG
- the most frequent: Diesel, Petrol
- LPG fuel cars' prices are less than other types of fuels on average
- We will need to one hot encode this feature

### seller_type

In [None]:
# Bar chart for the `seller_type` column via the project helper (defined in utils,
# not visible here — presumably it plots per-category counts; confirm in utils).
utils.draw_bar_graph(cars, "seller_type")

In [None]:
# Number of rows per distinct `seller_type` value; displayed as the cell output.
cnt_seller_type = cars['seller_type'].value_counts()
cnt_seller_type

In [None]:
# Sort a copy of the data by how common each row's seller type is (most frequent
# first) using a temporary frequency column, then pass it to the box-plot helper
# with `seller_type` against `selling_price`.
subset = (
    cars.assign(seller_type_freq=cars['seller_type'].map(cnt_seller_type))
    .sort_values(by='seller_type_freq', ascending=False)
    .drop(columns='seller_type_freq')
)
utils.draw_box_plot(subset, "seller_type", "selling_price")

For seller_type feature:
- We have three cats: Individual, Dealer, Trustmark Dealer
- most freq: Individual
- We will need to one hot encode this feature 

### transmission

In [None]:
# Bar chart for the `transmission` column via the project helper (defined in utils,
# not visible here — presumably it plots per-category counts; confirm in utils).
utils.draw_bar_graph(cars, "transmission")

In [None]:
# Number of rows per distinct `transmission` value; displayed as the cell output.
cnt_transmission = cars['transmission'].value_counts()
cnt_transmission

In [None]:
# Sort a copy of the data by how common each row's transmission type is (most
# frequent first) using a temporary frequency column, then pass it to the
# box-plot helper with `transmission` against `selling_price`.
subset = (
    cars.assign(transmission_freq=cars['transmission'].map(cnt_transmission))
    .sort_values(by='transmission_freq', ascending=False)
    .drop(columns='transmission_freq')
)
utils.draw_box_plot(subset, "transmission", "selling_price")

For transmission feature:
- takes two cats: Manual, Automatic
- Manual is the most freq
- Automatic cars seem to be more expensive, so we will use an ordinal encoder for this feature

### owner

In [None]:
# Bar chart for the `owner` column via the project helper (defined in utils, not
# visible here — presumably it plots per-category counts; confirm in utils).
utils.draw_bar_graph(cars, "owner")

In [None]:
# Number of rows per distinct `owner` value; displayed as the cell output.
cnt_owner = cars['owner'].value_counts()
cnt_owner

In [None]:
# Sort a copy of the data by how common each row's owner category is (most
# frequent first) using a temporary frequency column, then pass it to the
# box-plot helper with `owner` against `selling_price`.
subset = (
    cars.assign(owner_freq=cars['owner'].map(cnt_owner))
    .sort_values(by='owner_freq', ascending=False)
    .drop(columns='owner_freq')
)
utils.draw_box_plot(subset, "owner", "selling_price")

For owner feature:
- alright, we have 5 cats: First Owner, Second Owner, Third Owner, Fourth & Above Owner, Test Drive Car
- most freq: First Owner
- Only 4 rows have the value Test Drive Car, so we will drop those rows later
- We will use ordinal encoding for this feature

### mileage

In [None]:
# Bar chart for the raw `mileage` column via the project helper (defined in utils,
# not visible here — presumably it plots per-category counts; confirm in utils).
utils.draw_bar_graph(cars, "mileage")

In [None]:
# Number of rows per distinct raw `mileage` string; displayed as the cell output.
cnt_mileage = cars['mileage'].value_counts()
cnt_mileage

In [None]:
# Work on a copy ordered by how often each raw mileage string occurs, and split
# the unit suffix into its own column so the units can be inspected separately.
subset = cars.copy()
subset['mileage_freq'] = subset['mileage'].map(cnt_mileage)
# Capture whatever follows the leading number (e.g. "kmpl" from "24.0 kmpl").
# A fixed-position slice is not reliable here: "24.0 kmpl"[6:] is "mpl", and the
# cut point moves whenever the numeric part has a different width.
# Missing values stay NaN.
subset['mileage_units'] = subset['mileage'].str.extract(
    r'^\s*[\d.]+\s*(.*?)\s*$', expand=False
)
subset = subset.sort_values(by='mileage_freq', ascending=False).drop(columns='mileage_freq')
utils.draw_box_plot(subset, "mileage", "selling_price")

In [None]:
# Count rows per extracted unit suffix to see which mileage units occur.
subset['mileage_units'].value_counts()

In [None]:
# Turn the raw mileage strings into floats, overwriting cars['mileage'].
# Step 1: stringify each value and pull out its first run of digits/dots
#         ("24.0" from "24.0 kmpl"); values with no such run come back as NaN,
#         which keeps missing entries missing.
# Step 2: parse the extracted text as numbers, coercing anything unparsable to NaN.
mileage_text = cars['mileage'].astype(str)
mileage_digits = mileage_text.str.extract(r'([\d\.]+)', expand=False)
cars['mileage'] = pd.to_numeric(mileage_digits, errors='coerce')

In [None]:
# Preview the frame again to check that `mileage` now holds numeric values.
cars.head()

For mileage feature:
- Has many distinct values
- Can be converted to a numeric (float) feature by removing the unit suffix (e.g. kmpl)
- We will study it as a numeric feature