# Clean the dataset

In [None]:
from pathlib import Path
import pandas as pd

In [None]:
# Relative path of the training CSV that this notebook cleans.
train_path = Path("../datasets/train.csv")
cars = pd.read_csv(filepath_or_buffer=train_path)

# Show the first five rows as a quick sanity check of the load.
cars.head(5)

In [None]:
# Print column dtypes and non-null counts to spot columns with missing values.
cars.info()

`mileage`, `engine`, `max_power` and `seats` have missing values.

## mileage

We first convert the column to a numeric feature, then fill the missing values with the median.

In [None]:
# Fuel densities in kg per litre (values kept from the original notebook),
# used to express km/kg readings in km/l.
FUEL_DENSITY_KG_PER_L = {
    "Petrol": 0.74,
    "Diesel": 0.832,
    "LPG": 0.54,
    "CNG": 0.128,
}

# `astype(str)` keeps this cell re-runnable: once `mileage` has been converted
# to a numeric dtype, the bare `.str` accessor would raise AttributeError.
mileage_text = cars['mileage'].astype(str)

# The unit is the last whitespace-separated token, e.g. "km/kg".
# It is kept as a local Series so no temporary column has to be added to
# (and later dropped from) `cars`.
mileage_unit = mileage_text.str.split().str[-1]

# Keep only the numeric part; anything unparsable becomes NaN.
cars['mileage'] = pd.to_numeric(
    mileage_text.str.extract(r'([\d\.]+)', expand=False),
    errors='coerce',
)

# Convert km/kg to km/l: (km / kg) * (kg / l) = km / l, so the reading is
# MULTIPLIED by the fuel density. Dividing by it would inflate the value
# instead of converting it.
is_per_kg = mileage_unit == "km/kg"
for fuel_name, density in FUEL_DENSITY_KG_PER_L.items():
    cars.loc[is_per_kg & (cars['fuel'] == fuel_name), 'mileage'] *= density

cars.head()

In [None]:
from sklearn.impute import SimpleImputer

# Imputer that replaces missing values with the column median.
# The same instance is re-fitted on each column it is applied to further down.
median_imputer = SimpleImputer(strategy='median')

In [None]:
# Fit the median imputer on `mileage` and write the filled values back.
mileage_filled = median_imputer.fit_transform(cars[['mileage']])
cars['mileage'] = mileage_filled[:, 0]

# Confirm that `mileage` no longer has missing entries.
cars.info()

## engine

We first convert the column to a numeric feature, then fill the missing values with the median.

In [None]:
# Keep the leading token of each entry (the number before the unit) and parse it.
# `astype(str)` makes the cell safe to re-run: after the first run `engine` is
# numeric and the bare `.str` accessor would raise AttributeError.
engine_text = cars['engine'].astype(str).str.split().str[0]

# Tokens that are not numbers (including missing entries) become NaN.
cars['engine'] = pd.to_numeric(engine_text, errors='coerce')

In [None]:
# Fit the median imputer on `engine` and write the filled values back.
engine_filled = median_imputer.fit_transform(cars[['engine']])
cars['engine'] = engine_filled[:, 0]

# Confirm that `engine` no longer has missing entries.
cars.info()

## max_power

We first convert the column to a numeric feature, then fill the missing values with the median.

In [None]:
# Keep the leading token of each entry (the number before the unit) and parse it.
# `astype(str)` makes the cell safe to re-run: after the first run `max_power`
# is numeric and the bare `.str` accessor would raise AttributeError.
max_power_text = cars['max_power'].astype(str).str.split().str[0]

# Tokens that are not numbers (including missing entries) become NaN.
cars['max_power'] = pd.to_numeric(max_power_text, errors='coerce')

In [None]:
# Fit the median imputer on `max_power` and write the filled values back.
max_power_filled = median_imputer.fit_transform(cars[['max_power']])
cars['max_power'] = max_power_filled[:, 0]

# Confirm that `max_power` no longer has missing entries.
cars.info()

## seats

In [None]:
# Frequency of each seat count (missing values are not counted).
cars['seats'].value_counts()

In [None]:
# Fit the median imputer on `seats` and write the filled values back.
seats_filled = median_imputer.fit_transform(cars[['seats']])
cars['seats'] = seats_filled[:, 0]

# Confirm that `seats` no longer has missing entries.
cars.info()

# Drop torque

In [None]:
# Remove the `torque` column by rebinding `cars` to the reduced frame.
cars = cars.drop(columns=['torque'])

cars.head()

In [None]:
# Final look at dtypes and non-null counts before the frame is written out.
cars.info()

In [46]:
# Write the cleaned frame to CSV, leaving out the DataFrame index.
cleaned_path = Path("../datasets/cleaned.csv")
cars.to_csv(path_or_buf=cleaned_path, index=False)