In [6]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("data/Toyota.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
2,2,13950,24.0,41711,Diesel,90,,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170


In [7]:
# a. Data Cleaning – remove duplicates, replace '??', convert types, fill missing
df = df.drop_duplicates()
df = df.replace('??', np.nan)

In [8]:
df['KM'] = df['KM'].str.replace(',', '')
df['KM'] = pd.to_numeric(df['KM'], errors='coerce')
df['HP'] = pd.to_numeric(df['HP'], errors='coerce')
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
df['CC'] = pd.to_numeric(df['CC'], errors='coerce')

In [9]:
df = df.fillna(df.mean(numeric_only=True))
df  # 🔹 After Data Cleaning

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.000000,46986.000000,Diesel,90.0,1.000000,0,2000,three,1165
1,1,13750,23.000000,72937.000000,Diesel,90.0,1.000000,0,2000,3,1165
2,2,13950,24.000000,41711.000000,Diesel,90.0,0.674961,0,2000,3,1165
3,3,14950,26.000000,48000.000000,Diesel,90.0,0.000000,0,2000,3,1165
4,4,13750,30.000000,38500.000000,Diesel,90.0,0.000000,0,2000,3,1170
...,...,...,...,...,...,...,...,...,...,...,...
1431,1431,7500,55.672156,20544.000000,Petrol,86.0,1.000000,0,1300,3,1025
1432,1432,10845,72.000000,68647.239972,Petrol,86.0,0.000000,0,1300,3,1015
1433,1433,8500,55.672156,17016.000000,Petrol,86.0,0.000000,0,1300,3,1015
1434,1434,7250,70.000000,68647.239972,,86.0,1.000000,0,1300,3,1015


In [10]:
# b. Data Integration – merge with emission standard info
emission_data = pd.DataFrame({
    'FuelType': ['Petrol', 'Diesel', 'CNG'],
    'EmissionStandard': ['Euro 4', 'Euro 5', 'Euro 6']
})
df = pd.merge(df, emission_data, on='FuelType', how='left')
df  # 🔹 After Data Integration

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight,EmissionStandard
0,0,13500,23.000000,46986.000000,Diesel,90.0,1.000000,0,2000,three,1165,Euro 5
1,1,13750,23.000000,72937.000000,Diesel,90.0,1.000000,0,2000,3,1165,Euro 5
2,2,13950,24.000000,41711.000000,Diesel,90.0,0.674961,0,2000,3,1165,Euro 5
3,3,14950,26.000000,48000.000000,Diesel,90.0,0.000000,0,2000,3,1165,Euro 5
4,4,13750,30.000000,38500.000000,Diesel,90.0,0.000000,0,2000,3,1170,Euro 5
...,...,...,...,...,...,...,...,...,...,...,...,...
1431,1431,7500,55.672156,20544.000000,Petrol,86.0,1.000000,0,1300,3,1025,Euro 4
1432,1432,10845,72.000000,68647.239972,Petrol,86.0,0.000000,0,1300,3,1015,Euro 4
1433,1433,8500,55.672156,17016.000000,Petrol,86.0,0.000000,0,1300,3,1015,Euro 4
1434,1434,7250,70.000000,68647.239972,,86.0,1.000000,0,1300,3,1015,


In [11]:
# c. Data Transformation – convert KM to thousands
df['KM'] = df['KM'] / 1000
df  # 🔹 After Data Transformation

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight,EmissionStandard
0,0,13500,23.000000,46.98600,Diesel,90.0,1.000000,0,2000,three,1165,Euro 5
1,1,13750,23.000000,72.93700,Diesel,90.0,1.000000,0,2000,3,1165,Euro 5
2,2,13950,24.000000,41.71100,Diesel,90.0,0.674961,0,2000,3,1165,Euro 5
3,3,14950,26.000000,48.00000,Diesel,90.0,0.000000,0,2000,3,1165,Euro 5
4,4,13750,30.000000,38.50000,Diesel,90.0,0.000000,0,2000,3,1170,Euro 5
...,...,...,...,...,...,...,...,...,...,...,...,...
1431,1431,7500,55.672156,20.54400,Petrol,86.0,1.000000,0,1300,3,1025,Euro 4
1432,1432,10845,72.000000,68.64724,Petrol,86.0,0.000000,0,1300,3,1015,Euro 4
1433,1433,8500,55.672156,17.01600,Petrol,86.0,0.000000,0,1300,3,1015,Euro 4
1434,1434,7250,70.000000,68.64724,,86.0,1.000000,0,1300,3,1015,


In [12]:

# d. Error Correcting – remove invalid Price or unrealistic HP
df = df[df['Price'] > 0]
df = df[df['HP'] < 300]
df  # 🔹 After Error Correction

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight,EmissionStandard
0,0,13500,23.000000,46.98600,Diesel,90.0,1.000000,0,2000,three,1165,Euro 5
1,1,13750,23.000000,72.93700,Diesel,90.0,1.000000,0,2000,3,1165,Euro 5
2,2,13950,24.000000,41.71100,Diesel,90.0,0.674961,0,2000,3,1165,Euro 5
3,3,14950,26.000000,48.00000,Diesel,90.0,0.000000,0,2000,3,1165,Euro 5
4,4,13750,30.000000,38.50000,Diesel,90.0,0.000000,0,2000,3,1170,Euro 5
...,...,...,...,...,...,...,...,...,...,...,...,...
1431,1431,7500,55.672156,20.54400,Petrol,86.0,1.000000,0,1300,3,1025,Euro 4
1432,1432,10845,72.000000,68.64724,Petrol,86.0,0.000000,0,1300,3,1015,Euro 4
1433,1433,8500,55.672156,17.01600,Petrol,86.0,0.000000,0,1300,3,1015,Euro 4
1434,1434,7250,70.000000,68.64724,,86.0,1.000000,0,1300,3,1015,
