In [1]:
import pandas as pd

# Read dataset
df = pd.read_csv("data/Toyota.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
2,2,13950,24.0,41711,Diesel,90,,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170


In [2]:
# a. Remove missing values
df = df.dropna()  # Remove all rows with missing values
df  # 🔹 Data after removing missing values

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170
5,5,12950,32.0,61000,Diesel,90,0.0,0,2000,3,1170
...,...,...,...,...,...,...,...,...,...,...,...
1425,1425,7950,80.0,??,Petrol,86,1.0,0,1300,4,1000
1429,1429,8950,78.0,24000,Petrol,86,1.0,1,1300,5,1065
1430,1430,8450,80.0,23000,Petrol,86,0.0,0,1300,3,1015
1432,1432,10845,72.0,??,Petrol,86,0.0,0,1300,3,1015


In [3]:
# b. Set Doors value to uniform format (Ensure all values are numeric or consistent)
df['Doors'] = df['Doors'].replace({'three': 3, 'four': 4, 'five': 5})
df['Doors'] = pd.to_numeric(df['Doors'], errors='coerce')  # Convert to numeric values
df  # 🔹 Data after setting Doors value to uniform format

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,3,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170
5,5,12950,32.0,61000,Diesel,90,0.0,0,2000,3,1170
...,...,...,...,...,...,...,...,...,...,...,...
1425,1425,7950,80.0,??,Petrol,86,1.0,0,1300,4,1000
1429,1429,8950,78.0,24000,Petrol,86,1.0,1,1300,5,1065
1430,1430,8450,80.0,23000,Petrol,86,0.0,0,1300,3,1015
1432,1432,10845,72.0,??,Petrol,86,0.0,0,1300,3,1015


In [4]:
# c. Provide concise summary of all numeric variables
summary = df.describe()  # Summary of numeric columns
summary  # 🔹 Summary of numeric variables

Unnamed: 0.1,Unnamed: 0,Price,Age,MetColor,Automatic,CC,Doors,Weight
count,1111.0,1111.0,1111.0,1111.0,1111.0,1111.0,1111.0,1111.0
mean,707.868587,10774.9973,55.585959,0.675068,0.053105,1569.463546,4.063006,1073.870387
std,414.02375,3699.695538,18.829188,0.468561,0.224344,185.935691,0.955586,52.610513
min,0.0,4350.0,1.0,0.0,0.0,1300.0,2.0,1000.0
25%,347.5,8450.0,43.0,0.0,0.0,1400.0,3.0,1045.0
50%,703.0,9900.0,60.0,1.0,0.0,1600.0,4.0,1070.0
75%,1069.5,11950.0,70.0,1.0,0.0,1600.0,5.0,1090.0
max,1435.0,31275.0,80.0,1.0,1.0,2000.0,5.0,1615.0


In [5]:
# d. Remove all duplicate records
df = df.drop_duplicates()  # Remove duplicate rows
df  # 🔹 Data after removing duplicates

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,3,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170
5,5,12950,32.0,61000,Diesel,90,0.0,0,2000,3,1170
...,...,...,...,...,...,...,...,...,...,...,...
1425,1425,7950,80.0,??,Petrol,86,1.0,0,1300,4,1000
1429,1429,8950,78.0,24000,Petrol,86,1.0,1,1300,5,1065
1430,1430,8450,80.0,23000,Petrol,86,0.0,0,1300,3,1015
1432,1432,10845,72.0,??,Petrol,86,0.0,0,1300,3,1015


In [6]:
# e. Get dummies for categorical data (Fuel type - One hot Encoding)
df = pd.get_dummies(df, columns=['FuelType'], drop_first=True)  # One-hot encoding for 'FuelType'
df  # 🔹 Data after One-Hot Encoding for FuelType

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,HP,MetColor,Automatic,CC,Doors,Weight,FuelType_Diesel,FuelType_Petrol
0,0,13500,23.0,46986,90,1.0,0,2000,3,1165,True,False
1,1,13750,23.0,72937,90,1.0,0,2000,3,1165,True,False
3,3,14950,26.0,48000,90,0.0,0,2000,3,1165,True,False
4,4,13750,30.0,38500,90,0.0,0,2000,3,1170,True,False
5,5,12950,32.0,61000,90,0.0,0,2000,3,1170,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1425,1425,7950,80.0,??,86,1.0,0,1300,4,1000,False,True
1429,1429,8950,78.0,24000,86,1.0,1,1300,5,1065,False,True
1430,1430,8450,80.0,23000,86,0.0,0,1300,3,1015,False,True
1432,1432,10845,72.0,??,86,0.0,0,1300,3,1015,False,True
