In [1]:
import pandas as pd

# Read dataset
df = pd.read_csv("data/Toyota.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
2,2,13950,24.0,41711,Diesel,90,,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170


In [2]:
# a. Display shape and summary and count of missing values in the dataset
shape = df.shape  # Get the shape of the dataframe
summary = df.describe(include='all')  # Summary statistics of all columns
missing_values_count = df.isnull().sum()  # Count of missing values per column
missing_values_count

Unnamed: 0      0
Price           0
Age           100
KM              0
FuelType      100
HP              0
MetColor      150
Automatic       0
CC              0
Doors           0
Weight          0
dtype: int64

In [3]:
# b. Remove duplicate records
df = df.drop_duplicates()  # Remove duplicate records
df.isnull().sum()

Unnamed: 0      0
Price           0
Age           100
KM              0
FuelType      100
HP              0
MetColor      150
Automatic       0
CC              0
Doors           0
Weight          0
dtype: int64

In [5]:
# c. Clean the dataset - Replace missing or non-numeric values in each column with appropriate value
# Replace non-numeric entries with NaN for columns expected to be numeric
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')  # Convert to numeric, invalid parsing will be NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['KM'] = pd.to_numeric(df['KM'], errors='coerce')
df['HP'] = pd.to_numeric(df['HP'], errors='coerce')
df['CC'] = pd.to_numeric(df['CC'], errors='coerce')
df['Weight'] = pd.to_numeric(df['Weight'], errors='coerce')

In [6]:
# Now replace NaN values with the median for numerical columns
df['Price'] = df['Price'].fillna(df['Price'].median())
df['Age'] = df['Age'].fillna(df['Age'].median())
df['KM'] = df['KM'].fillna(df['KM'].median())
df['HP'] = df['HP'].fillna(df['HP'].median())
df['CC'] = df['CC'].fillna(df['CC'].median())
df['Weight'] = df['Weight'].fillna(df['Weight'].median())

In [7]:
# For categorical columns (like FuelType, MetColor, Automatic, Doors), replace NaN with the mode (most frequent value)
df['FuelType'] = df['FuelType'].fillna(df['FuelType'].mode()[0])
df['MetColor'] = df['MetColor'].fillna(df['MetColor'].mode()[0])
df['Automatic'] = df['Automatic'].fillna(df['Automatic'].mode()[0])
df['Doors'] = df['Doors'].fillna(df['Doors'].mode()[0])


In [8]:
df.isnull().sum()

Unnamed: 0    0
Price         0
Age           0
KM            0
FuelType      0
HP            0
MetColor      0
Automatic     0
CC            0
Doors         0
Weight        0
dtype: int64

In [9]:
# d. Convert the datatype of 'MetColor' and 'Automatic' columns to 'object' type
df['MetColor'] = df['MetColor'].astype('object')
df['Automatic'] = df['Automatic'].astype('object')
df.head()

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986.0,Diesel,90.0,1.0,0,2000,three,1165
1,1,13750,23.0,72937.0,Diesel,90.0,1.0,0,2000,3,1165
2,2,13950,24.0,41711.0,Diesel,90.0,1.0,0,2000,3,1165
3,3,14950,26.0,48000.0,Diesel,90.0,0.0,0,2000,3,1165
4,4,13750,30.0,38500.0,Diesel,90.0,0.0,0,2000,3,1170
