In [153]:
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [154]:
# Load the car listings from a CSV file resolved relative to the working directory.
DATA_PATH = 'car_price_prediction.csv'
df = pd.read_csv(DATA_PATH)

In [155]:
# Blank out a small sample of 'Mileage' values so there is missing data to impute.
# A fixed random_state makes this cell reproducible and idempotent: re-running it
# selects the same 10 rows instead of blanking a further random 10 each time.
random_indices = df.sample(n=10, random_state=42).index

# Set those 10 rows in 'Mileage' to NaN (pandas stores None as NaN in a numeric column)
df.loc[random_indices, 'Mileage'] = None

In [156]:
# Show the row labels whose 'Mileage' was blanked out.
random_indices

Int64Index([1641, 1268, 1829, 2079, 770, 1630, 1067, 2316, 296, 421], dtype='int64')

In [157]:
# Preview the first rows to check the columns and sample values.
df.head()

Unnamed: 0,Car ID,Brand,Year,Engine Size,Fuel Type,Transmission,Mileage,Condition,Price,Model
0,1,Tesla,2016,2.3,Petrol,Manual,114832.0,New,26613.92,Model X
1,2,BMW,2018,4.4,Electric,Manual,143190.0,Used,14679.61,5 Series
2,3,Audi,2013,4.5,Electric,Manual,181601.0,New,44402.61,A4
3,4,Tesla,2011,4.1,Diesel,Automatic,68682.0,New,86374.33,Model Y
4,5,Ford,2009,2.6,Diesel,Manual,223009.0,Like New,73577.1,Mustang


In [158]:
# Count missing values per column; only 'Mileage' should be non-zero after the injection step.
df.isna().sum()

Car ID           0
Brand            0
Year             0
Engine Size      0
Fuel Type        0
Transmission     0
Mileage         10
Condition        0
Price            0
Model            0
dtype: int64

In [159]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Car ID        2500 non-null   int64  
 1   Brand         2500 non-null   object 
 2   Year          2500 non-null   int64  
 3   Engine Size   2500 non-null   float64
 4   Fuel Type     2500 non-null   object 
 5   Transmission  2500 non-null   object 
 6   Mileage       2490 non-null   float64
 7   Condition     2500 non-null   object 
 8   Price         2500 non-null   float64
 9   Model         2500 non-null   object 
dtypes: float64(3), int64(2), object(5)
memory usage: 195.4+ KB


In [160]:
# Numerical features: standardise first, then impute.
# KNNImputer picks neighbours by NaN-aware Euclidean distance, so on raw values the
# large-magnitude columns (e.g. Mileage) would swamp small ones (e.g. Engine Size).
# StandardScaler disregards NaNs when fitting and keeps them on transform, so
# scaling ahead of imputation is safe.
numerical_pipeline = Pipeline([('Scaler', StandardScaler()),
                               ('Imputer', KNNImputer(n_neighbors=5))])

# Categorical features: fill gaps with the most frequent category, then encode.
# KNNImputer on ordinal codes would average the neighbours' codes, which can give
# fractional values that correspond to no real category.
categorical_pipeline = Pipeline([('Imputer', SimpleImputer(strategy='most_frequent')),
                                 ('Encoder', OrdinalEncoder())])

In [161]:
# Route columns by dtype: object columns go through the categorical pipeline,
# every other column through the numerical one.
# NOTE(review): the non-object selection also includes 'Car ID' and 'Price' —
# confirm that an identifier and the presumed target are meant to be features.
numerical_columns = df.select_dtypes(exclude='object').columns
categorical_columns = df.select_dtypes(include='object').columns

data_pipeline = ColumnTransformer([
    ('numerical', numerical_pipeline, numerical_columns),
    ('categorical', categorical_pipeline, categorical_columns),
])

In [162]:
# Fit the column transformer on df and transform it in one step.
# Output columns follow the transformer order: the numerical block first, then the categorical block.
df_preprocessed = data_pipeline.fit_transform(df)