In [51]:
#importing libs

import pandas as pd
import numpy as np

In [52]:
#Read the car price CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer = "./data/car_price.csv")

In [53]:
#Preview the first five rows of the loaded frame
df.head(n = 5)

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [54]:
#Feature Engineering

#1. Drop 2 columns ID & Mileage
df = df.drop(columns = ["ID", "Mileage"])

#2. Keep only the rows whose Model occurs more than MIN_MODEL_COUNT times
MIN_MODEL_COUNT = 100
model_counts = df["Model"].value_counts()
frequent_models = model_counts[model_counts > MIN_MODEL_COUNT].index
#.copy() gives an independent frame, so the column assignments below do not
#write into a slice of the unfiltered frame (avoids SettingWithCopyWarning)
df = df[df["Model"].isin(frequent_models)].copy()

#3 Converting numerical features from object types
#"-" entries in Levy are replaced by 0 before the integer cast
df["Levy"] = df["Levy"].replace("-", 0).astype(int)

#Engine volume strings may carry a "Turbo" marker: record it as a separate
#Yes/No column, then strip it so that the remaining text parses as a float
has_turbo = df["Engine volume"].str.contains("Turbo")
df["Turbo engine"] = has_turbo.map({True: "Yes", False: "No"})
df["Engine volume"] = (
    df["Engine volume"]
    .str.replace("Turbo", "", regex = False)
    .astype(float)
)

#Doors holds values such as "04-May" (see the preview above) and ">5";
#remove the "-May" / "-Mar" suffixes and map ">5" to 5 before the float cast
df["Doors"] = (
    df["Doors"]
    .str.replace("-May", "", regex = False)
    .str.replace("-Mar", "", regex = False)
    .replace(">5", "5")
    .astype(float)
)


#4. Removing the duplicates
df = df.drop_duplicates()

In [55]:
#Separating Dependent & Independent Data

#Y keeps the target as a one-column DataFrame; X is every other column
Y = df.loc[:, ["Price"]]
X = df.drop(columns = ["Price"])

In [56]:
#Column names grouped by the kind of preprocessing applied to them

#Numerical columns
num_cols = [
    "Levy",
    "Prod. year",
    "Engine volume",
    "Cylinders",
    "Doors",
    "Airbags",
]

#Categorical columns encoded as ordered labels
lab_cols = [
    "Gear box type",
    "Drive wheels",
    "Fuel type",
    "Turbo engine",
    "Leather interior",
    "Wheel",
]

#Categorical columns encoded one-hot
onehot_cols = [
    "Manufacturer",
    "Model",
    "Category",
    "Color",
]

In [57]:
from sklearn.impute import SimpleImputer #Handling Missing Values
from sklearn.preprocessing import StandardScaler #Feature scaling for num_data
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder #Encoding for cat_data

#Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

In [58]:
#Category order for each label-encoded column
#(the position of a value in its list is the integer code it is encoded as)
gear_type = [
    "Manual",
    "Automatic",
    "Tiptronic",
    "Variator",
]
drive_wheels = [
    "Rear",
    "Front",
    "4x4",
]
fuel_type = [
    "LPG",
    "CNG",
    "Diesel",
    "Petrol",
    "Hybrid",
    "Plug-in Hybrid",
]
turbo_engine = ["No", "Yes"]
leather_interior = ["No", "Yes"]
wheel = [
    "Left wheel",
    "Right-hand drive",
]

In [59]:
#Numerical Pipeline: median imputation followed by standard scaling
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy = "median")),
    ("scaler", StandardScaler()),
])

In [60]:
#Label Pipeline: most-frequent imputation, ordinal encoding, standard scaling
#The category lists are given in the same order as the names in lab_cols
lab_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    (
        "label",
        OrdinalEncoder(
            categories = [
                gear_type,
                drive_wheels,
                fuel_type,
                turbo_engine,
                leather_interior,
                wheel,
            ]
        ),
    ),
    ("scaler", StandardScaler()),
])

In [61]:
#One Hot Encode Pipeline

onehot_pipeline = Pipeline(
    steps = [
        #Fill missing entries with the most frequent value of the column
        ("imputer", SimpleImputer(strategy = "most_frequent")),
        #handle_unknown = "ignore": a category that was not present when the
        #encoder was fitted is encoded as all zeros at transform time instead
        #of raising an error (the default, "error", would fail on such rows)
        ("onehot", OneHotEncoder(handle_unknown = "ignore"))
    ]
)

In [62]:
#Creating the column Transformer

#Each entry is (step name, pipeline, columns the pipeline is applied to)
preprocessor = ColumnTransformer(
    transformers = [
        ("num_pipeline", num_pipeline, num_cols),
        ("lab_pipeline", lab_pipeline, lab_cols),
        ("onehot_pipeline", onehot_pipeline, onehot_cols),
    ]
)

In [63]:
#Splitting the data into train and test sets

from sklearn.model_selection import train_test_split

In [64]:
#Hold out 30% of the rows as the test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size = 0.3,
    random_state = 50,
)

In [65]:
#Fit the preprocessor on the training split and display the transformed matrix
preprocessor.fit_transform(X_train)

<5833x91 sparse matrix of type '<class 'numpy.float64'>'
	with 93328 stored elements in Compressed Sparse Row format>