In [33]:
#Model Training

import pandas as pd

In [34]:
# Load the raw car-price table from the local data directory.
df = pd.read_csv(filepath_or_buffer="./data/car_price.csv")

In [35]:
#Major Feature Engineering Steps

#1. Drop 2 columns that are not used as features: ID & Mileage
df = df.drop(columns=["ID", "Mileage"])

#2. Keep only the rows whose Model occurs more than 100 times.
#   The counts are computed once and reused; .copy() makes the filtered
#   result an independent frame so the column assignments below do not
#   write into a slice of the original (avoids SettingWithCopyWarning).
model_counts = df["Model"].value_counts()
frequent_models = model_counts[model_counts > 100].index
df = df[df["Model"].isin(frequent_models)].copy()

#3. Converting numerical features from object types
#   Levy: "-" is replaced by 0 before casting to int.
df["Levy"] = df["Levy"].replace("-", 0).astype(int)

#   Engine volume: move the "Turbo" marker into its own Yes/No column,
#   then cast what is left to float. regex=False keeps the match literal
#   regardless of the pandas version's default.
has_turbo = df["Engine volume"].str.contains("Turbo", regex=False)
df["Turbo engine"] = has_turbo.map({True: "Yes", False: "No"})
df["Engine volume"] = df["Engine volume"].str.replace("Turbo", "", regex=False).astype(float)

#   Doors: strip the "-May" / "-Mar" suffixes and turn ">5" into 5
#   (presumably the raw values look like "04-May" / "02-Mar" - verify against the CSV).
df["Doors"] = (
    df["Doors"]
    .str.replace("-May", "", regex=False)
    .str.replace("-Mar", "", regex=False)
    .replace(">5", 5)
    .astype(float)
)


#4. Removing the duplicates
df = df.drop_duplicates()


In [36]:
#Dependent & Independent Features

# Target: the Price column, kept two-dimensional (one-column DataFrame).
Y = df[["Price"]]
# Predictors: every column except Price.
X = df.drop(columns=["Price"])

In [37]:
#Data for Categorical & Numerical

# Columns stored as object dtype are treated as categorical,
# everything else as numerical.
object_dtype = ["object"]
num_cols = X.select_dtypes(exclude=object_dtype).columns
cat_cols = X.select_dtypes(include=object_dtype).columns

In [38]:
#Simple imputer technique for handling Missing Value

from sklearn.impute import SimpleImputer #Handling Missing Values
from sklearn.preprocessing import StandardScaler #Feature scaling for num_data
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder #Encoding for cat_data

#Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

In [39]:
# Display the cleaned frame to sanity-check the result of the feature engineering.
df

Unnamed: 0,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags,Turbo engine
0,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,6.0,Automatic,4x4,4.0,Left wheel,Silver,12,No
2,8467,0,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,4.0,Variator,Front,4.0,Right-hand drive,Black,2,No
3,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,4.0,Automatic,4x4,4.0,Left wheel,White,0,No
4,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,4.0,Automatic,Front,4.0,Left wheel,Silver,4,No
5,39493,891,HYUNDAI,Santa FE,2016,Jeep,Yes,Diesel,2.0,4.0,Automatic,Front,4.0,Left wheel,White,4,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19229,50,0,TOYOTA,Prius,2008,Hatchback,No,Hybrid,1.5,4.0,Automatic,Front,4.0,Left wheel,Silver,6,No
19230,470,645,TOYOTA,Prius,2011,Hatchback,Yes,Hybrid,1.8,4.0,Automatic,Front,4.0,Left wheel,Silver,12,No
19233,15681,831,HYUNDAI,Sonata,2011,Sedan,Yes,Petrol,2.4,4.0,Tiptronic,Front,4.0,Left wheel,Red,8,No
19234,26108,836,HYUNDAI,Tucson,2010,Jeep,Yes,Diesel,2.0,4.0,Automatic,Front,4.0,Left wheel,Grey,4,No


In [40]:
#Numerical Pipeline
# Two steps, applied in order: fill missing values with the column median,
# then standardise with StandardScaler.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

In [41]:
# Category orderings handed to OrdinalEncoder: a value's position in its list
# becomes its integer code. The strings must match the values in the data exactly.
gear_type = ["Manual", "Automatic", "Tiptronic", "Variator"]
drive_wheels = ["Rear", "Front", "4x4"]
fuel_type = ["LPG", "CNG", "Diesel" , "Petrol" , "Hybrid", "Plug-in Hybrid"]
turbo_engine = ["No", "Yes"]
leather_interior = ["No", "Yes"]
# The Wheel column holds "Right-hand drive" (not "Right wheel"); the old label
# would make the encoder reject those rows as an unknown category.
wheel = ["Left wheel", "Right-hand drive"]

In [42]:
# Column groups by encoding strategy. Names must match the DataFrame's column
# labels exactly (case-sensitive): the frame has "Fuel type" and
# "Leather interior", not "Fuel Type" / "Leather Interior".
one_hot_cols = ["Manufacturer", "Model", "Category", "Color"]
label_cols = ["Gear box type", "Drive wheels", "Fuel type", "Turbo engine", "Leather interior", "Wheel"]


#Manufacturer --> One-hot-encoding
#Model --> One-hot-encoding
#Category --> One-hot-encoding
#Color --> One-hot-encoding
#Leather interior --> Binary encoding
#Gear box type --> Label encoding
#Drive wheels --> Label encoding
#Wheel --> Binary encoding
#Turbo engine --> Binary encoding
#Fuel type ---> Label encoding

In [43]:
#Categorical Pipeline
#
# The categorical columns need two different encodings, so a single linear
# Pipeline cannot handle them: the ordinal encoder was given 6 category lists
# but received every categorical column, and OneHotEncoder(categories=<DataFrame>)
# raises "Shape mismatch: if categories is an array, it has to be of shape
# (n_features,)". cat_pipeline is therefore a column-wise transformer with one
# branch per encoding; it is still a single transformer that can be applied to
# the categorical columns as a whole.

# Ordered categories keyed by the exact column name in the frame; the dict
# order fixes both the column order and the matching category-list order.
ordinal_categories = {
    "Gear box type": gear_type,
    "Drive wheels": drive_wheels,
    "Fuel type": fuel_type,
    "Turbo engine": turbo_engine,
    "Leather interior": leather_interior,
    "Wheel": wheel,
}

# Ordinal branch: impute -> integer codes -> scale.
# Values missing from a category list are encoded as -1 instead of raising.
ordinal_pipeline = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinal", OrdinalEncoder(
            categories=list(ordinal_categories.values()),
            handle_unknown="use_encoded_value",
            unknown_value=-1,
        )),
        ("scaler", StandardScaler()),
    ]
)

# One-hot branch: impute -> indicator columns. Categories are learned during
# fit; categories unseen at transform time produce an all-zero row.
one_hot_pipeline = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

cat_pipeline = ColumnTransformer([
    ("ordinal_pipeline", ordinal_pipeline, list(ordinal_categories)),
    ("one_hot_pipeline", one_hot_pipeline, one_hot_cols),
])

In [44]:
#Preprocessor
# Route the numerical columns through num_pipeline and the categorical
# columns through cat_pipeline; the outputs are concatenated column-wise.
preprocessor = ColumnTransformer(
    transformers = [
        ("num_pipeline", num_pipeline, num_cols),
        ("cat_pipeline", cat_pipeline, cat_cols),
    ]
)


In [45]:
## Training the data

from sklearn.model_selection import train_test_split

In [46]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=50,
    test_size=0.3,
)

In [47]:
# Shape (rows, columns) of the training features.
# NOTE(review): the ValueError recorded as this cell's output is about encoder
# `categories`, which a plain attribute read should not raise - it looks like
# stale output from another run; re-run the notebook to confirm.
X_train.shape

ValueError: Shape mismatch: if categories is an array, it has to be of shape (n_features,).

In [48]:
# Shape (rows, columns) of the training features after the split.
X_train.shape

(5833, 16)

In [49]:
# Shape (rows, columns) of the full feature matrix, for comparison with the training split.
X.shape

(8333, 16)