In [1]:
#importing libs

import pandas as pd
import numpy as np

In [2]:
# Read the car-price dataset from a CSV file given by a relative path.
df = pd.read_csv("./data/car_price.csv")

In [3]:
# Preview the first rows to inspect the columns and the raw value formats.
df.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [4]:
# Feature engineering

# 1. Drop the two columns that are not used as features: ID and Mileage.
df = df.drop(columns=["ID", "Mileage"])

# 2. Keep only the rows whose Model occurs MORE than 100 times.
#    value_counts() is computed once, and .copy() makes the filtered frame
#    independent so the column assignments below operate on a real frame
#    instead of a slice (avoids SettingWithCopyWarning).
model_counts = df["Model"].value_counts()
frequent_models = model_counts[model_counts > 100].index
df = df[df["Model"].isin(frequent_models)].copy()

# 3. Convert numeric features that are stored as text.

# Levy: the placeholder "-" is replaced by 0 before casting to int.
df["Levy"] = df["Levy"].replace("-", 0).astype(int)

# Engine volume: entries containing the text "Turbo" are flagged in a separate
# Yes/No column, then the marker is stripped so the remainder parses as float.
# regex=False treats the patterns as literal text on every pandas version.
df["Turbo engine"] = (
    df["Engine volume"]
    .str.contains("Turbo", regex=False)
    .map({True: "Yes", False: "No"})
)
df["Engine volume"] = (
    df["Engine volume"].str.replace("Turbo", "", regex=False).astype(float)
)

# Doors: strip the "-May" / "-Mar" suffixes (e.g. "04-May" -> "04") and map
# ">5" to 5, then cast to float.
df["Doors"] = (
    df["Doors"]
    .str.replace("-May", "", regex=False)
    .str.replace("-Mar", "", regex=False)
    .replace(">5", 5)
    .astype(float)
)

# 4. Remove duplicate rows.
df = df.drop_duplicates()

In [5]:
# Separate the independent features (X) from the dependent target (Y).
# Y is kept as a one-column DataFrame rather than a Series.
X = df.drop(columns="Price")
Y = df.loc[:, ["Price"]]

In [6]:
# Column groups, one per preprocessing strategy.

# Numeric columns.
num_cols = [
    "Levy",
    "Prod. year",
    "Engine volume",
    "Cylinders",
    "Doors",
    "Airbags",
]

# Categorical columns that are encoded with an explicit category order.
lab_cols = [
    "Gear box type",
    "Drive wheels",
    "Fuel type",
    "Turbo engine",
    "Leather interior",
    "Wheel",
]

# Categorical columns that are one-hot encoded.
onehot_cols = [
    "Manufacturer",
    "Model",
    "Category",
    "Color",
]

In [7]:
from sklearn.impute import SimpleImputer #Handling Missing Values
from sklearn.preprocessing import StandardScaler #Feature scaling for num_data
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder #Encoding for cat_data

#Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

In [8]:
# Category orders for the ordinal-encoded columns.
# The position of a value in its list is the integer code it is encoded to.
gear_type = [
    "Manual",
    "Automatic",
    "Tiptronic",
    "Variator",
]
drive_wheels = [
    "Rear",
    "Front",
    "4x4",
]
fuel_type = [
    "LPG",
    "CNG",
    "Diesel",
    "Petrol",
    "Hybrid",
    "Plug-in Hybrid",
]
turbo_engine = [
    "No",
    "Yes",
]
leather_interior = [
    "No",
    "Yes",
]
wheel = [
    "Left wheel",
    "Right-hand drive",
]

In [9]:
# Numeric pipeline: fill missing values with the column median, then standardize.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

In [10]:
# Ordinal pipeline: fill missing values with the most frequent value, encode
# each column with its explicit category order, then standardize the codes.
lab_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "label",
        OrdinalEncoder(
            # One list per input column, in the order the columns are passed in.
            categories=[
                gear_type,
                drive_wheels,
                fuel_type,
                turbo_engine,
                leather_interior,
                wheel,
            ]
        ),
    ),
    ("scaler", StandardScaler()),
])

In [11]:
# One-hot pipeline: fill missing values with the most frequent value, then
# expand each category into its own indicator column.

onehot_pipeline = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy = "most_frequent")),
        # handle_unknown="ignore": a category that was not present when the
        # encoder was fitted is encoded as all zeros at transform time instead
        # of raising ValueError (the default, "error").
        ("onehot", OneHotEncoder(handle_unknown = "ignore"))
    ]
)

In [12]:
# Column transformer: route each column group through its own pipeline.
# Columns that are not listed in any group are not passed to a pipeline.

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, num_cols),
        ("lab_pipeline", lab_pipeline, lab_cols),
        ("onehot_pipeline", onehot_pipeline, onehot_cols),
    ]
)

In [13]:
#Training the data

from sklearn.model_selection import train_test_split

In [14]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=50,
)

In [15]:
from scipy.sparse import csr_matrix, issparse


def _as_dense_frame(features):
    """Return transformer output as a DataFrame, densifying it first if sparse.

    ColumnTransformer returns either a SciPy sparse matrix or a NumPy array
    depending on the density of the stacked output, so ``.toarray()`` is only
    called when the result is actually sparse.
    """
    if issparse(features):
        features = features.toarray()
    return pd.DataFrame(features)


# Fit the preprocessing on the training split only, then apply the already
# fitted transformer to the test split.
X_train = _as_dense_frame(preprocessor.fit_transform(X_train))
X_test = _as_dense_frame(preprocessor.transform(X_test))

In [16]:

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet #Model training
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error #Model evaluation

In [17]:
# Preview the transformed training features (columns are positional integers).
X_train.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
0,-0.109297,1.168573,0.650175,-0.305019,0.086113,1.397899,1.207806,2.400339,1.218828,-0.276042,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.097949,0.468054,-0.412312,-0.305019,0.086113,0.400444,1.207806,-0.209386,0.111786,-0.276042,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.284062,0.234547,-0.867663,-0.305019,0.086113,1.397899,-0.349473,-0.209386,1.218828,-0.276042,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-1.423439,-1.166492,-0.867663,-0.305019,0.086113,-0.597011,-0.349473,-0.209386,1.218828,-0.276042,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-1.423439,0.935067,-0.108744,-0.305019,0.086113,1.397899,1.207806,-0.209386,0.111786,-0.276042,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Fit a plain linear regression on the preprocessed training split.
regression = LinearRegression()
regression.fit(X_train, y_train)

In [19]:
# Inspect the fitted coefficients.
# NOTE(review): the saved output shows many coefficients on the order of 1e15,
# which usually points to linearly dependent input columns (for example
# complete one-hot groups) — confirm before interpreting individual values.
regression.coef_

array([[-1.88162174e+03,  7.82372381e+03,  5.12531045e+01,
         1.25666095e+02,  5.01373172e+02, -2.50045978e+03,
         2.17769257e+03, -7.86658020e+02, -9.73625825e+01,
         1.73278157e+03, -1.08435658e+03, -1.26472836e+03,
         2.23915102e+15, -5.54419399e+14, -5.54419399e+14,
        -1.98766339e+12, -7.37479780e+15, -1.77683412e+16,
         5.18213722e+14, -7.02161361e+15,  1.46994800e+15,
        -3.35589048e+14, -1.09074198e+15, -4.58528547e+15,
        -3.62838927e+13,  3.25383179e+15,  3.79312606e+15,
         2.71179626e+15, -5.12732101e+15, -7.82747232e+14,
         5.14812440e+15, -5.12732101e+15, -1.31906982e+15,
         5.50130859e+15, -1.31906982e+15, -3.34343721e+15,
         1.58948520e+16, -1.87150155e+15,  5.50130859e+15,
        -1.83720532e+15, -1.87150155e+15,  5.14812440e+15,
         5.14812440e+15,  1.58948520e+16,  1.58948520e+16,
        -5.12732101e+15,  5.50130859e+15, -5.66661528e+15,
        -1.53790017e+15, -1.31906982e+15, -3.34343721e+1

In [20]:
def evaluate_model(true, pred):
    """Compute regression error metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        pred: Predicted target values, aligned with ``true``.

    Returns:
        A tuple ``(mae, mse, rmse, r2_square)``: mean absolute error, mean
        squared error, root mean squared error and the R^2 score.
    """
    mae = mean_absolute_error(true, pred)
    mse = mean_squared_error(true, pred)
    # RMSE is derived from the MSE already computed instead of recomputing it.
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, pred)
    return mae, mse, rmse, r2_square

In [21]:
# Training models: fit each candidate on the training split and report its
# metrics on the held-out test split.

models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "ElasticNet": ElasticNet(),
}

model_list = []  # model names, in evaluation order
r2_list = []     # matching R^2 scores (as fractions, not percentages)

for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Metrics are computed on the test split, not on the training data.
    y_pred = model.predict(X_test)
    mae, mse, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    print("Model Test Performance")
    print("RMSE", rmse)
    # The value printed here is the mean absolute error, so it is labelled MAE.
    print("MAE", mae)
    print("R2 score", r2_square * 100)

    r2_list.append(r2_square)

    print("*" * 100)
    

LinearRegression
Model Training Performance
RMSE 11202.416677413852
MSE 7977.8136
R2 score 47.06909188982307
****************************************************************************************************
Lasso
Model Training Performance
RMSE 11195.714670372656
MSE 7971.375100387254
R2 score 47.13240630026054
****************************************************************************************************
Ridge
Model Training Performance
RMSE 11195.08968546976
MSE 7971.879801201442
R2 score 47.13830865248039
****************************************************************************************************
ElasticNet
Model Training Performance
RMSE 12670.91521422114
MSE 8748.589531391292
R2 score 32.2823512908585
****************************************************************************************************


In [22]:
# Show the names of the evaluated models, in the order they were evaluated.
model_list

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']