In [1]:
# importing required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# importing our data

df=pd.read_csv('cardekho_imputated.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [3]:
# removing the 'Unnamed: 0' column

df.drop(columns=['Unnamed: 0'], axis=1, inplace=True)
df.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


### Feature Engineering

Data Cleaning:
- Handling Missing Values
- Handling Duplicates
- Check Data Type
- Understand the Dataset

In [4]:
# checking for null values

df.isnull().sum()

car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [5]:
# removing the 'car_name' and 'brand' columns as they are pretty much unnecessary

df.drop(columns=['car_name','brand'], axis=1, inplace=True)
df.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [6]:
df['model'].unique()

array(['Alto', 'Grand', 'i20', 'Ecosport', 'Wagon R', 'i10', 'Venue',
       'Swift', 'Verna', 'Duster', 'Cooper', 'Ciaz', 'C-Class', 'Innova',
       'Baleno', 'Swift Dzire', 'Vento', 'Creta', 'City', 'Bolero',
       'Fortuner', 'KWID', 'Amaze', 'Santro', 'XUV500', 'KUV100', 'Ignis',
       'RediGO', 'Scorpio', 'Marazzo', 'Aspire', 'Figo', 'Vitara',
       'Tiago', 'Polo', 'Seltos', 'Celerio', 'GO', '5', 'CR-V',
       'Endeavour', 'KUV', 'Jazz', '3', 'A4', 'Tigor', 'Ertiga', 'Safari',
       'Thar', 'Hexa', 'Rover', 'Eeco', 'A6', 'E-Class', 'Q7', 'Z4', '6',
       'XF', 'X5', 'Hector', 'Civic', 'D-Max', 'Cayenne', 'X1', 'Rapid',
       'Freestyle', 'Superb', 'Nexon', 'XUV300', 'Dzire VXI', 'S90',
       'WR-V', 'XL6', 'Triber', 'ES', 'Wrangler', 'Camry', 'Elantra',
       'Yaris', 'GL-Class', '7', 'S-Presso', 'Dzire LXI', 'Aura', 'XC',
       'Ghibli', 'Continental', 'CR', 'Kicks', 'S-Class', 'Tucson',
       'Harrier', 'X3', 'Octavia', 'Compass', 'CLS', 'redi-GO', 'Glanza',
       

#### Dofferent Types of Features

In [8]:
# Numeric Features

numeric_features=[feature for feature in df.columns if df[feature].dtype!='O']
print(f'Number of Numeric Features : {len(numeric_features)}')
print(f'Numeric Features are :\n{numeric_features}')

Number of Numeric Features : 7
Numeric Features are :
['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats', 'selling_price']


In [10]:
# Categorical Features

categorical_features=[feature for feature in df.columns if df[feature].dtype=='O']
print(f"Number of Categorical Features : {len(categorical_features)}")
print(f"Categorical Features are :\n{categorical_features}")

Number of Categorical Features : 4
Categorical Features are :
['model', 'seller_type', 'fuel_type', 'transmission_type']


In [11]:
# Discrete Features

discrete_features=[feature for feature in numeric_features if len(df[feature].unique())<=25]
print(f"Number of Discrete Features is {len(discrete_features)}")
print(f"Discrete Features are:\n{discrete_features}")

Number of Discrete Features is 2
Discrete Features are:
['vehicle_age', 'seats']


In [12]:
# Continuous Features

continuous_features=[feature for feature in numeric_features if len(df[feature].unique())>25]
print(f"Number of Continuous Features : {len(continuous_features)}")
print(f"Continuous Features are \n{continuous_features}")

Number of Continuous Features : 5
Continuous Features are 
['km_driven', 'mileage', 'engine', 'max_power', 'selling_price']


#### Dividing our Data into Independent and Dependent Features

In [13]:
X=df.drop(columns=['selling_price'], axis=1)
y=df['selling_price']

In [14]:
X.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [15]:
y[:5]

0    120000
1    550000
2    215000
3    226000
4    570000
Name: selling_price, dtype: int64

### Feature Encoding and Scaling

OHE for columns which had lesser unique values and not ordinal

In [16]:
len(df['model'].unique())

120

In [17]:
from sklearn.preprocessing import LabelEncoder
le1=LabelEncoder()
X['model']=le1.fit_transform(X['model'])

In [18]:
X.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [20]:
len(df['seller_type'].unique()), len(df['fuel_type'].unique()), len(df['transmission_type'].unique())

(3, 5, 2)

In [21]:
# creating column transformer with 3 types of transformers

numeric_features=X.select_dtypes(exclude='object').columns
onehot_columns=['seller_type', 'fuel_type','transmission_type']

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer=StandardScaler()
oh_transformer=OneHotEncoder(drop='first')

preprocessor=ColumnTransformer([
    ("OneHotEncoder",oh_transformer, onehot_columns),
    ("StandardScaler",numeric_transformer, numeric_features)
], remainder='passthrough' # remainder = 'passthrough' means that all other features should be kept as they are
)

In [22]:
X=preprocessor.fit_transform(X)

In [23]:
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.519714,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.225693,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.536377,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.519714,0.983562,-0.360667,0.292211,-0.93661,-0.779312,-0.403022
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.666211,-0.01206,-0.496281,0.735736,0.022918,-0.046502,-0.403022


### Model Training and Model Selection

In [26]:
# creating and testing splits

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)

In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [25]:
# creating a function to evaluate the model

def evaluate_model(true, predicted):
    mae=mean_absolute_error(y_true=true, y_pred=predicted)
    mse=mean_squared_error(y_true=true, y_pred=predicted)
    rmse=np.sqrt(mse)
    score=r2_score(y_pred=predicted, y_true=true)
    return mae, mse, rmse, score

In [29]:
# beginning model training

models={
    "Random Forest":RandomForestRegressor(),
    "Linear Regression":LinearRegression(),
    "Ridge":Ridge(),
    "Lasso":Lasso(),
    "KNN":KNeighborsRegressor(),
    "Decision Trees":DecisionTreeRegressor()
}

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train, y_train)

    # make predictions
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)

    # evaluate train and test dataset
    model_train_mae, model_train_mse, model_train_rmse, model_train_score = evaluate_model(true=y_train, predicted=y_train_pred)
    model_test_mae, model_test_mse, model_test_rmse, model_test_score = evaluate_model(true=y_test, predicted=y_test_pred)

    # prinitng our results
    print(list(models.values())[i])
    print("Model Performance for Training Set:")
    print(f"MSE: {model_train_mse}")
    print(f"MAE: {model_train_mae}")
    print(f"RMSE: {model_train_rmse}")
    print(f"R2 Score: {model_train_score}")
    print('------------------------------------------------------------------------------------------')
    print("Model Performance for Testing Set:")
    print(f"MSE: {model_test_mse}")
    print(f"MAE: {model_test_mae}")
    print(f"RMSE: {model_test_rmse}")
    print(f"R2 Score: {model_test_score}")
    print("="*50)
    print('\n'*3)

RandomForestRegressor()
Model Performance for Training Set:
MSE: 17626171285.94127
MAE: 39703.21112145482
RMSE: 132763.59171829178
R2 Score: 0.9775268780672558
------------------------------------------------------------------------------------------
Model Performance for Testing Set:
MSE: 84638464560.1009
MAE: 101303.26384725665
RMSE: 290926.9058717342
R2 Score: 0.8997949870022368




LinearRegression()
Model Performance for Training Set:
MSE: 298376743063.129
MAE: 266703.0780207068
RMSE: 546238.7235111851
R2 Score: 0.6195738246285458
------------------------------------------------------------------------------------------
Model Performance for Testing Set:
MSE: 288350900910.69727
MAE: 267345.243047681
RMSE: 536983.1476971109
R2 Score: 0.6586161395548988




Ridge()
Model Performance for Training Set:
MSE: 298377080879.87726
MAE: 266668.86338791624
RMSE: 546239.0327318959
R2 Score: 0.6195733939169152
------------------------------------------------------------------------------------

### Hyper-Parameter Tuning our Best Models

Our best models are as follows:
- RandomForestRegressor
- DecisionTreesRegressor
- KNeighborsRegressor

In [30]:
knn_params = {"n_neighbors": [2, 3, 10, 20, 40, 50]}
rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "auto", 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}

In [31]:
randomcv_models = [('KNN', KNeighborsRegressor(), knn_params),
                   ("RF", RandomForestRegressor(), rf_params)
                   
                   ]

In [32]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 6 candidates, totalling 18 fits




[CV] END ......................................n_neighbors=2; total time=   0.1s
[CV] END ......................................n_neighbors=2; total time=   0.1s
[CV] END ......................................n_neighbors=3; total time=   0.1s
[CV] END ......................................n_neighbors=3; total time=   0.1s
[CV] END ......................................n_neighbors=2; total time=   0.1s
[CV] END ......................................n_neighbors=3; total time=   0.1s
[CV] END .....................................n_neighbors=10; total time=   0.1s
[CV] END .....................................n_neighbors=10; total time=   0.1s
[CV] END .....................................n_neighbors=10; total time=   0.1s
[CV] END .....................................n_neighbors=20; total time=   0.2s
[CV] END .....................................n_neighbors=20; total time=   0.2s
[CV] END .....................................n_neighbors=20; total time=   0.2s
[CV] END ...................



[CV] END max_depth=10, max_features=8, min_samples_split=20, n_estimators=200; total time=   1.8s
[CV] END max_depth=8, max_features=auto, min_samples_split=8, n_estimators=1000; total time=   0.0s
[CV] END max_depth=8, max_features=auto, min_samples_split=8, n_estimators=1000; total time=   0.0s
[CV] END max_depth=8, max_features=auto, min_samples_split=8, n_estimators=1000; total time=   0.0s
[CV] END max_depth=8, max_features=5, min_samples_split=8, n_estimators=1000; total time=   5.7s
[CV] END max_depth=8, max_features=7, min_samples_split=8, n_estimators=1000; total time=   7.4s
[CV] END max_depth=8, max_features=7, min_samples_split=8, n_estimators=1000; total time=   7.3s
[CV] END max_depth=5, max_features=auto, min_samples_split=8, n_estimators=200; total time=   0.0s
[CV] END max_depth=5, max_features=auto, min_samples_split=8, n_estimators=200; total time=   0.0s
[CV] END max_depth=5, max_features=auto, min_samples_split=8, n_estimators=200; total time=   0.0s
[CV] END max_d

66 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
55 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise Invalid

---------------- Best Params for KNN -------------------
{'n_neighbors': 2}
---------------- Best Params for RF -------------------
{'n_estimators': 1000, 'min_samples_split': 2, 'max_features': 5, 'max_depth': None}


In [37]:
# Retraining the models with best parameters
models = {
    "Random Forest Regressor": RandomForestRegressor(n_estimators=1000, min_samples_split=2, max_features=5, max_depth=None, 
                                                     n_jobs=-1),
     "K-Neighbors Regressor": KNeighborsRegressor(n_neighbors=2, n_jobs=-1)
    
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_mse, model_train_rmse, model_train_score = evaluate_model(true=y_train, predicted=y_train_pred)
    model_test_mae, model_test_mse, model_test_rmse, model_test_score = evaluate_model(true=y_test, predicted=y_test_pred)
    
    print(list(models.keys())[i])
    print(list(models.values())[i])
    print("Model Performance for Training Set:")
    print(f"MSE: {model_train_mse}")
    print(f"MAE: {model_train_mae}")
    print(f"RMSE: {model_train_rmse}")
    print(f"R2 Score: {model_train_score}")
    print('------------------------------------------------------------------------------------------')
    print("Model Performance for Testing Set:")
    print(f"MSE: {model_test_mse}")
    print(f"MAE: {model_test_mae}")
    print(f"RMSE: {model_test_rmse}")
    print(f"R2 Score: {model_test_score}")
    
    
    print('='*35)
    print('\n')

Random Forest Regressor
RandomForestRegressor(max_features=5, n_estimators=1000, n_jobs=-1)
Model Performance for Training Set:
MSE: 15526374144.800552
MAE: 38278.446780810234
RMSE: 124604.87207489341
R2 Score: 0.9802040900619288
------------------------------------------------------------------------------------------
Model Performance for Testing Set:
MSE: 90901648878.40492
MAE: 102427.03115944673
RMSE: 301499.0031134513
R2 Score: 0.8923798895133477


K-Neighbors Regressor
KNeighborsRegressor(n_jobs=-1, n_neighbors=2)
Model Performance for Training Set:
MSE: 38355302340.30271
MAE: 63689.90634192767
RMSE: 195845.09782045276
R2 Score: 0.9510975258167156
------------------------------------------------------------------------------------------
Model Performance for Testing Set:
MSE: 113826258370.10123
MAE: 115921.87905528159
RMSE: 337381.4730688412
R2 Score: 0.8652390286290759


