# Used Car Price Prediction

## 1) Problem statement.

* This dataset comprises used cars sold on cardehko.com in India as well as important features of these cars.
* If user can predict the price of the car based on input features.
* Prediction results can be used to give new seller the price suggestion based on market condition.

## 2) Data Collection.
* The Dataset is collected from scrapping from cardheko webiste
* The data consists of 13 column and 15411 rows.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("cardekho_imputated.csv")
df.head(1)

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000


In [3]:
df.drop("Unnamed: 0", inplace=True, axis = 1)
df.head(1)

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000


In [4]:
df.columns

Index(['car_name', 'brand', 'model', 'vehicle_age', 'km_driven', 'seller_type',
       'fuel_type', 'transmission_type', 'mileage', 'engine', 'max_power',
       'seats', 'selling_price'],
      dtype='object')

## Data Cleaning
#### Handling Missing values

In [5]:
df.isnull().sum()

car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

#### Remove Unnecessary Columns

In [6]:
df.drop(columns=['car_name', 'brand'], axis=1, inplace=True)
df.head(1)
# 'car_name', 'brand', 'model' are the repeated columns. so we keep one (say, model) out of all.
# 'model' can be unique but 'brand' can be repeated many times. And price basically depends on model.

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000


In [7]:
df['model'].unique()

array(['Alto', 'Grand', 'i20', 'Ecosport', 'Wagon R', 'i10', 'Venue',
       'Swift', 'Verna', 'Duster', 'Cooper', 'Ciaz', 'C-Class', 'Innova',
       'Baleno', 'Swift Dzire', 'Vento', 'Creta', 'City', 'Bolero',
       'Fortuner', 'KWID', 'Amaze', 'Santro', 'XUV500', 'KUV100', 'Ignis',
       'RediGO', 'Scorpio', 'Marazzo', 'Aspire', 'Figo', 'Vitara',
       'Tiago', 'Polo', 'Seltos', 'Celerio', 'GO', '5', 'CR-V',
       'Endeavour', 'KUV', 'Jazz', '3', 'A4', 'Tigor', 'Ertiga', 'Safari',
       'Thar', 'Hexa', 'Rover', 'Eeco', 'A6', 'E-Class', 'Q7', 'Z4', '6',
       'XF', 'X5', 'Hector', 'Civic', 'D-Max', 'Cayenne', 'X1', 'Rapid',
       'Freestyle', 'Superb', 'Nexon', 'XUV300', 'Dzire VXI', 'S90',
       'WR-V', 'XL6', 'Triber', 'ES', 'Wrangler', 'Camry', 'Elantra',
       'Yaris', 'GL-Class', '7', 'S-Presso', 'Dzire LXI', 'Aura', 'XC',
       'Ghibli', 'Continental', 'CR', 'Kicks', 'S-Class', 'Tucson',
       'Harrier', 'X3', 'Octavia', 'Compass', 'CLS', 'redi-GO', 'Glanza',
       

In [8]:
df['seller_type'].value_counts()

seller_type
Dealer              9539
Individual          5699
Trustmark Dealer     173
Name: count, dtype: int64

In [9]:
df['seller_type'] = df['seller_type'].map({'Trustmark Dealer':0,'Individual':1,'Dealer':2})

In [10]:
df['transmission_type'].value_counts()

transmission_type
Manual       12225
Automatic     3186
Name: count, dtype: int64

In [11]:
df['transmission_type'] = df['transmission_type'].map({'Manual':0, 'Automatic':1})

In [12]:
df['fuel_type'].value_counts()

fuel_type
Petrol      7643
Diesel      7419
CNG          301
LPG           44
Electric       4
Name: count, dtype: int64

In [13]:
df['fuel_type'] = df['fuel_type'].map({'Petrol':4,'Diesel':3,'CNG':2,'LPG':1,'Electric':0})

In [14]:
from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()
df['model']=label_encoder.fit_transform(df['model'])
df.head(2)

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,7,9,120000,1,4,0,19.7,796,46.3,5,120000
1,54,5,20000,1,4,0,18.9,1197,82.0,5,550000


In [15]:
import pickle
with open("label_encoder.pkl", "wb") as file:
    pickle.dump(label_encoder,file)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15411 entries, 0 to 15410
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   model              15411 non-null  int64  
 1   vehicle_age        15411 non-null  int64  
 2   km_driven          15411 non-null  int64  
 3   seller_type        15411 non-null  int64  
 4   fuel_type          15411 non-null  int64  
 5   transmission_type  15411 non-null  int64  
 6   mileage            15411 non-null  float64
 7   engine             15411 non-null  int64  
 8   max_power          15411 non-null  float64
 9   seats              15411 non-null  int64  
 10  selling_price      15411 non-null  int64  
dtypes: float64(2), int64(9)
memory usage: 1.3 MB


# Train-Test Split

In [17]:
# Indpendent and dependent features
X = df.drop(['selling_price'], axis=1)
y = df['selling_price']

In [18]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Scaler

In [19]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
import pickle
with open("scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)

# Model Regressor Training

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

In [27]:
## Beginning Model Training
models = {"Linear Regression": LinearRegression(),
        "Lasso": Lasso(),
        "Ridge": Ridge(),
        "K-Neighbors Regressor": KNeighborsRegressor(),
        "Decision Tree Regressor": DecisionTreeRegressor(),
        "Random Forest Regressor": RandomForestRegressor()}

for name, model in models.items():
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Performance Metrices
    mae = mean_absolute_error(y_test, y_test_pred)
    mse = mean_squared_error(y_test, y_test_pred)
    rmse = root_mean_squared_error(y_test, y_test_pred)
    r2_square = r2_score(y_test, y_test_pred)

    print(name)
    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2_square:.4f}")
    
    print('='*35)
    print('\n')

Linear Regression
MAE: 279577.58, RMSE: 503376.55, R²: 0.6634


Lasso
MAE: 279577.11, RMSE: 503376.20, R²: 0.6634


Ridge
MAE: 279560.60, RMSE: 503372.32, R²: 0.6634


K-Neighbors Regressor
MAE: 115297.22, RMSE: 255720.15, R²: 0.9131


Decision Tree Regressor
MAE: 126371.03, RMSE: 310207.95, R²: 0.8722


Random Forest Regressor
MAE: 101691.16, RMSE: 227067.58, R²: 0.9315




In [23]:
# KNN, Random forest both performed well for train and test data.

In [28]:
model = RandomForestRegressor()
model.fit(X_train,y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [29]:
import pickle
with open("best_model.pkl", "wb") as file:
    pickle.dump(model, file)

# Hyperparamter tuning

In [None]:
# Hyperparameter Tuning wrt knn
knn_params = {"n_neighbors": [2, 3, 10, 20, 40, 50]}

model = KNeighborsRegressor()

from sklearn.model_selection import RandomizedSearchCV # RandomizedSearchCV takes less time than GridSearchCV
random_knn = RandomizedSearchCV(estimator=model, param_distributions=knn_params, n_iter=100, cv=3, verbose=2, n_jobs=-1)
random_knn.fit(X_train, y_train)
random_knn.best_params_

Fitting 3 folds for each of 6 candidates, totalling 18 fits


{'n_neighbors': 10}

: 

In [None]:
# Hyperparameter Tuning wrt rf
rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "auto", 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}

model = RandomForestRegressor()

from sklearn.model_selection import RandomizedSearchCV # RandomizedSearchCV takes less time than GridSearchCV
random_rf = RandomizedSearchCV(estimator=model, param_distributions=rf_params, n_iter=100, cv=3, verbose=2, n_jobs=-1)
random_rf.fit(X_train, y_train)
random_rf.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'n_estimators': 1000,
 'min_samples_split': 2,
 'max_features': 7,
 'max_depth': 15}

: 

In [None]:
## Retraining the models with best parameters
models = {"Random Forest Regressor": RandomForestRegressor(n_estimators=1000, min_samples_split=2, max_features=7, max_depth=15, n_jobs=-1),
          "K-Neighbors Regressor": KNeighborsRegressor(n_neighbors=10, n_jobs=-1)}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Performance Metrices
    mae = mean_absolute_error(y_test, y_test_pred)
    mse = mean_squared_error(y_test, y_test_pred)
    rmse = root_mean_squared_error(y_test, y_test_pred)
    r2_square = r2_score(y_test, y_test_pred)


    print('Model performance for Test set')
    print("- Mean Squared Error:", mse)
    print("- Root Mean Squared Error:", rmse)
    print("- Mean Absolute Error:", mae)
    print("- R2 Score:", r2_square)
    
    print('='*35)
    print('\n')

Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 132995.7526
- Mean Absolute Error: 54351.7728
- R2 Score: 0.9782
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 208909.1061
- Mean Absolute Error: 96936.6041
- R2 Score: 0.9420


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 364485.7455
- Mean Absolute Error: 104563.3112
- R2 Score: 0.8362
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 265043.0652
- Mean Absolute Error: 118507.0467
- R2 Score: 0.9067




: 

### Prediction on New_Data

In [None]:
df = pd.DataFrame({
        'car_name':["Alto"],
        'brand':["brand"],
        'model':["Alto"],
        'vehicle_age':[4],
        'km_driven':[344],
        'seller_type':["Individual"],
        'fuel_type':["Petrol"],
        'transmission_type':["Automatic"],
        'mileage':[12.3],
        'engine':[700],
        'max_power':[43.4],
        'seats':[3]
    })

# Drop unused
df.drop(columns=['car_name', 'brand'], axis=1, inplace=True)

# Encode categorical values
df['seller_type'] = df['seller_type'].map({'Trustmark Dealer':0, 'Individual':1, 'Dealer':2})
df['transmission_type'] = df['transmission_type'].map({'Manual':0, 'Automatic':1})
df['fuel_type'] = df['fuel_type'].map({'Petrol':4,'Diesel':3,'CNG':2,'LPG':1,'Electric':0})
df['model'] = label_encoder.transform(df['model'])

In [None]:
df

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,7,4,344,1,4,1,12.3,700,43.4,3


In [None]:
df_scaled = scaler.transform(df)
prediction = model.predict(df_scaled)

In [None]:
print(prediction)

[310495.]
