# Used Car Price Prediction

## 1) Problem statement.

* This dataset comprises used cars sold on cardehko.com in India as well as important features of these cars.
* If user can predict the price of the car based on input features.
* Prediction results can be used to give new seller the price suggestion based on market condition.

## 2) Data Collection.
* The Dataset is collected from scrapping from cardheko webiste
* The data consists of 13 column and 15411 rows.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
df = pd.read_csv('cardekho_imputated.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


# Data cleaning

* Handling Missing values 
* Handling Duplicates
* Check data type
* Understand the dataset

In [4]:
df.isnull().sum()

Unnamed: 0           0
car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [5]:
## there is no null values in our dataset 

In [6]:
df.duplicated().sum()

np.int64(0)

In [7]:
df.drop("Unnamed: 0" ,axis=1,inplace=True)

In [8]:
df.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15411 entries, 0 to 15410
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   car_name           15411 non-null  object 
 1   brand              15411 non-null  object 
 2   model              15411 non-null  object 
 3   vehicle_age        15411 non-null  int64  
 4   km_driven          15411 non-null  int64  
 5   seller_type        15411 non-null  object 
 6   fuel_type          15411 non-null  object 
 7   transmission_type  15411 non-null  object 
 8   mileage            15411 non-null  float64
 9   engine             15411 non-null  int64  
 10  max_power          15411 non-null  float64
 11  seats              15411 non-null  int64  
 12  selling_price      15411 non-null  int64  
dtypes: float64(2), int64(5), object(6)
memory usage: 1.5+ MB


In [10]:
df["model"].unique()

array(['Alto', 'Grand', 'i20', 'Ecosport', 'Wagon R', 'i10', 'Venue',
       'Swift', 'Verna', 'Duster', 'Cooper', 'Ciaz', 'C-Class', 'Innova',
       'Baleno', 'Swift Dzire', 'Vento', 'Creta', 'City', 'Bolero',
       'Fortuner', 'KWID', 'Amaze', 'Santro', 'XUV500', 'KUV100', 'Ignis',
       'RediGO', 'Scorpio', 'Marazzo', 'Aspire', 'Figo', 'Vitara',
       'Tiago', 'Polo', 'Seltos', 'Celerio', 'GO', '5', 'CR-V',
       'Endeavour', 'KUV', 'Jazz', '3', 'A4', 'Tigor', 'Ertiga', 'Safari',
       'Thar', 'Hexa', 'Rover', 'Eeco', 'A6', 'E-Class', 'Q7', 'Z4', '6',
       'XF', 'X5', 'Hector', 'Civic', 'D-Max', 'Cayenne', 'X1', 'Rapid',
       'Freestyle', 'Superb', 'Nexon', 'XUV300', 'Dzire VXI', 'S90',
       'WR-V', 'XL6', 'Triber', 'ES', 'Wrangler', 'Camry', 'Elantra',
       'Yaris', 'GL-Class', '7', 'S-Presso', 'Dzire LXI', 'Aura', 'XC',
       'Ghibli', 'Continental', 'CR', 'Kicks', 'S-Class', 'Tucson',
       'Harrier', 'X3', 'Octavia', 'Compass', 'CLS', 'redi-GO', 'Glanza',
       

In [11]:
numerical_features = [features  for features in df.columns if df[features].dtype !='O']
cate_features = [features  for features in df.columns if df[features].dtype =='O']


In [12]:
cate_features, len(cate_features),numerical_features , len(numerical_features)

(['car_name',
  'brand',
  'model',
  'seller_type',
  'fuel_type',
  'transmission_type'],
 6,
 ['vehicle_age',
  'km_driven',
  'mileage',
  'engine',
  'max_power',
  'seats',
  'selling_price'],
 7)

In [13]:
## seprate the datasets onn basic of the dependent, independent features
x = df.drop("selling_price" , axis=1)

In [14]:
y = df["selling_price"]

In [15]:
y

0         120000
1         550000
2         215000
3         226000
4         570000
          ...   
15406     250000
15407     925000
15408     425000
15409    1225000
15410    1200000
Name: selling_price, Length: 15411, dtype: int64

In [16]:
x.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


## Feature Encoding and Scaling
**One Hot Encoding for Columns which had lesser unique values and not ordinal**
* One hot encoding is a process by which categorical variables are converted into a form that could be provided to ML algorithms to do a better job in prediction.

In [17]:
## label encoder

In [18]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
x['model']=le.fit_transform(x['model'])

In [19]:
x.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Maruti Alto,Maruti,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,Hyundai Grand,Hyundai,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,Hyundai i20,Hyundai,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,Maruti Alto,Maruti,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,Ford Ecosport,Ford,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [20]:
#len(df['seller_type'].unique()),len(df['fuel_type'].unique()),len(df['transmission_type'].unique())
for features in cate_features:
    print(f"{features} total unique items :" ,len(df[features].unique()))

car_name total unique items : 121
brand total unique items : 32
model total unique items : 120
seller_type total unique items : 3
fuel_type total unique items : 5
transmission_type total unique items : 2


In [21]:
x.drop(['car_name'	,'brand'] , axis=1 , inplace=True)

In [22]:
x.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [23]:
num_features = x.select_dtypes(exclude="object").columns
onehot_columns = ['seller_type','fuel_type','transmission_type']

In [24]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
ohe = OneHotEncoder(drop='first')
scaler = StandardScaler()

In [25]:
from sklearn.compose import ColumnTransformer

In [26]:
preprocessor = ColumnTransformer(
    [
        ('ohe' , ohe , onehot_columns),
        ('scaler' , scaler, num_features)
    ], remainder= 'passthrough'
)

In [27]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=20,test_size=0.30)

In [28]:
x_train = preprocessor.fit_transform(x_train)
x_test  = preprocessor.transform(x_test)


In [29]:
pd.DataFrame(x_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.831165,0.318028,0.113017,-0.163805,-0.051156,0.187289,-0.406102
1,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.960634,0.984611,0.113017,-0.643831,0.197588,0.485352,-0.406102
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.539523,-1.681721,-0.885897,-0.262210,-0.556296,-0.429481,-0.406102
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.905502,1.317903,0.422448,0.203416,0.210982,0.074790,-0.406102
4,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.740105,0.318028,-0.414834,-0.142203,-0.556296,-0.338091,-0.406102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10782,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.354922,-1.015138,-0.742467,0.409827,-0.556296,-0.400719,-0.406102
10783,1.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.354922,-0.681846,0.440650,1.847507,-0.458712,-0.611798,-0.406102
10784,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.023385,-0.348555,-0.075735,1.350679,-0.554383,-0.542211,0.825448
10785,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.539523,-1.015138,-0.611086,-0.262210,-0.556296,-0.430177,-0.406102


In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import 
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [31]:
def evalueate_metrix(true,predicted):
     mae = mean_absolute_error(true, predicted)
     mse = mean_squared_error(true, predicted)
     rmse = np.sqrt(mean_squared_error(true, predicted))
     r2_square = r2_score(true, predicted)
     return mae, rmse, r2_square
    

In [32]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "adaboost_regressior":AdaBoostRegressor(),
    "xgbr":X
   
}

In [33]:
model_evaluation_list =[]
for name,model in models.items():
    print(name)
    model.fit(x_train,y_train)
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    # evaluaatin of training and testing data
    model_train_mae , model_train_rmse, model_train_r2 = evalueate_metrix(y_train,y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evalueate_metrix(y_test,y_test_pred)
    model_evaluation_list.append([name ,'train', model_train_mae , model_train_rmse, model_train_r2])
    model_evaluation_list.append([name, 'test',model_test_mae , model_test_rmse, model_test_r2])
    

Linear Regression
Lasso
Ridge
K-Neighbors Regressor
Decision Tree
Random Forest Regressor
adaboost_regressior


In [34]:
evaluation_model_table = pd.DataFrame(
    model_evaluation_list,
    columns=['Model',"type", 'MAE', 'RMSE', 'R2']
)


In [35]:
evaluation_model_table

Unnamed: 0,Model,type,MAE,RMSE,R2
0,Linear Regression,train,275113.152081,577925.300402,0.610492
1,Linear Regression,test,269015.653534,454127.314142,0.689371
2,Lasso,train,275111.537916,577925.305767,0.610492
3,Lasso,test,269011.184662,454126.455669,0.689372
4,Ridge,train,275072.01359,577926.428102,0.61049
5,Ridge,test,268934.851513,454109.305101,0.689395
6,K-Neighbors Regressor,train,91590.667007,338875.027818,0.866078
7,K-Neighbors Regressor,test,111880.298443,257996.617207,0.899743
8,Decision Tree,train,4455.6565,18501.851819,0.999601
9,Decision Tree,test,128093.775231,300438.564618,0.864044


In [36]:
#Initialize few parameter for Hyperparamter tuning
knn_params = {"n_neighbors": [2, 3, 10, 20, 40, 50]}
rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}
ada_reg = {
    "n_estimators": [200, 300, 500],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "loss": ["linear", "square", "exponential"],
    "estimator__max_depth": [3, 4, 5],
    "estimator__min_samples_leaf": [1, 3, 5]


}


In [37]:
randomcv = [
    ('knn' , KNeighborsRegressor() ,knn_params),
    ('rf' ,RandomForestRegressor(),rf_params),
    ('ada' , AdaBoostRegressor(estimator=DecisionTreeRegressor(),
             random_state=42),ada_reg)
]

In [38]:
randomcv

[('knn', KNeighborsRegressor(), {'n_neighbors': [2, 3, 10, 20, 40, 50]}),
 ('rf',
  RandomForestRegressor(),
  {'max_depth': [5, 8, 15, None, 10],
   'max_features': [5, 7, 8],
   'min_samples_split': [2, 8, 15, 20],
   'n_estimators': [100, 200, 500, 1000]}),
 ('ada',
  AdaBoostRegressor(estimator=DecisionTreeRegressor(), random_state=42),
  {'n_estimators': [200, 300, 500],
   'learning_rate': [0.01, 0.03, 0.05, 0.1],
   'loss': ['linear', 'square', 'exponential'],
   'estimator__max_depth': [3, 4, 5],
   'estimator__min_samples_leaf': [1, 3, 5]})]

In [39]:
from sklearn.model_selection import RandomizedSearchCV

In [40]:
from sklearn.model_selection import RandomizedSearchCV

best_model = {}

for name, model, param in randomcv:
    random = RandomizedSearchCV(
        estimator=model,
        param_distributions=param,
        n_iter=5,
        scoring='accuracy',
        n_jobs= -1,
        refit=True,
        cv=5,
        verbose=3
    )
    
    random.fit(x_train, y_train)
    
    best_model[name] = random.best_params_

# print best parameters
for model_name, params in best_model.items():
    print(f"---------------- Best Params for {model_name} ----------------")
    print("This is the best parameter of this model:", params)
    print()


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
---------------- Best Params for knn ----------------
This is the best parameter of this model: {'n_neighbors': 2}

---------------- Best Params for rf ----------------
This is the best parameter of this model: {'n_estimators': 500, 'min_samples_split': 20, 'max_features': 5, 'max_depth': 5}

---------------- Best Params for ada ----------------
This is the best parameter of this model: {'n_estimators': 500, 'loss': 'exponential', 'learning_rate': 0.05, 'estimator__min_samples_leaf': 3, 'estimator__max_depth': 4}



In [41]:
# reataing the best model with best parameters
models = {
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, min_samples_split=2, max_depth=None, 
                                                     n_jobs=-1),
     "K-Neighbors Regressor": KNeighborsRegressor(n_neighbors=10, n_jobs=-1),
    "ada_reg" : AdaBoostRegressor(n_estimators= 500, loss= 'square', learning_rate = 0.1)
    
}
model_evaluation_list =[]
for name,model in models.items():
    print(name)
    model.fit(x_train,y_train)
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    # evaluaatin of training and testing data
    model_train_mae , model_train_rmse, model_train_r2 = evalueate_metrix(y_train,y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evalueate_metrix(y_test,y_test_pred)
    model_evaluation_list.append([name ,'train', model_train_mae , model_train_rmse, model_train_r2])
    model_evaluation_list.append([name ,'test',model_test_mae , model_test_rmse, model_test_r2])
    


Random Forest Regressor
K-Neighbors Regressor
ada_reg


In [42]:
results_df = pd.DataFrame(
    model_evaluation_list,
    columns=["Model", "Dataset", "MAE", "RMSE", "R2"]
)
results_df


Unnamed: 0,Model,Dataset,MAE,RMSE,R2
0,Random Forest Regressor,train,40486.394859,142136.800946,0.976439
1,Random Forest Regressor,test,99889.999453,213705.245253,0.931211
2,K-Neighbors Regressor,train,105780.459117,375834.052049,0.835273
3,K-Neighbors Regressor,test,112361.242971,259368.751143,0.898674
4,ada_reg,train,337070.634513,451344.142682,0.762432
5,ada_reg,test,340452.365582,463526.661041,0.676379


In [43]:
x.head()


Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [None]:
custom_car = {}
for i in range(1):
    custom_car['model'] = int(input('Enter the model number: '))
    custom_car['vehicle_age'] = int(input('Enter the vehicle age (in years): '))
    custom_car['km_driven'] = int(input('Enter the kilometers driven: '))
    custom_car['seller_type'] = input('Enter the seller type (e.g., Individual, Dealer): ')
    custom_car['fuel_type'] = input('Enter the fuel type (e.g., Petrol, Diesel): ')
    custom_car['transmission_type'] = input('Enter the transmission type (e.g., Manual, Automatic): ')
    custom_car['mileage'] = float(input('Enter the mileage: '))  # Changed to float as mileage can be decimal
    custom_car['engine'] = int(input('Enter the engine capacity (CC): '))
    custom_car['max_power'] = float(input('Enter the max power (bhp): ')) # Changed to float as power can be decimal
    custom_car['seats'] = int(input('Enter the number of seats: '))
    custom_car = pd.DataFrame([custom_car])
    print(custom_car)

In [None]:
custom_car

In [None]:
custom_transformed = preprocessor.transform(custom_car)
for name,model in models.items():
    selling_price = model.predict(custom_transformed)
    print(f"sellling price of the car acc to this mmodel is {model}" , selling_price[0])
    