In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [100]:
df = pd.read_csv('Data\cars_price.csv', index_col=0)
df.head()

Unnamed: 0,make,model,priceUSD,year,condition,mileage(kilometers),fuel_type,volume(cm3),color,transmission,drive_unit,segment
0,honda,accord,565,1993,with mileage,960015.0,petrol,2000.0,black,mechanics,front-wheel drive,D
1,ford,fusion,5550,2008,with mileage,172000.0,diesel,1400.0,silver,mechanics,front-wheel drive,M
2,nissan,teana,8300,2008,with mileage,223000.0,petrol,2500.0,purple,auto,front-wheel drive,D
3,volkswagen,fox,3300,2005,with mileage,140000.0,petrol,1200.0,blue,mechanics,front-wheel drive,A
4,nissan,primera,2450,2002,with damage,413000.0,diesel,2200.0,burgundy,mechanics,front-wheel drive,D


In [101]:
def details(df):
  print("Description of training set")
  print("Shape: \t\t\t", df.shape)
  print("#NaNs: \t\t\t", df.isna().sum().sum()) 
  
details(df)

Description of training set
Shape: 			 (40000, 12)
#NaNs: 			 5130


In [102]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40000 entries, 0 to 39999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   make                 40000 non-null  object 
 1   model                40000 non-null  object 
 2   priceUSD             40000 non-null  int64  
 3   year                 40000 non-null  int64  
 4   condition            40000 non-null  object 
 5   mileage(kilometers)  40000 non-null  float64
 6   fuel_type            40000 non-null  object 
 7   volume(cm3)          39970 non-null  float64
 8   color                40000 non-null  object 
 9   transmission         40000 non-null  object 
 10  drive_unit           38655 non-null  object 
 11  segment              36245 non-null  object 
dtypes: float64(2), int64(2), object(8)
memory usage: 4.0+ MB


In [103]:
# Columns containing NAN
def ctd(df):
  columns = df.columns[df.isna().any()].tolist()
  return columns
ctd(df)

['volume(cm3)', 'drive_unit', 'segment']

In [104]:
df.dropna(inplace=True)
details(df)

Description of training set
Shape: 			 (35016, 12)
#NaNs: 			 0


In [105]:
import datetime
df['year']=datetime.datetime.now().year-df['year']
df.head()

Unnamed: 0,make,model,priceUSD,year,condition,mileage(kilometers),fuel_type,volume(cm3),color,transmission,drive_unit,segment
0,honda,accord,565,27,with mileage,960015.0,petrol,2000.0,black,mechanics,front-wheel drive,D
1,ford,fusion,5550,12,with mileage,172000.0,diesel,1400.0,silver,mechanics,front-wheel drive,M
2,nissan,teana,8300,12,with mileage,223000.0,petrol,2500.0,purple,auto,front-wheel drive,D
3,volkswagen,fox,3300,15,with mileage,140000.0,petrol,1200.0,blue,mechanics,front-wheel drive,A
4,nissan,primera,2450,18,with damage,413000.0,diesel,2200.0,burgundy,mechanics,front-wheel drive,D


In [48]:
def description(df):
  for cols in df.columns:
    print('Unique values for ',cols,' : ',len(df[cols].unique()))
description(df)

Unique values for  make  :  51
Unique values for  model  :  458
Unique values for  priceUSD  :  2328
Unique values for  year  :  62
Unique values for  condition  :  3
Unique values for  mileage(kilometers)  :  5755
Unique values for  fuel_type  :  2
Unique values for  volume(cm3)  :  352
Unique values for  color  :  13
Unique values for  transmission  :  2
Unique values for  drive_unit  :  4
Unique values for  segment  :  9


## Model

In [109]:
features =[ 'year', 'condition', 'mileage(kilometers)',
       'fuel_type', 'volume(cm3)', 'color', 'transmission', 'drive_unit',
       'segment']
categorical = ['condition',
       'fuel_type', 'color', 'transmission', 'drive_unit','segment']
numerical = ['priceUSD','year','mileage(kilometers)','volume(cm3)']

In [110]:
from sklearn.preprocessing import LabelEncoder

In [111]:
def encoding(df,categorical):
  label = LabelEncoder()
  df2= df
  for feature in categorical:
    df2[feature] =label.fit_transform(df[feature])
  return df2

In [4]:
cat_list = ['Sun', 'Sun', 'Wed', 'Mon', 'Mon']
encoded_data, mapping_index = pd.Series(cat_list).factorize()
encoded_data, mapping_index

(array([0, 0, 1, 2, 2], dtype=int64),
 Index(['Sun', 'Wed', 'Mon'], dtype='object'))

In [112]:
dataset = encoding(df,categorical)
dataset.head()

Unnamed: 0,make,model,priceUSD,year,condition,mileage(kilometers),fuel_type,volume(cm3),color,transmission,drive_unit,segment
0,honda,accord,565,27,2,960015.0,1,2000.0,0,1,1,3
1,ford,fusion,5550,12,2,172000.0,0,1400.0,10,1,1,7
2,nissan,teana,8300,12,2,223000.0,1,2500.0,8,0,1,3
3,volkswagen,fox,3300,15,2,140000.0,1,1200.0,1,1,1,0
4,nissan,primera,2450,18,1,413000.0,0,2200.0,3,1,1,3


In [113]:
target = df['priceUSD']
dataset = pd.DataFrame(data=dataset[features],columns=features)
target.shape, dataset.shape

((35016,), (35016, 9))

In [114]:
dataset.head()

Unnamed: 0,year,condition,mileage(kilometers),fuel_type,volume(cm3),color,transmission,drive_unit,segment
0,27,2,960015.0,1,2000.0,0,1,1,3
1,12,2,172000.0,0,1400.0,10,1,1,7
2,12,2,223000.0,1,2500.0,8,0,1,3
3,15,2,140000.0,1,1200.0,1,1,1,0
4,18,1,413000.0,0,2200.0,3,1,1,3


In [116]:
from sklearn.preprocessing import MinMaxScaler

In [119]:
def normalize(df,features):
  mms = MinMaxScaler(feature_range=(0,10))
  df[features] = mms.fit_transform(df[features])
  return df

In [120]:
df2 = dataset

In [127]:
data = normalize(df2,['year','mileage(kilometers)','volume(cm3)'])
data

Unnamed: 0,year,condition,mileage(kilometers),fuel_type,volume(cm3),color,transmission,drive_unit,segment
0,4.193548,2,0.960015,1,0.769231,0,1,1,3
1,1.774194,2,0.172000,0,0.461538,10,1,1,7
2,1.774194,2,0.223000,1,1.025641,8,0,1,3
3,2.258065,2,0.140000,1,0.358974,1,1,1,0
4,2.741935,1,0.413000,0,0.871795,3,1,1,3
...,...,...,...,...,...,...,...,...,...
39994,1.774194,2,0.278000,0,0.769231,1,1,1,3
39995,1.774194,2,0.214000,1,1.641026,10,0,0,6
39996,2.903226,2,0.295000,1,0.871795,1,1,1,4
39997,3.225806,2,0.000000,0,0.871795,7,1,3,3


In [128]:
from sklearn.model_selection import train_test_split

In [129]:
X_train, X_test, y_train, y_test = train_test_split(data,target, test_size=0.10, random_state=42)
X_train.shape, y_train.shape, X_test.shape,  y_test.shape

((31514, 9), (31514,), (3502, 9), (3502,))

In [132]:
from xgboost import XGBRegressor

In [140]:
XGB = XGBRegressor(max_depth=3,learning_rate=0.1,n_estimators=1000,reg_alpha=0.001,reg_lambda=0.000001,n_jobs=-1,min_child_weight=3)
XGB.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=3, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=-1, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0.001,
             reg_lambda=1e-06, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [141]:
print ("Training score:",XGB.score(X_train,y_train),"Test Score:",XGB.score(X_test,y_test))

Training score: 0.9239268183299446 Test Score: 0.8940838827072066


In [144]:
y_test_pred = XGB.predict(X_test)
y_test_pred

array([1292.5219, 2067.2688, 6732.659 , ..., 9213.895 , 6883.6934,
       5853.883 ], dtype=float32)

In [154]:
dictionary = {
    'year':[5.483871],
     'condition':[2],
      'mileage(kilometers)':[0.444444],
       'fuel_type':[0],
        'volume(cm3)':[0.769231],
         'color':[1],
          'transmission':[1],
           'drive_unit':[1],
            'segment':[4]
}
test = pd.DataFrame(data=dictionary)
test

Unnamed: 0,year,condition,mileage(kilometers),fuel_type,volume(cm3),color,transmission,drive_unit,segment
0,5.483871,2,0.444444,0,0.769231,1,1,1,4


In [157]:
print(XGB.predict(test)[0])

1292.5219
