# Car price Prediction

This project will analyse data about cars and use the metodologi of k-nearest neighbors to predict the price.

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from itertools import combinations

%matplotlib inline

In [2]:
cars = pd.read_csv('imports-85.data', header = None, na_values = '?')

In [3]:
# text with the name of columns

text= """1. symboling: -3, -2, -1, 0, 1, 2, 3.
2. normalized-losses: continuous from 65 to 256.
3. make:
alfa-romero, audi, bmw, chevrolet, dodge, honda,
isuzu, jaguar, mazda, mercedes-benz, mercury,
mitsubishi, nissan, peugot, plymouth, porsche,
renault, saab, subaru, toyota, volkswagen, volvo
4. fuel-type: diesel, gas.
5. aspiration: std, turbo.
6. num-of-doors: four, two.
7. body-style: hardtop, wagon, sedan, hatchback, convertible.
8. drive-wheels: 4wd, fwd, rwd.
9. engine-location: front, rear.
10. wheel-base: continuous from 86.6 120.9.
11. length: continuous from 141.1 to 208.1.
12. width: continuous from 60.3 to 72.3.
13. height: continuous from 47.8 to 59.8.
14. curb-weight: continuous from 1488 to 4066.
15. engine-type: dohc, dohcv, l, ohc, ohcf, ohcv, rotor.
16. num-of-cylinders: eight, five, four, six, three, twelve, two.
17. engine-size: continuous from 61 to 326.
18. fuel-system: 1bbl, 2bbl, 4bbl, idi, mfi, mpfi, spdi, spfi.
19. bore: continuous from 2.54 to 3.94.
20. stroke: continuous from 2.07 to 4.17.
21. compression-ratio: continuous from 7 to 23.
22. horsepower: continuous from 48 to 288.
23. peak-rpm: continuous from 4150 to 6600.
24. city-mpg: continuous from 13 to 49.
25. highway-mpg: continuous from 16 to 54.
26. price: continuous from 5118 to 45400."""

In [4]:
# changing the columns name
cars.columns = (list(pd.Series(text.split('\n')).str.extract('. ([^2]+):', expand = False).str.replace('-', '_').dropna()))

In [5]:
cars.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [6]:
# droping lines that do not have price
cars.dropna(subset = ['price'], inplace = True)

num_of_doors and num_of_cylinders will be converted to number

In [7]:
word_to_number = {'one': 1, 'two':2, 'three':3, 'four':4, 'five':5,
                 'six':6, 'seven':7, 'eight':8, 'nine':9, 'ten':10,
                 'eleven':11, 'twelve':12}

cars['num_of_doors'] = cars['num_of_doors'].map(word_to_number)
cars['num_of_cylinders'] = cars['num_of_cylinders'].map(word_to_number)

In [8]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 0 to 204
Data columns (total 26 columns):
symboling            201 non-null int64
normalized_losses    164 non-null float64
make                 201 non-null object
fuel_type            201 non-null object
aspiration           201 non-null object
num_of_doors         199 non-null float64
body_style           201 non-null object
drive_wheels         201 non-null object
engine_location      201 non-null object
wheel_base           201 non-null float64
length               201 non-null float64
width                201 non-null float64
height               201 non-null float64
curb_weight          201 non-null int64
engine_type          201 non-null object
num_of_cylinders     201 non-null int64
engine_size          201 non-null int64
fuel_system          201 non-null object
bore                 197 non-null float64
stroke               197 non-null float64
compression_ratio    201 non-null float64
horsepower           199 non-

In [9]:
# columns num_of_doors, bore, stroke, horsepower, peak_rpm and horsepower 
# will be set to the average
cars.loc[cars.num_of_doors.isnull(), 'num_of_doors'] = 4
cars.loc[cars.bore.isnull(), 'bore'] = cars.bore.mean()
cars.loc[cars.stroke.isnull(), 'stroke'] = cars.stroke.mean()
cars.loc[cars.peak_rpm.isnull(), 'peak_rpm'] = cars.peak_rpm.mean()
cars.loc[cars.horsepower.isnull(), 'horsepower'] = cars.horsepower.mean()

# Selection of the features and target columns

In [10]:
# normalized_losses was not considered because of the missing values
features = ['symboling', 'num_of_doors', 'wheel_base',
            'length', 'width', 'height', 'curb_weight',
            'num_of_cylinders', 'engine_size', 'bore', 
            'stroke', 'compression_ratio', 'horsepower',
           'peak_rpm', 'city_mpg', 'highway_mpg']

target = ['price']

normalizing values

In [11]:
normalized_cars = (cars[features] - cars[features].mean())/cars[features].std()
normalized_cars['price'] = cars.price

In [97]:
def knn_train_text(train_columns, target_columns, dataset):
    rmse_list = list()
    for random_state in range(100):
        X_train, X_test, y_train, y_test = train_test_split(dataset[train_columns],
                                                           dataset[target_columns],
                                                           test_size = 0.33, random_state = 1)
        rmse = dict()
        for n_neighboors in [1, 3, 5, 7, 9]:
            knn = KNeighborsRegressor(n_neighbors = n_neighboors)
            knn.fit(X_train, y_train)
            prediction = knn.predict(X_test)
            mse = mean_squared_error(prediction, y_test)
            rmse[n_neighboors] = mse**0.5
        min_key = min(rmse, key = rmse.get)
        rmse_list.append(rmse[min_key])
    return np.mean(rmse_list)

In [98]:
total_rmse = knn_train_text(features, target, normalized_cars)

Know we can check if remove some columns could turn the knn more precise. If after remove a column the rmse is less than with all the columns it is removed

In [99]:
column_removed_rmse = dict()
features_selected = features.copy()
columns_removed = list()

while True:
    for item in features_selected:
        new_list = features_selected.copy()
        new_list.remove(item)
        rmse = knn_train_text(new_list, target, normalized_cars)
        column_removed_rmse[item] = rmse
    min_key = min(column_removed_rmse, key = column_removed_rmse.get)
    min_rmse = column_removed_rmse[min_key]
    if min_rmse < total_rmse:
        total_rmse = min_rmse
        features_selected.remove(min_key)
        columns_removed.append(min_key)
    else:
        break
        
columns_removed

['stroke', 'curb_weight']

In [116]:
features_selected_ordered = list(normalized_cars[features_selected + ['price']].corr().price.apply(abs).sort_values(ascending = False).drop('price').index)

checking if using less parameters is better

In [122]:
rmse_number_dict = dict()

for i in range(2, len(features_selected_ordered)):
    columns_selected = features_selected_ordered[:i]
    rmse = knn_train_text(columns_selected, target, normalized_cars)
    rmse_number_dict[i] = rmse

In [123]:
rmse_number_dict

{2: 2820.6076985973077,
 3: 2416.9110108074974,
 4: 2225.177941717277,
 5: 2310.7402825500894,
 6: 2420.140962891488,
 7: 2541.582327937639,
 8: 2734.056949477292,
 9: 2778.021343583649,
 10: 2686.668440983082,
 11: 3183.075778437728,
 12: 3239.39117516517,
 13: 3218.145836019641}

with only 4 columns we had the lesse RMSE

In [127]:
features_selected_ordered[:4]

['engine_size', 'horsepower', 'width', 'num_of_cylinders']

Engine_size, horsepower, width and num_of_cylinders seems to be the factors that influence more in the price