In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline

Note I already tried loading in the data and the headers are missing. So since I know what the headers should be I'm just going to hard code them in.

Documentation on dataset is here: https://archive.ics.uci.edu/ml/datasets/automobile

In [2]:
headers = ["Symboling", "normalized", "make", "fuel_type",
          "aspiration", "num_doors", "body_style", 
          "drive_wheels", "engine_location", "wheel_base",
          "length", "width", "height", "curb_weight",
          "engine_type", "num_cylinders", "engine_size",
          "fuel_system", "bore", "stroke", "compression_ratio",
          "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]
cars = pd.read_csv("imports-85.data", header=None)
cars.columns = headers

In [3]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
Symboling            205 non-null int64
normalized           205 non-null object
make                 205 non-null object
fuel_type            205 non-null object
aspiration           205 non-null object
num_doors            205 non-null object
body_style           205 non-null object
drive_wheels         205 non-null object
engine_location      205 non-null object
wheel_base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb_weight          205 non-null int64
engine_type          205 non-null object
num_cylinders        205 non-null object
engine_size          205 non-null int64
fuel_system          205 non-null object
bore                 205 non-null object
stroke               205 non-null object
compression_ratio    205 non-null float64
horsepower           205 non-nul

In [4]:
cars.head()

Unnamed: 0,Symboling,normalized,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


Ok so first look at everything. I've got a couple observations:
* There are several columns that show object type but looking at the data they should be floats.
* The last column price is our target column.
* a couple of the columns like "make", "fuel_type" and other could be converted into 1hot encoding to make them more usable.
* From the documentation the symboling column is the riskyness of a car according to insurance with +3 being the riskiest and -3 being the safest.
* I'm still not sure exactly what the normalized column represents. from the docs it represents:

"The third factor is the relative average loss payment per insured vehicle year. This value is normalized for all autos within a particular size classification (two-door small, station wagons, sports/speciality, etc...), and represents the average loss per car per year."

Now let's get rid of the "?" and replace them with NaN's

In [5]:
cars.replace("?", np.nan, inplace=True)

ok now we'll correct the columns which are clearly numerical but are showing up as objects.

In [6]:
obj_to_num = ["normalized", "price", "horsepower", "peak_rpm",
             "bore", "stroke"]
for col in obj_to_num:
    cars[col] = cars[col].astype(float)

The "num_cylinders" and "num_doors" need to be converted to numerical data types as well.

In [7]:
cars["num_cylinders"].replace({"four": 4, "six": 6, "five": 5,
                              "eight": 8, "two": 2, "three": 3,
                              "twelve": 12}, inplace=True)
cars["num_cylinders"].astype(float, inplace=True)

0      4.0
1      4.0
2      6.0
3      4.0
4      5.0
5      5.0
6      5.0
7      5.0
8      5.0
9      5.0
10     4.0
11     4.0
12     6.0
13     6.0
14     6.0
15     6.0
16     6.0
17     6.0
18     3.0
19     4.0
20     4.0
21     4.0
22     4.0
23     4.0
24     4.0
25     4.0
26     4.0
27     4.0
28     4.0
29     4.0
      ... 
175    4.0
176    4.0
177    4.0
178    6.0
179    6.0
180    6.0
181    6.0
182    4.0
183    4.0
184    4.0
185    4.0
186    4.0
187    4.0
188    4.0
189    4.0
190    4.0
191    5.0
192    4.0
193    4.0
194    4.0
195    4.0
196    4.0
197    4.0
198    4.0
199    4.0
200    4.0
201    4.0
202    6.0
203    6.0
204    4.0
Name: num_cylinders, dtype: float64

In [8]:
cars["num_doors"].replace({"four": 4, "six": 6, "five": 5,
                              "eight": 8, "two": 2, "three": 3,
                              "twelve": 12}, inplace=True)
cars["num_doors"].astype(float, inplace=True)

0      2.0
1      2.0
2      2.0
3      4.0
4      4.0
5      2.0
6      4.0
7      4.0
8      4.0
9      2.0
10     2.0
11     4.0
12     2.0
13     4.0
14     4.0
15     4.0
16     2.0
17     4.0
18     2.0
19     2.0
20     4.0
21     2.0
22     2.0
23     2.0
24     4.0
25     4.0
26     4.0
27     NaN
28     4.0
29     2.0
      ... 
175    4.0
176    4.0
177    4.0
178    2.0
179    2.0
180    4.0
181    4.0
182    2.0
183    2.0
184    4.0
185    4.0
186    4.0
187    4.0
188    4.0
189    2.0
190    2.0
191    4.0
192    4.0
193    4.0
194    4.0
195    4.0
196    4.0
197    4.0
198    4.0
199    4.0
200    4.0
201    4.0
202    4.0
203    4.0
204    4.0
Name: num_doors, dtype: float64

In [9]:
cars["normalized"].isnull().sum()

41

In [10]:
colnames_numerics_only = cars.select_dtypes(include=[np.number]).columns.tolist()

In [11]:
colnames_numerics_only

['Symboling',
 'normalized',
 'num_doors',
 'wheel_base',
 'length',
 'width',
 'height',
 'curb_weight',
 'num_cylinders',
 'engine_size',
 'bore',
 'stroke',
 'compression_ratio',
 'horsepower',
 'peak_rpm',
 'city_mpg',
 'highway_mpg',
 'price']

In [12]:
for col in colnames_numerics_only:
    cars[col].fillna(cars[col].mean(),inplace=True)
min_max_scaler = MinMaxScaler()
cars[colnames_numerics_only] = min_max_scaler.fit_transform(cars[colnames_numerics_only])

In [13]:
cars.head(10)

Unnamed: 0,Symboling,normalized,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,1.0,0.298429,alfa-romero,gas,std,0.0,convertible,rwd,front,0.058309,...,0.260377,mpfi,0.664286,0.290476,0.125,0.2625,0.346939,0.222222,0.289474,0.207959
1,1.0,0.298429,alfa-romero,gas,std,0.0,convertible,rwd,front,0.058309,...,0.260377,mpfi,0.664286,0.290476,0.125,0.2625,0.346939,0.222222,0.289474,0.282558
2,0.6,0.298429,alfa-romero,gas,std,0.0,hatchback,rwd,front,0.230321,...,0.343396,mpfi,0.1,0.666667,0.125,0.441667,0.346939,0.166667,0.263158,0.282558
3,0.8,0.518325,audi,gas,std,1.0,sedan,fwd,front,0.38484,...,0.181132,mpfi,0.464286,0.633333,0.1875,0.225,0.55102,0.305556,0.368421,0.219254
4,0.8,0.518325,audi,gas,std,1.0,sedan,4wd,front,0.373178,...,0.283019,mpfi,0.464286,0.633333,0.0625,0.279167,0.55102,0.138889,0.157895,0.306142
5,0.8,0.298429,audi,gas,std,0.0,sedan,fwd,front,0.38484,...,0.283019,mpfi,0.464286,0.633333,0.09375,0.258333,0.55102,0.166667,0.236842,0.251527
6,0.6,0.486911,audi,gas,std,1.0,sedan,fwd,front,0.559767,...,0.283019,mpfi,0.464286,0.633333,0.09375,0.258333,0.55102,0.166667,0.236842,0.312596
7,0.6,0.298429,audi,gas,std,1.0,wagon,fwd,front,0.559767,...,0.283019,mpfi,0.464286,0.633333,0.09375,0.258333,0.55102,0.166667,0.236842,0.342634
8,0.6,0.486911,audi,gas,turbo,1.0,sedan,fwd,front,0.559767,...,0.264151,mpfi,0.421429,0.633333,0.08125,0.383333,0.55102,0.111111,0.105263,0.465642
9,0.4,0.298429,audi,gas,turbo,0.0,hatchback,4wd,front,0.376093,...,0.264151,mpfi,0.421429,0.633333,0.0,0.466667,0.55102,0.083333,0.157895,0.200813


In [14]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
Symboling            205 non-null float64
normalized           205 non-null float64
make                 205 non-null object
fuel_type            205 non-null object
aspiration           205 non-null object
num_doors            205 non-null float64
body_style           205 non-null object
drive_wheels         205 non-null object
engine_location      205 non-null object
wheel_base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb_weight          205 non-null float64
engine_type          205 non-null object
num_cylinders        205 non-null float64
engine_size          205 non-null float64
fuel_system          205 non-null object
bore                 205 non-null float64
stroke               205 non-null float64
compression_ratio    205 non-null float64
horsepower           

In [15]:
obj_cols = cars.select_dtypes(include=["object"]).columns.tolist()

In [16]:
cars.select_dtypes(include=["object"])

Unnamed: 0,make,fuel_type,aspiration,body_style,drive_wheels,engine_location,engine_type,fuel_system
0,alfa-romero,gas,std,convertible,rwd,front,dohc,mpfi
1,alfa-romero,gas,std,convertible,rwd,front,dohc,mpfi
2,alfa-romero,gas,std,hatchback,rwd,front,ohcv,mpfi
3,audi,gas,std,sedan,fwd,front,ohc,mpfi
4,audi,gas,std,sedan,4wd,front,ohc,mpfi
5,audi,gas,std,sedan,fwd,front,ohc,mpfi
6,audi,gas,std,sedan,fwd,front,ohc,mpfi
7,audi,gas,std,wagon,fwd,front,ohc,mpfi
8,audi,gas,turbo,sedan,fwd,front,ohc,mpfi
9,audi,gas,turbo,hatchback,4wd,front,ohc,mpfi


In [17]:
def one_hot_column(df, label, drop_col=False):
    '''
    This function will one hot encode the chosen column.

    Args:
        df: Pandas dataframe
        label: Label of the column to encode
        drop_col: boolean to decide if the chosen column should be dropped

    Returns:
        pandas dataframe with the given encoding
    '''
    one_hot = pd.get_dummies(df[label], prefix=[label])
    if drop_col:
        df = df.drop(label, axis=1)
    df = df.join(one_hot)
    return df

In [18]:
for col in obj_cols:
    cars = one_hot_column(cars, col, drop_col=True)

In [19]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 69 columns):
Symboling                     205 non-null float64
normalized                    205 non-null float64
num_doors                     205 non-null float64
wheel_base                    205 non-null float64
length                        205 non-null float64
width                         205 non-null float64
height                        205 non-null float64
curb_weight                   205 non-null float64
num_cylinders                 205 non-null float64
engine_size                   205 non-null float64
bore                          205 non-null float64
stroke                        205 non-null float64
compression_ratio             205 non-null float64
horsepower                    205 non-null float64
peak_rpm                      205 non-null float64
city_mpg                      205 non-null float64
highway_mpg                   205 non-null float64
price                       

Ok So now every numeric column has been converted to float, all of the missing values have been filled and and categorical variables have been one-hot-encoded.

In [20]:
corr = cars.corr()
corr["price"].sort_values(ascending=False)

price                         1.000000
engine_size                   0.861752
curb_weight                   0.820825
horsepower                    0.757917
width                         0.728699
num_cylinders                 0.687770
length                        0.682986
['drive_wheels']_rwd          0.632103
wheel_base                    0.583168
bore                          0.532300
['make']_mercedes-benz        0.524741
['fuel_system']_mpfi          0.504631
['engine_type']_ohcv          0.394188
['make']_jaguar               0.332131
['make']_bmw                  0.331473
['engine_location']_rear      0.331013
['make']_porsche              0.293176
['body_style']_hardtop        0.231087
['body_style']_convertible    0.192085
['aspiration']_turbo          0.177285
['engine_type']_dohc          0.155950
['make']_volvo                0.147311
['body_style']_sedan          0.146641
height                        0.134388
normalized                    0.133999
['fuel_type']_diesel     

# Moving on to the Analysis

In [21]:
from sklearn import neighbors
from sklearn.model_selection import train_test_split



def calc_rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())


def knn_train_test(df, train_col, target_col, n):
    # Create test and train data
    X = df[train_col]
    y = df[target_col]
    X_train, X_test, y_train, y_test = train_test_split( X, y,
                                                    test_size=0.3, 
                                                    random_state=42)
    
    #print(X_train.shape, X_test.shape, y_train, y_test)
    # Instantiate and fit KNN
    knn_reg = neighbors.KNeighborsRegressor(n)
    knn_reg.fit(X_train, y_train)
    
    #make predictions and
    predictions = knn_reg.predict(X_test)
    rmse = calc_rmse(predictions, y_test)
    
    return rmse

In [22]:
print(knn_train_test(cars, ["engine_size"], "price", 7))

0.0846083882559


In [66]:
result_cols = ["n", "rmse"]
result_df = pd.DataFrame(index=col_list,
                           columns=result_cols)
result_df.columns.values

array(['n', 'rmse'], dtype=object)

In [69]:
best_rmse = 1000
best_col = ""
best_n = 0
col_list = cars.columns.tolist()
col_list.remove("price")
for col in col_list:
    for n in [1,3,5,7,9]:
        current_rmse = knn_train_test(cars, [col], "price", n)
        if (not result_df.loc[col]["rmse"] 
            or result_df.loc[col]["rmse"] > current_rmse):
            result_df.loc[col]["n"] = n
            result_df.loc[col]["rmse"] = current_rmse
       
        if current_rmse < best_rmse:
            best_col = col
            best_n = n
            best_rmse = current_rmse

print("Best Column : {}\n Best Number of Neighbors: {}\n RMSE: {}".format(best_col,
                                                                         best_n,
                                                                         best_rmse))

Best Column : horsepower
 Best Number of Neighbors: 3
 RMSE: 0.07814373589608724


In [74]:
result_df.sort_values("rmse", axis=0, inplace=True)

In [75]:

result_df.head()

Unnamed: 0,n,rmse
horsepower,3,0.0781437
engine_size,7,0.0846084
wheel_base,1,0.107142
width,1,0.112444
curb_weight,9,0.117411


In [85]:
multi_col_results = []
for i in range(1,len(col_list)):
    current_rmse = knn_train_test(cars, result_df.index.values[:i], "price", n)
    multi_col_results.append((i,current_rmse))

In [86]:
multi_col_results

[(1, 0.14208606436176976),
 (2, 0.12778495795787209),
 (3, 0.12710993623070913),
 (4, 0.12755119601962978),
 (5, 0.13055921130803969),
 (6, 0.12796526266924035),
 (7, 0.12813213736932594),
 (8, 0.12745365905980752),
 (9, 0.1286690572127844),
 (10, 0.13071674352301471),
 (11, 0.13234889153065413),
 (12, 0.13004606860380846),
 (13, 0.12945206533430095),
 (14, 0.13019712517449789),
 (15, 0.1304924794924287),
 (16, 0.13038327603206046),
 (17, 0.13146010987490045),
 (18, 0.13259781372729187),
 (19, 0.13295185872404555),
 (20, 0.13293441181259014),
 (21, 0.13095803069440454),
 (22, 0.13353842527652141),
 (23, 0.13227309898916978),
 (24, 0.13125650735580777),
 (25, 0.13185074424236579),
 (26, 0.13340368226294833),
 (27, 0.13340368226294833),
 (28, 0.13313076004677057),
 (29, 0.13575362925623083),
 (30, 0.13040843616404799),
 (31, 0.13409414869491551),
 (32, 0.13592887488515112),
 (33, 0.13956548761968629),
 (34, 0.13826258881508691),
 (35, 0.13674489500867495),
 (36, 0.13675228272097462),
 (3

In [87]:
final_col_list = result_df.index.values[:21]
dif_k_results = []
for n in range(1,25):
    current_rmse = knn_train_test(cars, final_col_list, "price", n)
    dif_k_results.append((n,current_rmse))

In [88]:
dif_k_results

[(1, 0.069453944669175441),
 (2, 0.065193323852973967),
 (3, 0.073357604045187846),
 (4, 0.08009383215204316),
 (5, 0.087808446214594479),
 (6, 0.095772255923087371),
 (7, 0.10261887338555449),
 (8, 0.10458638774954714),
 (9, 0.10663981853436084),
 (10, 0.11119305805981354),
 (11, 0.11453894830826845),
 (12, 0.11648767294682436),
 (13, 0.11836886084171644),
 (14, 0.12080086919075615),
 (15, 0.12312213704059817),
 (16, 0.12532213810817344),
 (17, 0.12644098999990339),
 (18, 0.12695767341275949),
 (19, 0.12911065538499572),
 (20, 0.12899236702055791),
 (21, 0.12959190453835609),
 (22, 0.12962399496301993),
 (23, 0.12976267124580607),
 (24, 0.13095803069440454)]

Summary so far:
It looks like using the top 21 columns and the 2 nearest neighbors results in the lowest RMSE