In [3]:
#in the last notebook, we have already performed some steps to remove missing values.
# let's quickly do all of it again
import numpy as np
import pandas as pd
df = pd.read_csv('Automobile price data _Raw_.csv')

df = df.replace('?', np.nan)
df.pop('normalized-losses')
df = df.dropna()
y = pd.to_numeric(df['price'])
x = df.iloc[:, :-1] 
print(x.columns)

Index(['symboling', 'make', 'fuel-type', 'aspiration', 'num-of-doors',
       'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length',
       'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders',
       'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio',
       'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg'],
      dtype='object')


In [4]:
# let's see which columns are non numeric
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193 entries, 0 to 204
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          193 non-null    int64  
 1   make               193 non-null    object 
 2   fuel-type          193 non-null    object 
 3   aspiration         193 non-null    object 
 4   num-of-doors       193 non-null    object 
 5   body-style         193 non-null    object 
 6   drive-wheels       193 non-null    object 
 7   engine-location    193 non-null    object 
 8   wheel-base         193 non-null    float64
 9   length             193 non-null    float64
 10  width              193 non-null    float64
 11  height             193 non-null    float64
 12  curb-weight        193 non-null    int64  
 13  engine-type        193 non-null    object 
 14  num-of-cylinders   193 non-null    object 
 15  engine-size        193 non-null    int64  
 16  fuel-system        193 non

In [7]:
# let's look at number of unique values in each column
for cname in x.columns:
    unique_values = x[cname].value_counts().count()
    print('Column Name: ' + cname + ', unique values = ' + str(unique_values))
    

Column Name: make, unique values = 21
Column Name: fuel-type, unique values = 2
Column Name: aspiration, unique values = 2
Column Name: num-of-doors, unique values = 2
Column Name: body-style, unique values = 5
Column Name: drive-wheels, unique values = 3
Column Name: engine-location, unique values = 2
Column Name: engine-type, unique values = 5
Column Name: num-of-cylinders, unique values = 6
Column Name: fuel-system, unique values = 7
Column Name: bore, unique values = 38
Column Name: stroke, unique values = 36
Column Name: horsepower, unique values = 56
Column Name: peak-rpm, unique values = 21


In [9]:
# we only need to worry about non-numeric columns, so let's refine our loop
for cname in x.columns:
    if x[cname].dtype == 'object':
        unique_values = x[cname].value_counts().count()
        print('Column Name: ' + cname + ', unique values = ' + str(unique_values))



Column Name: make, unique values = 21
Column Name: fuel-type, unique values = 2
Column Name: aspiration, unique values = 2
Column Name: num-of-doors, unique values = 2
Column Name: body-style, unique values = 5
Column Name: drive-wheels, unique values = 3
Column Name: engine-location, unique values = 2
Column Name: engine-type, unique values = 5
Column Name: num-of-cylinders, unique values = 6
Column Name: fuel-system, unique values = 7
Column Name: bore, unique values = 38
Column Name: stroke, unique values = 36
Column Name: horsepower, unique values = 56
Column Name: peak-rpm, unique values = 21


In [13]:
x.select_dtypes(include=['object']).head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system,bore,stroke,horsepower,peak-rpm
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi,3.47,2.68,111,5000
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi,3.47,2.68,111,5000
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi,2.68,3.47,154,5000
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi,3.19,3.4,102,5500
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi,3.19,3.4,115,5500


In [15]:
# some columns seem to be numbers disguised as objects! let's convert them to numbers
x['bore'] = pd.to_numeric(x['bore'])
x['stroke'] = pd.to_numeric(x['stroke'])
x['horsepower'] = pd.to_numeric(x['horsepower'])
x['peak-rpm'] = pd.to_numeric(x['peak-rpm'])
x.info()
# we now have more numeric columns 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193 entries, 0 to 204
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          193 non-null    int64  
 1   make               193 non-null    object 
 2   fuel-type          193 non-null    object 
 3   aspiration         193 non-null    object 
 4   num-of-doors       193 non-null    object 
 5   body-style         193 non-null    object 
 6   drive-wheels       193 non-null    object 
 7   engine-location    193 non-null    object 
 8   wheel-base         193 non-null    float64
 9   length             193 non-null    float64
 10  width              193 non-null    float64
 11  height             193 non-null    float64
 12  curb-weight        193 non-null    int64  
 13  engine-type        193 non-null    object 
 14  num-of-cylinders   193 non-null    object 
 15  engine-size        193 non-null    int64  
 16  fuel-system        193 non

In [19]:
# let's check out which remaining columns are still objects, and how many unique values are inside them
# we will also save the names of all the columns which are object

object_columns = []

for cname in x.columns:
    if x[cname].dtype == 'object':
        object_columns.append(cname)
        unique_values = x[cname].value_counts().count()
        print('Column Name: ' + cname + ', unique values = ' + str(unique_values))

print('Object columns are: ')
print(object_columns)

Column Name: make, unique values = 21
Column Name: fuel-type, unique values = 2
Column Name: aspiration, unique values = 2
Column Name: num-of-doors, unique values = 2
Column Name: body-style, unique values = 5
Column Name: drive-wheels, unique values = 3
Column Name: engine-location, unique values = 2
Column Name: engine-type, unique values = 5
Column Name: num-of-cylinders, unique values = 6
Column Name: fuel-system, unique values = 7
Object columns are: 
['make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system']


In [20]:
# since these all seem to be categorical data now, let's apply one hot encoding to these columns

x_encoded = pd.get_dummies(x, columns=object_columns)
x_encoded.columns

Index(['symboling', 'wheel-base', 'length', 'width', 'height', 'curb-weight',
       'engine-size', 'bore', 'stroke', 'compression-ratio', 'horsepower',
       'peak-rpm', 'city-mpg', 'highway-mpg', 'make_alfa-romero', 'make_audi',
       'make_bmw', 'make_chevrolet', 'make_dodge', 'make_honda', 'make_isuzu',
       'make_jaguar', 'make_mazda', 'make_mercedes-benz', 'make_mercury',
       'make_mitsubishi', 'make_nissan', 'make_peugot', 'make_plymouth',
       'make_porsche', 'make_saab', 'make_subaru', 'make_toyota',
       'make_volkswagen', 'make_volvo', 'fuel-type_diesel', 'fuel-type_gas',
       'aspiration_std', 'aspiration_turbo', 'num-of-doors_four',
       'num-of-doors_two', 'body-style_convertible', 'body-style_hardtop',
       'body-style_hatchback', 'body-style_sedan', 'body-style_wagon',
       'drive-wheels_4wd', 'drive-wheels_fwd', 'drive-wheels_rwd',
       'engine-location_front', 'engine-location_rear', 'engine-type_dohc',
       'engine-type_l', 'engine-type_ohc', '

In [21]:
# now let's perform linear regression to see if anything got better

from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x_encoded, y, test_size=0.2)

In [23]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xtrain, ytrain)
predictions = model.predict(xtest)

In [24]:
# first 10 predictions
predictions[:10]

array([ 2741.884583  ,  8771.65481995, 35736.56112549, 14202.02311318,
       32250.        ,  7196.32754566, 14795.76756677, 16507.64243413,
        8183.16288578, 23691.61738617])

In [25]:
#known results
ytest[:10]

30      6479
185     8195
71     34184
197    16515
48     35550
51      6095
124    12764
113    16695
142     7775
13     21105
Name: price, dtype: int64

In [26]:
# now let's calculate errors using mean_absolute_error formula

from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(ytest, predictions)
print(mae)

1752.3532464168115


In [27]:
# there is a good chance that our error is lower than before, but still high
# let's apply normalization technique called min-max to all the numerical columns to see if it makes anything better

numeric_columns = x.select_dtypes(include=['int64', 'float64']).columns
print(numeric_columns)

Index(['symboling', 'wheel-base', 'length', 'width', 'height', 'curb-weight',
       'engine-size', 'bore', 'stroke', 'compression-ratio', 'horsepower',
       'peak-rpm', 'city-mpg', 'highway-mpg'],
      dtype='object')


In [28]:
# formula for min max is -> for each column, 
# (data - min) / (max - min)
# as a result, all values will be normalized into a scale of 0 to 1

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_encoded[numeric_columns] = scaler.fit_transform(x_encoded[numeric_columns])
x_encoded.head(10)
# now all the values are scaled!

Unnamed: 0,symboling,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,...,num-of-cylinders_six,num-of-cylinders_three,num-of-cylinders_twelve,fuel-system_1bbl,fuel-system_2bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi
0,1.0,0.058309,0.413433,0.324786,0.083333,0.411171,0.260377,0.664286,0.290476,0.125,...,0,0,0,0,0,0,0,1,0,0
1,1.0,0.058309,0.413433,0.324786,0.083333,0.411171,0.260377,0.664286,0.290476,0.125,...,0,0,0,0,0,0,0,1,0,0
2,0.6,0.230321,0.449254,0.444444,0.383333,0.517843,0.343396,0.1,0.666667,0.125,...,1,0,0,0,0,0,0,1,0,0
3,0.8,0.38484,0.529851,0.504274,0.541667,0.329325,0.181132,0.464286,0.633333,0.1875,...,0,0,0,0,0,0,0,1,0,0
4,0.8,0.373178,0.529851,0.521368,0.541667,0.518231,0.283019,0.464286,0.633333,0.0625,...,0,0,0,0,0,0,0,1,0,0
5,0.8,0.38484,0.540299,0.512821,0.441667,0.395268,0.283019,0.464286,0.633333,0.09375,...,0,0,0,0,0,0,0,1,0,0
6,0.6,0.559767,0.770149,0.948718,0.658333,0.525989,0.283019,0.464286,0.633333,0.09375,...,0,0,0,0,0,0,0,1,0,0
7,0.6,0.559767,0.770149,0.948718,0.658333,0.568658,0.283019,0.464286,0.633333,0.09375,...,0,0,0,0,0,0,0,1,0,0
8,0.6,0.559767,0.770149,0.948718,0.675,0.61986,0.264151,0.421429,0.633333,0.08125,...,0,0,0,0,0,0,0,1,0,0
10,0.8,0.425656,0.532836,0.384615,0.541667,0.351823,0.177358,0.685714,0.347619,0.1125,...,0,0,0,0,0,0,0,1,0,0


In [36]:
# now let's try linear regression again
xtrain2, xtest2, ytrain2, ytest2 = train_test_split(x_encoded, y, test_size=0.2)

In [37]:
model2 = LinearRegression()
model2.fit(xtrain2, ytrain2)
predictions2 = model.predict(xtest2)

In [38]:
predictions2[:10]

array([ 7616., 10528., 15296., 14848.,  8512.,  9472., 12544.,  8992.,
       28416.,  6144.])

In [39]:
ytest2[:10]

156     6938
87      9279
195    13415
66     18344
160     7738
173     8948
107    11900
80      9959
68     28248
97      7999
Name: price, dtype: int64

In [40]:
mae2 = mean_absolute_error(ytest2, predictions2)
print(mae2)

# While we don't have any guarrantee, but is this error lower than before?
# if yes, we were moving in the right direction!

1190.3076923076924
