In [2]:
import numpy as np
import pandas as pd
# Load the raw automobile price data. Missing values in this file show up as '?'
# (see the 'normalized-losses' column in the preview).
df = pd.read_csv('Automobile price data _Raw_.csv')
# Preview the first rows of the frame.
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [3]:
# The 'normalized-losses' column has many missing values, which appear in the data as '?'.
# pandas does not treat '?' as a null value: it reads it as an ordinary string,
# so, as you can see below, every column reports 205 non-null entries.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [4]:
# pandas only counts real NaN values as missing, so swap every '?' placeholder for np.nan;
# afterwards info() reports the true non-null counts and the data can be cleaned properly

df = df.replace(to_replace='?', value=np.nan)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  164 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       203 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [5]:
# Now we can see that pandas has detected the null values.
# Let's remove the 'normalized-losses' column.
# drop(..., errors='ignore') is used instead of pop(): it returns a new frame rather than
# mutating in place, and the cell can be re-run without a KeyError once the column is gone.
df = df.drop(columns=['normalized-losses'], errors='ignore')
df.head()

Unnamed: 0,symboling,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,audi,gas,std,four,sedan,fwd,front,99.8,176.6,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,audi,gas,std,four,sedan,4wd,front,99.4,176.6,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [6]:
# other columns could still contain some null values
df.info()
# observe that columns such as horsepower, peak-rpm and num-of-doors still have some null values left

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   make               205 non-null    object 
 2   fuel-type          205 non-null    object 
 3   aspiration         205 non-null    object 
 4   num-of-doors       203 non-null    object 
 5   body-style         205 non-null    object 
 6   drive-wheels       205 non-null    object 
 7   engine-location    205 non-null    object 
 8   wheel-base         205 non-null    float64
 9   length             205 non-null    float64
 10  width              205 non-null    float64
 11  height             205 non-null    float64
 12  curb-weight        205 non-null    int64  
 13  engine-type        205 non-null    object 
 14  num-of-cylinders   205 non-null    object 
 15  engine-size        205 non-null    int64  
 16  fuel-system        205 non

In [7]:
# drop every row that has a null value in at least one column
df = df.dropna(axis=0, how='any')
df.info()
# 193 rows remain, and none of them contains a missing value

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193 entries, 0 to 204
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          193 non-null    int64  
 1   make               193 non-null    object 
 2   fuel-type          193 non-null    object 
 3   aspiration         193 non-null    object 
 4   num-of-doors       193 non-null    object 
 5   body-style         193 non-null    object 
 6   drive-wheels       193 non-null    object 
 7   engine-location    193 non-null    object 
 8   wheel-base         193 non-null    float64
 9   length             193 non-null    float64
 10  width              193 non-null    float64
 11  height             193 non-null    float64
 12  curb-weight        193 non-null    int64  
 13  engine-type        193 non-null    object 
 14  num-of-cylinders   193 non-null    object 
 15  engine-size        193 non-null    int64  
 16  fuel-system        193 non

In [8]:
# The model needs all of its input data to be numeric.
# To keep things simple (no feature engineering yet), select only the columns pandas already stores as int64 / float64.
numeric_df = df.select_dtypes(include=['int64', 'float64'])
numeric_df.columns

# Why don't we see the price column here?
# Its dtype is object rather than int64 or float64, so select_dtypes leaves it out.
# NOTE(review): horsepower, peak-rpm, bore and stroke are also absent from the result, presumably for the same reason - confirm their dtypes.

Index(['symboling', 'wheel-base', 'length', 'width', 'height', 'curb-weight',
       'engine-size', 'compression-ratio', 'city-mpg', 'highway-mpg'],
      dtype='object')

In [13]:
# set up the inputs for a simple linear regression:
# the features (x) are the numeric columns; the label (y) is price,
# which is not stored as a number and therefore has to be converted first
x = numeric_df
y = pd.to_numeric(df['price'])
print(x.shape, y.shape)

(193, 10) (193,)


In [14]:
# Before applying linear regression, split the data into training and testing sets:
# 80% is used to fit the model and 20% is held out to evaluate it.
from sklearn.model_selection import train_test_split
# A fixed random_state makes the shuffle deterministic, so re-running the notebook
# reproduces the same split (and therefore the same coefficients and error).
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)


In [15]:
# create the (not yet fitted) linear regression model;
# it will be built from the training data
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [16]:
# fit the model on the training data: this is where it learns the weights and bias
# (also known as the coefficients and intercept)
model.fit(xtrain, ytrain)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
# that's about it! the whole point is that the machine learns the parameters, not us
# now let's check out the learned weights (the bias is printed in the next cell)
print(model.coef_)

[  87.85180953 -119.09762471  -78.0249138   908.58625636  410.36373646
    2.39479791  127.86407982   98.985933   -376.91947359  260.49502286]


In [19]:
# the learned bias (intercept)
print(model.intercept_)

-65443.78348578896


In [20]:
# This is a very large bias!
# You can see that for our 10 numeric columns, 10 weights and 1 bias have been generated.
# These will now be used to predict outcomes.
# We can use the testing data to calculate predictions and compare them against ytest, which we already know.

predictions = model.predict(xtest)

In [22]:
# the first 10 known prices from the held-out test labels
print(ytest.head(10))

23      7957
153     6918
189    11595
107    11900
25      6692
158     7898
126    32528
149    11694
100     9549
188     9995
Name: price, dtype: int64


In [23]:
# the model's first 10 predictions, to compare with the known ytest values
print(predictions[:10])

[ 7173.4691057   8024.64883178 10967.86696674 15051.1268547
  5359.30535724  8748.21985047 23665.80747027  9661.00634694
 11515.02743173 11087.51710868]


In [26]:
# now let's calculate the error using the mean absolute error (MAE):
# the average of |actual - predicted| over the test rows, in the same units as price

from sklearn.metrics import mean_absolute_error
# named `mae` because this is the mean absolute error, not a mean squared error
mae = mean_absolute_error(ytest, predictions)
print(mae)

2229.9088558034155


In [None]:
# This is indeed a very high error.
# But given that we used plain linear regression on only the numeric columns and ended up with a very large bias (intercept), this was expected.
# In the next notebook, we will improve on this process.
