In [78]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error


# Location and column schema of the UCI Auto MPG data file (the file itself has no header row).
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = [
    'MPG',
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight',
    'Acceleration',
    'Model Year',
    'Origin',
]

# Space-separated values; '?' is read as missing, and everything from a tab
# character to the end of a line is discarded as a comment.
data = pd.read_csv(
    url,
    sep=' ',
    skipinitialspace=True,
    comment='\t',
    na_values='?',
    names=column_names,
)

# Preview the first rows to confirm the columns were parsed as expected.
data.head()


Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [79]:
# Keep only complete rows. dropna() returns a new frame, so `data` is left untouched.
df_car = data.dropna()

# (rows, columns) remaining after removing rows with missing values.
df_car.shape

(392, 8)

In [80]:
# Baseline model: every column except the target is a predictor, so "Origin"
# enters the regression as a plain number here.
feature_names = [col for col in df_car.columns if col != "MPG"]

X = df_car.loc[:, feature_names]
y = df_car.loc[:, "MPG"]

model = Pipeline(steps=[("linear_regression", LinearRegression())])
model.fit(X, y)

# assign() builds a new frame, so df_car itself never receives the prediction column.
df_car_copy = df_car.assign(predicted_MPG=model.predict(X))

Baseline: without one-hot encoding

In [81]:
# Score the baseline on the same rows it was fitted on: R^2 first, then MSE.
mpg_actual = df_car_copy["MPG"]
mpg_predicted = df_car_copy["predicted_MPG"]

print(r2_score(mpg_actual, mpg_predicted))
print(mean_squared_error(mpg_actual, mpg_predicted))

0.8214780764810599
10.847480945000449


In [82]:
# One-hot encode the categorical "Origin" code in a single, re-runnable step.
# The result is derived only from df_car, so executing this cell any number of
# times yields the same frame (no in-place mutation, no dependence on whether
# "Origin" was already dropped or the indicator columns already renamed).
origin_labels = {1: "USA", 2: "Europe", 3: "Japan"}

# One 0/1 indicator column per origin code, named after the region it stands for.
origin_dummies = (
    pd.get_dummies(df_car["Origin"])
    .astype(int)
    .rename(columns=origin_labels)
)

# Replace the numeric "Origin" column with the indicators, appended on the right.
# NOTE(review): all three indicators are kept, so they sum to 1 on every row.
# In a linear model with an intercept their individual coefficients are then
# not uniquely determined (predictions are unaffected); consider
# pd.get_dummies(..., drop_first=True) if the coefficients will be interpreted.
df_car_traf = df_car.drop(columns="Origin").join(origin_dummies)

In [86]:
# Display the transformed frame to check that "Origin" has been replaced by the
# USA / Europe / Japan indicator columns.
df_car_traf


Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,USA,Europe,Japan
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,0,0
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,0,0
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,0,0
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,0,0
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,0,0
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,0,0
394,44.0,4,97.0,52.0,2130.0,24.6,82,0,1,0
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,0,0
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,0,0


With one-hot encoding

In [87]:
# Fit the same linear model on the one-hot encoded frame and score it on the
# rows it was trained on, so the numbers are comparable with the baseline above.
target_col = "MPG"
prediction_col = "Predicted_MPG"

# Exclude the prediction column as well as the target. This cell writes its
# predictions back into df_car_traf, so on a second execution the previous
# predictions would otherwise be picked up as an input feature and leak the
# target into the model.
feature_names = [
    f for f in df_car_traf.columns if f not in (target_col, prediction_col)
]

X = df_car_traf[feature_names]
y = df_car_traf[target_col]
model = Pipeline([
    ('linear_regression', LinearRegression())
])
model.fit(X, y)
df_car_traf[prediction_col] = model.predict(X)

# R^2 first, then mean squared error (same order as the baseline cell).
print(r2_score(y, df_car_traf[prediction_col]))
print(mean_squared_error(y, df_car_traf[prediction_col]))

0.8241994699119171
10.682121627762635

