In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the dataset from 'auto-mpg.csv', resolved relative to the notebook's
# working directory, into the DataFrame used by every later cell.
df = pd.read_csv('auto-mpg.csv')

In [3]:
# Preview the first rows to check the column layout and value formats.
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


Za dalju analizu zanemaricu naziv automobila. U tabeli ispod (`df.describe()`) nedostaje kolona horsepower, sto ukazuje da nije ucitana kao numericka.

In [4]:
# Summary statistics; describe() reports numeric columns only by default, so
# any column missing from the result was not parsed as a number.
df.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0


Vrednosti u horsepower koloni su stringovi pa ih treba konvertovati u broj

In [5]:
# Inspect the Python type of the horsepower value in the row labelled 0.
type(df.loc[0, 'horsepower'])

str

In [6]:
# Naive conversion attempt, kept to show why the cleaning step that follows is
# needed: the column holds at least one non-numeric string, so astype() raises.
# The error is caught and displayed so the cell does not abort a
# "Restart Kernel -> Run All"; df is left untouched when the cast fails.
try:
    df['horsepower'] = df['horsepower'].astype('float64')
except ValueError as err:
    print('ValueError: {}'.format(err))

ValueError: could not convert string to float: '?'

In [7]:
# Drop the rows whose horsepower is the '?' placeholder, then convert the
# column to float64.
# The new frame is built with .loc + .assign rather than by assigning a column
# on a boolean-filtered slice: the latter can emit SettingWithCopyWarning, and
# attribute-style assignment (df.horsepower = ...) silently creates a plain
# attribute if the column name is ever misspelled.
# The original row labels are kept (no reset_index); later cells select by
# position with .iloc, so they are unaffected.
df = (
    df.loc[df['horsepower'] != '?']
      .assign(horsepower=lambda frame: frame['horsepower'].astype('float64'))
)

In [8]:
# Count missing values per column to confirm nothing is left to impute.
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [9]:
# Summary statistics after cleaning; horsepower is now part of the numeric
# summary.
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,23.445918,5.471939,194.41199,104.469388,2977.584184,15.541327,75.979592,1.576531
std,7.805007,1.705783,104.644004,38.49116,849.40256,2.758864,3.683737,0.805518
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.0,4.0,105.0,75.0,2225.25,13.775,73.0,1.0
50%,22.75,4.0,151.0,93.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,275.75,126.0,3614.75,17.025,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [10]:
import stats

# Feature matrix: positional columns 1..7 (cylinders through origin).
# Target vector: positional column 0 (mpg).
# astype() returns fresh float64 arrays, so later transforms never touch df.
X = df.iloc[:, 1:8].to_numpy().astype('float64')
y = df.iloc[:, 0].to_numpy().astype('float64')

# Show the column order the positional indices above refer to.
df.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name'],
      dtype='object')

In [11]:
# Print the Pearson correlation between the target and each feature column.
# X1, X2, ... follow the column order of X.
for i, column in enumerate(X.T):
    corr = stats.get_pearson_corr(y.tolist(), column.tolist())
    print('Correlation between Y and X{} is {}'.format(i + 1, corr))

Correlation between Y and X1 is -0.7776175081260218
Correlation between Y and X2 is -0.8051269467104581
Correlation between Y and X3 is -0.7784267838977751
Correlation between Y and X4 is -0.8322442148315757
Correlation between Y and X5 is 0.4233285369027873
Correlation between Y and X6 is 0.580540966090785
Correlation between Y and X7 is 0.5652087567164613


In [12]:
# Standardize features and target with the project-local `stats` helper.
# NOTE(review): this runs on the full dataset before the train/test split, so
# whatever statistics standardize() computes presumably include the test rows
# - confirm whether that leakage is acceptable.
# NOTE(review): X and y are rebound from their own previous values, so
# re-running this cell applies standardize() to already-transformed data.
X, y = stats.standardize(X,y)

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [14]:
from linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# (label, estimator) pairs for the project-local implementations, each created
# with its default constructor arguments.
models = [
    ('LinearRegression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
]

In [17]:
# Fit every model on the current split and report R^2 on the held-out rows.
# The project-local estimators take the test split in fit(), and predict() is
# called without arguments.
# NOTE(review): presumably predict() returns predictions for the X_test given
# to fit() - confirm in linear_model.
for name, model in models:
    model.fit(X_train, y_train, X_test, y_test)
    predictions = model.predict()

    r_squared = stats.get_r_squared(y_test, predictions)
    print('{} => {}'.format(name, r_squared))

LinearRegression => 0.8118881423003085
Ridge => 0.7827038041355148
Lasso => 0.8116395255518376
ElasticNet => 0.7826084772741281


In [19]:
# Rebuild the inputs from positional columns 1..6 (cylinders through
# model year), i.e. without origin, then standardize and split as before.
X = df.iloc[:, 1:7].to_numpy().astype('float64')
y = df.iloc[:, 0].to_numpy().astype('float64')
X, y = stats.standardize(X, y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

Nakon izbacivanja origin kolone, Lasso model je najefikasniji

In [20]:
# Refit the same model objects on the reduced feature set and report R^2 on
# the held-out rows.
# NOTE(review): the estimator instances are reused from the previous
# experiment - confirm that fit() starts from a clean state.
for name, model in models:
    model.fit(X_train, y_train, X_test, y_test)
    predictions = model.predict()

    r_squared = stats.get_r_squared(y_test, predictions)
    print('{} => {}'.format(name, r_squared))

LinearRegression => 0.8100139702888306
Ridge => 0.7784487612083177
Lasso => 0.8160895897807755
ElasticNet => 0.7782535884988593


In [21]:
# Rebuild the inputs from positional columns 4 and 6 only (weight and
# model year), then standardize and split as before.
X = df.iloc[:, [4, 6]].to_numpy().astype('float64')
y = df.iloc[:, 0].to_numpy().astype('float64')
X, y = stats.standardize(X, y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [22]:
# Refit the same model objects on the two-feature set and report R^2 on the
# held-out rows.
# NOTE(review): the estimator instances are reused from the previous
# experiments - confirm that fit() starts from a clean state.
for name, model in models:
    model.fit(X_train, y_train, X_test, y_test)
    predictions = model.predict()

    r_squared = stats.get_r_squared(y_test, predictions)
    print('{} => {}'.format(name, r_squared))

LinearRegression => 0.8150789318775375
Ridge => 0.8111124366647706
Lasso => 0.8221384067005659
ElasticNet => 0.8112747841787118


Weight i model year su promenljive koje najvise uticu na predikciju mpg

In [23]:
# Show the first five rows of positional columns 4 and 6 to confirm which
# features the previous experiment used.
df.iloc[:5, [4, 6]]

Unnamed: 0,weight,model year
0,3504,70
1,3693,70
2,3436,70
3,3433,70
4,3449,70


#### Testiranje sklearn modela

In [24]:
# Import scikit-learn's estimators under a module alias so they do not rebind
# LinearRegression / Ridge / Lasso / ElasticNet, which earlier in the notebook
# refer to the classes from the local `linear_model` module. With the names
# left intact, the earlier cells still work when re-run after this one.
import sklearn.linear_model as sk_linear_model

# (label, estimator) pairs for the scikit-learn reference implementations,
# each created with its default hyperparameters.
models = [
    ('LinearRegression SK', sk_linear_model.LinearRegression()),
    ('Ridge SK', sk_linear_model.Ridge()),
    ('Lasso SK', sk_linear_model.Lasso()),
    ('ElasticNet SK', sk_linear_model.ElasticNet()),
]

In [25]:
# Inputs for the scikit-learn comparison: positional columns 4 and 6 (weight
# and model year) as features, column 0 (mpg) as target.
# Unlike the earlier experiments, this cell does not call stats.standardize,
# so the estimators below are trained on the raw feature scales.
X = df.iloc[:, [4, 6]].to_numpy().astype('float64')
y = df.iloc[:, 0].to_numpy().astype('float64')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [26]:
# Fit each scikit-learn estimator on the training rows, predict the held-out
# rows, and report R^2 with the same project-local metric used for the custom
# models.
for name, model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    r_squared = stats.get_r_squared(y_test, predictions)
    print('{} => {}'.format(name, r_squared))

LinearRegression SK => 0.8131879450881866
Ridge SK => 0.8131884821864304
Lasso SK => 0.8115404748272447
ElasticNet SK => 0.8120263033765382
