In [1]:
from sklearn.linear_model import LinearRegression 
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
data = pd.read_csv('auto-mpg.csv')
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [3]:
data = data.replace('?', np.nan)
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [4]:
data = data.dropna()

In [5]:
data.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [6]:
data['horsepower'] = pd.to_numeric(data['horsepower'], errors='coerce')


In [7]:
data.drop(['car name'], axis=1, inplace=True)

In [8]:
data['model year'] += 1900

In [9]:
data['age'] = datetime.datetime.now().year - data['model year']
data.drop(['model year'], axis=1, inplace=True)
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,origin,age
0,18.0,8,307.0,130,3504,12.0,1,50
1,15.0,8,350.0,165,3693,11.5,1,50
2,18.0,8,318.0,150,3436,11.0,1,50
3,16.0,8,304.0,150,3433,12.0,1,50
4,17.0,8,302.0,140,3449,10.5,1,50
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,1,38
394,44.0,4,97.0,52,2130,24.6,2,38
395,32.0,4,135.0,84,2295,11.6,1,38
396,28.0,4,120.0,79,2625,18.6,1,38


In [10]:
data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower        int64
weight            int64
acceleration    float64
origin            int64
age               int64
dtype: object

In [11]:
data.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,origin,age
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,23.445918,5.471939,194.41199,104.469388,2977.584184,15.541327,1.576531,44.020408
std,7.805007,1.705783,104.644004,38.49116,849.40256,2.758864,0.805518,3.683737
min,9.0,3.0,68.0,46.0,1613.0,8.0,1.0,38.0
25%,17.0,4.0,105.0,75.0,2225.25,13.775,1.0,41.0
50%,22.75,4.0,151.0,93.5,2803.5,15.5,1.0,44.0
75%,29.0,8.0,275.75,126.0,3614.75,17.025,2.0,47.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,3.0,50.0


In [12]:
x = data.drop(['mpg'], axis=1)
y = data['mpg']

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [14]:
model = LinearRegression(normalize=True).fit(x_train, y_train)
print('Training Score:', model.score(x_train, y_train))

Training Score: 0.8192992298286901


In [15]:
y_pred = model.predict(x_test)
print('Test Score:', r2_score(y_test, y_pred))

Test Score: 0.8225072883164073
