# CAR MPG MODEL ANALYSIS

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

In [2]:
# Column labels for the UCI "auto-mpg.data-original" file; they are passed to
# read_csv explicitly via `names=`.
column_names = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'year', 'origin', 'name',
]

# Fields are separated by runs of whitespace. The separator is written as a raw
# string so that `\s` reaches pandas as a regex token instead of being parsed as
# an (invalid) Python string escape, which newer interpreters warn about.
cars = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data-original',
    sep=r'\s+',
    names=column_names,
)

In [3]:
# Preview the first rows to confirm the columns were parsed as expected.
cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0,chevrolet chevelle malibu
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0,buick skylark 320
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0,plymouth satellite
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0,amc rebel sst
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0,ford torino


In [4]:
# Number of cars per model year, most frequent year first.
cars['year'].value_counts()

73.0    40
78.0    36
70.0    35
76.0    34
82.0    31
81.0    30
75.0    30
80.0    29
79.0    29
71.0    29
77.0    28
72.0    28
74.0    27
Name: year, dtype: int64

In [5]:
# Count the missing values in each column.
cars.isna().sum()

mpg             8
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [6]:
# (rows, columns) of the frame as loaded.
cars.shape

(406, 9)

In [7]:
# Remove, in place, every row that has at least one missing value.
cars.dropna(inplace=True)

In [8]:
# Re-check (rows, columns) now that rows with missing values are gone.
cars.shape

(392, 9)

In [9]:
# Predictors are the columns between `mpg` and `name`; the target is `mpg`.
# Columns are selected by label rather than by position so that this cell keeps
# producing the same matrix even after extra columns have been added to `cars`
# (a positional `1:-1` slice would then start including the string `name` column).
X = cars[column_names[1:-1]]
y = cars['mpg']

## MODEL SELECTION

### Linear Model without Penalty

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score

# Ordinary least squares on the unscaled predictors, fitted on every row.
lm = LinearRegression()
lm.fit(X, y)
print({name: coef for name, coef in zip(column_names[1:-1], lm.coef_)})
print(lm.intercept_)

# The scorer returns the negated MSE of each fold (so that higher is better);
# flip the sign back before taking the square root to get a per-fold RMSE.
scores = cross_val_score(lm, X, y, scoring='neg_mean_squared_error', cv=10)
mse_scores = np.negative(scores)
rmse_scores = np.sqrt(mse_scores)

# Average RMSE over the 10 folds.
print(np.mean(rmse_scores))

### Linear model using Regularization / Shrinkage

In [None]:
from sklearn.linear_model import Lasso
from sklearn import preprocessing

In [None]:
# L1-penalised linear regression on the raw (unscaled) predictors.
lasso = Lasso(alpha=0.1)
lasso.fit(X, y)

print({name: coef for name, coef in zip(column_names[1:-1], lasso.coef_)})
print(lasso.intercept_)

# 10-fold cross-validation; the scorer yields negated MSE values, so negate
# them again and take the square root to obtain the RMSE of each fold.
scores = cross_val_score(lasso, X, y, scoring='neg_mean_squared_error', cv=10)
mse_scores = np.negative(scores)
rmse_scores = np.sqrt(mse_scores)

print(np.mean(rmse_scores))

In [None]:
# The predictors are measured in very different units (e.g. weight vs. cylinders),
# so standardise every column before applying the Lasso penalty.
X_scaled = preprocessing.scale(X)

# Model fitted on all rows; used for coefficient inspection and in-sample prediction.
lasso = Lasso(alpha=0.1)
lasso.fit(X_scaled, y)

print(dict(zip(column_names[1:-1], lasso.coef_)))
print(lasso.intercept_)

# For cross-validation the scaler must be fitted on the training part of each
# fold only. Scaling the whole matrix up front lets the held-out rows influence
# the mean/variance used for training, so the CV runs on a scaler+Lasso pipeline
# over the unscaled X instead.
cv_model = make_pipeline(preprocessing.StandardScaler(), Lasso(alpha=0.1))
scores = cross_val_score(cv_model, X, y, cv=10, scoring='neg_mean_squared_error')
mse_scores = -scores  # scorer returns negated MSE
rmse_scores = np.sqrt(mse_scores)

print(rmse_scores.mean())

In [None]:
# Store the Lasso predictions for the scaled feature matrix alongside the observed values.
# NOTE(review): this adds a column derived from a model of `mpg` to `cars`; any
# feature matrix built from `cars` afterwards must exclude it, otherwise the
# target leaks into the predictors.
cars['prediction'] = lasso.predict(X_scaled)

In [None]:
# Inspect the frame again, now including the `prediction` column.
cars.head()

In [None]:
# One-hot encode the car `name` column (indicator columns are prefixed `cat_`),
# leaving out the first indicator column.
car_name_dummies = pd.get_dummies(cars['name'], prefix='cat', drop_first=True)

In [None]:
# Preview the indicator columns generated from the car names.
car_name_dummies.head()

## Include Car Names

In [None]:
# Append the indicator columns to the right of the original frame
# (column-wise concatenation, rows aligned on the index).
cars3 = pd.concat(objs=[cars, car_name_dummies], axis='columns')

In [None]:
# The raw string column is dropped from `cars3`, leaving only its indicator columns.
cars3.drop(columns=['name'], inplace=True)

In [None]:
# Feature matrix: every column of `cars3` except the target and the `prediction`
# column written by an earlier cell. `prediction` comes from a model fitted on
# `mpg`, so keeping it as a feature would leak the target into the predictors.
# `errors='ignore'` keeps the cell runnable when that column is absent.
X = cars3.drop(columns=['mpg', 'prediction'], errors='ignore')
y = cars3['mpg']

# Standardise all columns (numeric predictors and name indicators) for the full-data fit.
X_scaled = preprocessing.scale(X)

lasso = Lasso(alpha=0.1)
lasso.fit(X_scaled, y)

# Label the coefficients with the actual columns of X: the seven original names
# no longer cover the indicator columns, and zipping against them silently drops
# every other coefficient. Only the coefficients Lasso left non-zero are shown.
print({name: coef for name, coef in zip(X.columns, lasso.coef_) if coef != 0})
print(lasso.intercept_)

# Cross-validate a scaler+Lasso pipeline on the unscaled X so that the scaler is
# fitted on each training fold only and never sees the held-out rows.
cv_model = make_pipeline(preprocessing.StandardScaler(), Lasso(alpha=0.1))
scores = cross_val_score(cv_model, X, y, cv=10, scoring='neg_mean_squared_error')
mse_scores = -scores  # scorer returns negated MSE
rmse_scores = np.sqrt(mse_scores)

print(rmse_scores.mean())

In [None]:
# Number of feature columns in X.
X.shape[1]

## MODEL SELECTION
### Nearest Neighbors, RandomForest, XGBoost

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Feature matrix: every column of `cars3` except the target and the `prediction`
# column added by an earlier cell (it is derived from a model of `mpg`, so using
# it as a feature would leak the target). `errors='ignore'` keeps the cell
# runnable when that column is absent.
X = cars3.drop(columns=['mpg', 'prediction'], errors='ignore')
y = cars3['mpg']

# Kept for parity with the earlier cells: a standardised copy of the full matrix.
X_scaled = preprocessing.scale(X)

# Nearest-neighbour distances depend on the column scales, so the regressor is
# wrapped in a pipeline that refits the scaler on each training fold; scaling the
# whole matrix before cross-validation would expose the held-out rows to training.
knn = make_pipeline(preprocessing.StandardScaler(), KNeighborsRegressor(n_neighbors=10))

scores = cross_val_score(knn, X, y, cv=10, scoring='neg_mean_squared_error')
mse_scores = -scores  # scorer returns negated MSE
rmse_scores = np.sqrt(mse_scores)

print(rmse_scores.mean())

In [None]:
# Feature matrix: every column of `cars3` except the target and the `prediction`
# column added by an earlier cell (it is derived from a model of `mpg`, so using
# it as a feature would leak the target). `errors='ignore'` keeps the cell
# runnable when that column is absent.
X = cars3.drop(columns=['mpg', 'prediction'], errors='ignore')
y = cars3['mpg']

# Fix the seed so that the reported score is the same on every run.
rf = RandomForestRegressor(random_state=42)

scores = cross_val_score(rf, X, y, cv=10, scoring='neg_mean_squared_error')
mse_scores = -scores  # scorer returns negated MSE
rmse_scores = np.sqrt(mse_scores)

print(rmse_scores.mean())

In [None]:
import xgboost

In [None]:
# Build the feature matrix explicitly instead of reusing whatever X the last
# executed cell left behind. The target and the `prediction` column (derived from
# a model of `mpg`) are excluded so the target cannot leak into the features;
# `errors='ignore'` keeps the cell runnable when `prediction` is absent.
X = cars3.drop(columns=['mpg', 'prediction'], errors='ignore')
y = cars3['mpg']

xr = xgboost.XGBRegressor()
scores = cross_val_score(xr, X, y, cv=10, scoring='neg_mean_squared_error')
mse_scores = -scores  # scorer returns negated MSE
rmse_scores = np.sqrt(mse_scores)

print(rmse_scores.mean())