# Objective

Using this toy dataset, I would like to generate a predictive model of MPG.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the MPG dataset from the CSV file in the working directory,
# then preview the first five rows to sanity-check the parse.
df = pd.read_csv(filepath_or_buffer='mpg.csv')

print(df.head(n=5))

In [None]:
# DataFrame.info() writes its summary (columns, non-null counts, dtypes) to stdout
# itself and returns None, so wrapping it in print() only appended a stray "None".
df.info()

Some missing values in the `horsepower` column are encoded as a question mark (`?`). I would like to replace these with the sentinel value -99999.

In [None]:
# Swap the '?' placeholder in `horsepower` for the sentinel -99999, then cast the
# column to int64 so it is treated as numeric from here on.
df['horsepower'] = df['horsepower'].replace('?', -99999).astype('int64')

# info() prints directly and returns None; calling it bare avoids the extra "None" line.
df.info()
print(df.describe())

# EDA

In [None]:
# Pairwise correlation matrix of the numeric columns.
# Selecting the numeric/bool columns explicitly keeps this working on pandas >= 2.0,
# where DataFrame.corr() no longer silently drops non-numeric columns.
# NOTE(review): at this point `horsepower` still contains the -99999 sentinel (those
# rows are only filtered out later, in the Modeling section), so every correlation
# involving horsepower is distorted here.
corr = df.select_dtypes(include=['number', 'bool']).corr()

# The all-False mask hides no cells. The builtin `bool` replaces the `np.bool` alias,
# which was deprecated in NumPy 1.20 and removed in 1.24.
sns.heatmap(
    corr,
    mask=np.zeros_like(corr, dtype=bool),
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    square=True,
)
plt.title("Correlation Heatmap")
plt.show()

In this heatmap we see that many of the variables are strongly correlated with each other, so I think it might be sufficient to model this with linear regression.

In [None]:
# Print the correlation matrix as text to read off the coefficients behind the heatmap.
print(corr)

In [None]:
# For every ordered pair of distinct variables whose |correlation| is at least 0.8,
# print the pair with its coefficient and draw a labelled scatter plot.
# The diagonal is skipped by position (i != j) rather than by comparing the float
# coefficient to 1, so a genuinely perfect correlation between two different
# variables is still shown. Both (a, b) and (b, a) are visited, as before.
for i, row_name in enumerate(corr.index):
    for j, col_name in enumerate(corr.columns):
        r = corr.iloc[i, j]
        # A NaN coefficient fails the >= comparison and is therefore skipped.
        if i != j and np.abs(r) >= 0.8:
            print(row_name, col_name, r)
            # A fresh figure per pair, with axis labels so each plot stands alone.
            fig, ax = plt.subplots()
            ax.scatter(df[row_name], df[col_name])
            ax.set_xlabel(row_name)
            ax.set_ylabel(col_name)
            ax.set_title("{} vs {} (r = {:.2f})".format(row_name, col_name, r))
            plt.show()
            # Release the figure so many pairs do not accumulate open figures.
            plt.close(fig)

Here are scatter plots of the pairs of variables whose correlation coefficient has an absolute value of at least 0.8.

- mpg and displacement: as mpg increases, displacement decreases
- mpg and weight: as mpg increases, weight decreases
- cylinders and displacement: as cylinders increases, displacement increases
- cylinders and weight: as cylinders increases, weight increases
- displacement and weight: as displacement increases, weight increases
- weight and mpg: as weight increases, mpg decreases

# Modeling

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVR

# Drop the rows whose horsepower was missing (marked earlier with the -99999 sentinel).
df = df[df['horsepower'] != -99999]

print(df.describe())

# Features are every column except the target `mpg`, standardised column-wise.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X = np.array(df.drop(columns=['mpg']))
X = preprocessing.scale(X)

y = np.array(df['mpg'])

# Hold out a test split. Every model cell below uses X_train / X_test / y_train /
# y_test, which were previously never defined (NameError on a fresh kernel).
# The 80/20 ratio is a choice made here; the fixed random_state makes the split,
# and therefore the reported scores, reproducible across runs.
# NOTE(review): scaling is computed on all rows before splitting, so test-set
# statistics leak into the scaler; consider fitting a StandardScaler on X_train only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression

In [None]:
# Fit ordinary least squares on the training split, then report the estimator's
# score on the held-out split. For scikit-learn regressors .score() is R^2,
# not a classification accuracy, despite the variable name.
lr = LinearRegression().fit(X_train, y_train)

acc = lr.score(X_test, y_test)
print(acc)

In [None]:
# Linear-regression predictions for the held-out samples.
preds = lr.predict(X_test)

# Overlay predictions (blue) and true targets (red), indexed by position in the test set.
sample_idx = range(len(X_test))
fig, ax = plt.subplots()
ax.plot(sample_idx, preds, 'b', label='preds')
ax.plot(sample_idx, y_test, 'r', label='actual')
ax.legend()
plt.show()

# K Neighbors Regressor

In [None]:
# k-nearest-neighbours regression with 5 neighbours and p=1 (Manhattan distance
# under the default Minkowski metric). Score is evaluated on the held-out split.
knn = KNeighborsRegressor(n_neighbors=5, p=1).fit(X_train, y_train)

acc = knn.score(X_test, y_test)
print(acc)

In [None]:
# KNN predictions for the held-out samples.
preds = knn.predict(X_test)

# Overlay predictions (blue) and true targets (red), indexed by position in the test set.
sample_idx = range(len(X_test))
fig, ax = plt.subplots()
ax.plot(sample_idx, preds, 'b', label='preds')
ax.plot(sample_idx, y_test, 'r', label='actual')
ax.legend()
plt.show()

# Support Vector Regressor

In [None]:
# Support vector regression with a linear kernel and a large penalty term (C=1e5).
# Score is evaluated on the held-out split.
sv = SVR(C=1e5, kernel='linear').fit(X_train, y_train)

acc = sv.score(X_test, y_test)
print(acc)

In [None]:
# SVR predictions for the held-out samples.
preds = sv.predict(X_test)

# Overlay predictions (blue) and true targets (red), indexed by position in the test set.
sample_idx = range(len(X_test))
fig, ax = plt.subplots()
ax.plot(sample_idx, preds, 'b', label='preds')
ax.plot(sample_idx, y_test, 'r', label='actual')
ax.legend()
plt.show()

# Neural Network

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam, SGD
from keras.callbacks import EarlyStopping

# One network input per feature column.
in_shape = (X_train.shape[1],)

# Small fully connected network: two 20-unit sigmoid hidden layers followed by a
# single-unit output layer with no activation argument, trained for regression.
model = Sequential()
model.add(Dense(20, activation='sigmoid', input_shape=in_shape))
model.add(Dense(20, activation='sigmoid'))
model.add(Dense(1))

# Stop once the monitored quantity has not improved for 5 consecutive epochs.
# NOTE(review): this monitors the *training* loss even though validation data is
# supplied below; confirm whether 'val_loss' was intended.
early_stopping = EarlyStopping(monitor='loss', patience=5)

# `learning_rate` is the supported spelling; the old `lr` keyword is deprecated and
# rejected by current Keras releases.
model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.001))

# batch_size=1 updates the weights after every sample; the test split doubles as
# validation data so its loss is recorded per epoch in the returned history.
mod_hist = model.fit(
    X_train,
    y_train,
    epochs=1000,
    batch_size=1,
    callbacks=[early_stopping],
    verbose=1,
    validation_data=(X_test, y_test),
)

In [None]:
# Overlay the network's predictions (blue) and the true targets (red),
# indexed by position in the test set.
sample_idx = range(len(X_test))
fig, ax = plt.subplots()
ax.plot(sample_idx, model.predict(X_test), 'b', label='predicted')
ax.plot(sample_idx, y_test, 'r', label='actual')
ax.legend()
plt.show()

In [None]:
# Per-epoch learning curves recorded by model.fit: validation loss in blue,
# training loss in red.
history = mod_hist.history
fig, ax = plt.subplots()
ax.plot(history['val_loss'], c='b', label='val_loss')
ax.plot(history['loss'], c='r', label='loss')
ax.legend()
plt.show()

# Random Forest Regressor