In [97]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import BayesianRidge


In [94]:
# --- Load -------------------------------------------------------------------
# Path is relative to the notebook's working directory.
df = pd.read_csv('./auto_1993_adj.csv')

# --- Clean ------------------------------------------------------------------
# Author's note: df.isnull().sum() showed 6 rows missing a horsepower value.
# dropna() removes every row that has a missing value in ANY column; the index
# is rebuilt so it is a clean 0..n-1 range for the positional re-join below.
df_dropnull = df.dropna().reset_index(drop=True)

# The original cell evaluated `duplicated().value_counts()` mid-cell, where the
# result is neither displayed nor checked. Enforce the "no duplicates" finding
# so a changed input file fails loudly on Restart & Run All.
# NOTE(review): the check includes the ID column; if ID is unique per row this
# can never flag repeated measurements -- confirm whether ID should be excluded.
assert not df_dropnull.duplicated().any(), "unexpected duplicate rows in input data"

# --- Scale quantitative features ---------------------------------------------
# Non-quantity columns are held out of scaling and re-attached afterwards.
cat_columns = ["ID", "model_year", "origin"]
cat_df = df_dropnull[cat_columns]
df_subset = df_dropnull.drop(columns=cat_columns)

# NOTE(review): the scaler is fit on every row (and on the mpg target) before
# any train/test split, so held-out rows influence the scaling statistics.
# Consider fitting the scaler on the training rows only.
scaler = preprocessing.StandardScaler()
df_scaled_vals = scaler.fit_transform(df_subset)
df_scaled = pd.DataFrame(
    df_scaled_vals,
    columns=df_subset.columns,
    index=df_subset.index,  # make the row alignment with cat_df explicit
)

# --- Reassemble --------------------------------------------------------------
# Restore the non-quantity features (minus the ID column) in front of the
# scaled columns.
df_final = pd.concat([cat_df.drop(columns="ID"), df_scaled], axis=1)
assert len(df_final) == len(df_dropnull), "row count changed while re-joining columns"

print(df_final.head())

# Optional EDA -- uncomment to plot distributions and the correlation heatmap.
# df_scaled.hist(bins=50, figsize=(15,10))
# corr_matrix = df_scaled.corr()
# plt.figure(figsize=(12,10))
# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
# plt.show()

   model_year  origin  displacement  cylinders  horsepower    weight  \
0          70       1      1.077290   1.483947    0.664133  0.620540   
1          70       1      1.488732   1.483947    1.574594  0.843334   
2          70       1      1.182542   1.483947    1.184397  0.540382   
3          70       1      1.048584   1.483947    1.184397  0.536845   
4          70       1      1.029447   1.483947    0.924265  0.555706   

   acceleration       mpg  
0     -1.285258 -0.698638  
1     -1.466724 -1.083498  
2     -1.648189 -0.698638  
3     -1.285258 -0.955212  
4     -1.829655 -0.826925  


In [99]:
# One seed for both the split and the stochastic LinearSVR solver, so that
# Restart & Run All reproduces the reported metrics.
RANDOM_STATE = 16

# Separate the target (mpg) from the predictors.
X = df_final.drop(columns=["mpg"])
y = df_final["mpg"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE
)


def fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` and score it on a held-out set.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_fit, y_fit : data the model is fitted on.
    X_eval, y_eval : data the predictions are made for and scored against.

    Returns
    -------
    tuple
        ``(predictions, mae, mse, r2)`` where the three scores come from
        ``mean_absolute_error``, ``mean_squared_error`` and ``r2_score``
        evaluated on ``y_eval`` versus the predictions.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    return (
        predictions,
        mean_absolute_error(y_eval, predictions),
        mean_squared_error(y_eval, predictions),
        r2_score(y_eval, predictions),
    )


# Feature Set 1: year, origin, displacement, cylinders, horsepower, weight, accel.
# NOTE(review): model_year and origin appear to reach the models unscaled while
# the remaining features are standardized; KNN distances and the regularized
# linear models are sensitive to feature scale -- consider scaling/encoding them.
knn_model = KNeighborsRegressor()
# random_state pins the solver's data shuffling; without it the LinearSVR
# scores vary from run to run.
lsvr_model = LinearSVR(random_state=RANDOM_STATE)
ridge_model = BayesianRidge()

knn_pred, mae_knn, mse_knn, r2_knn = fit_and_score(
    knn_model, X_train, y_train, X_test, y_test
)
lsvr_pred, mae_svr, mse_svr, r2_svr = fit_and_score(
    lsvr_model, X_train, y_train, X_test, y_test
)
ridge_pred, mae_ridge, mse_ridge, r2_ridge = fit_and_score(
    ridge_model, X_train, y_train, X_test, y_test
)

# Metrics (scores are in the units of the standardized mpg column of df_final).
print("MAE, MSE, R^2")
for label, scores in (
    ("KNN", (mae_knn, mse_knn, r2_knn)),
    ("LinearSVR", (mae_svr, mse_svr, r2_svr)),
    ("Ridge", (mae_ridge, mse_ridge, r2_ridge)),
):
    print(label)
    print(*scores)

0    -0.698638
1    -1.083498
2    -0.698638
3    -0.955212
4    -0.826925
5    -1.083498
6    -1.211785
7    -1.211785
8    -1.211785
9    -1.083498
10   -1.083498
11   -1.211785
12   -1.083498
13   -1.211785
14    0.071081
15   -0.185492
16   -0.698638
17   -0.313779
18    0.455941
19    0.327654
Name: mpg, dtype: float64
MAE, MSE, R^2
KNN
0.29762493648293814 0.16451838760677595 0.8631842914901408
LinearSVR
0.541917006878063 0.41926094366303535 0.6513369484578646
Ridge
0.34405096783046735 0.21606966964087823 0.8203135505432682


