In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


import warnings
warnings.filterwarnings('ignore')

In [40]:
cols = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']

df = pd.read_csv('./auto-mpg.data', names=cols, na_values = "?",
                comment = '\t',
                sep= " ",
                skipinitialspace=True)

data = df.copy()

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["Cylinders"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [41]:
data = strat_train_set.drop("MPG", axis=1)
data_labels = strat_train_set["MPG"].copy()
data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,2
156,8,400.0,170.0,4668.0,11.5,75,1
395,4,135.0,84.0,2295.0,11.6,82,1
14,4,113.0,95.0,2372.0,15.0,70,3


In [42]:
def preprocess_origin_cols(df):
    df["Origin"] = df["Origin"].map({1: "India", 2: "USA", 3: "Germany"})
    return df

In [43]:
acc_ix, hpower_ix, cyl_ix = 4,2, 0

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power=True): # no *args or **kargs
        self.acc_on_power = acc_on_power
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]
        
        return np.c_[X, acc_on_cyl]

In [44]:
def num_pipeline_transformer(data):

    numerics = ['float64', 'int64']

    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler()),
        ])
    return num_attrs, num_pipeline

def pipeline_transformer(data):
 
    cat_attrs = ["Origin"]
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
        ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

In [45]:
preprocessed_df = preprocess_origin_cols(data)
prepared_data = pipeline_transformer(preprocessed_df)
prepared_data

array([[-0.85657842, -1.07804475, -1.15192977, ...,  1.        ,
         0.        ,  0.        ],
       [-0.85657842, -1.1174582 , -0.9900351 , ...,  0.        ,
         0.        ,  1.        ],
       [-0.85657842, -0.3587492 , -0.31547399, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.85657842, -0.56566984, -0.53133355, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85657842, -0.78244384, -0.23452666, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.32260746, -0.45728283,  0.44003446, ...,  1.        ,
         0.        ,  0.        ]])

In [46]:
from sklearn.linear_model import LinearRegression 

lin_reg=LinearRegression()
lin_reg.fit(prepared_data,data_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [48]:
sample_data=data.iloc[:5]
sample_labels=data_labels.iloc[:5]

sample_data_prepared=pipeline_transformer(sample_data)

print("Prediction of samples:",lin_reg.predict(sample_data_prepared))

Prediction of samples: [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]


In [49]:
print("Actual labels: " ,list(sample_labels))

Actual labels:  [32.0, 31.0, 26.0, 18.0, 26.0]


In [50]:
from sklearn.metrics import mean_squared_error

mpg_predictions=lin_reg.predict(prepared_data)
lin_mse=mean_squared_error(data_labels,mpg_predictions)
lin_rmse=np.sqrt(lin_mse)
lin_rmse

2.959040222576087

In [51]:
from sklearn.tree import DecisionTreeRegressor

tree_reg=DecisionTreeRegressor()
tree_reg.fit(prepared_data,data_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [52]:
mpg_predictions=tree_reg.predict(prepared_data)
tree_mse=mean_squared_error(data_labels,mpg_predictions)
tree_rmse=np.sqrt(tree_mse)
tree_rmse

0.0

In [59]:
from sklearn.model_selection import cross_val_score

scores=cross_val_score(tree_reg,prepared_data,data_labels,scoring="neg_mean_squared_error",cv=10)

tree_reg_rmse_scores=np.sqrt(-scores)

In [60]:
tree_reg_rmse_scores

array([3.17125764, 3.1302456 , 3.24384032, 3.34150041, 2.3002038 ,
       3.15530308, 3.13603053, 4.92706175, 4.22649096, 2.61533937])

In [61]:
tree_reg_rmse_scores.mean()

3.324727343991969

In [63]:
scores=cross_val_score(lin_reg,prepared_data,data_labels,scoring="neg_mean_squared_error",cv=10)

lin_reg_rmse_scores=np.sqrt(-scores)
lin_reg_rmse_scores.mean()

3.0757081793709324

In [65]:
from sklearn.ensemble import RandomForestRegressor
forest_reg=RandomForestRegressor()
forest_reg.fit(prepared_data,data_labels)
forest_reg_cv_scores=cross_val_score(forest_reg,
                                    prepared_data,
                                    data_labels,
                                    scoring="neg_mean_squared_error",
                                    cv=10)
forest_reg_rmse_scores=np.sqrt(-forest_reg_cv_scores)
forest_reg_rmse_scores.mean()

2.744861992746134

In [68]:
from sklearn.svm import SVR 
svm_reg=SVR(kernel='linear')
svm_reg.fit(prepared_data,data_labels)
svm_cv_scores=cross_val_score(svm_reg,
                              prepared_data,
                              data_labels,
                              scoring='neg_mean_squared_error',
                              cv=10)
svm_rmse_scores=np.sqrt(-svm_cv_scores)
svm_rmse_scores.mean()

3.08659162080283

In [72]:
from sklearn.model_selection import GridSearchCV

param_grid=[{'n_estimators':[3,10,20],'max_features':[2,4,6,8]},
            {'bootstrap':[False],'n_estimators':[3,10],'max_features':[2,3,4]}]

forest_reg=RandomForestRegressor()
grid_search=GridSearchCV(forest_reg,
                        param_grid,
                        scoring="neg_mean_squared_error",
                        return_train_score=True,
                        cv=10
                        )

grid_search.fit(prepared_data,data_labels)
grid_search.best_params_

{'max_features': 8, 'n_estimators': 20}

In [75]:
cv_scores=grid_search.cv_results_
for mean_score,params in zip(cv_scores["mean_test_score"],cv_scores["params"]):
    print(np.sqrt(-mean_score),params) 

3.4271922037023397 {'max_features': 2, 'n_estimators': 3}
3.015931480888516 {'max_features': 2, 'n_estimators': 10}
2.869988988211075 {'max_features': 2, 'n_estimators': 20}
3.270041148442743 {'max_features': 4, 'n_estimators': 3}
2.8650441510422455 {'max_features': 4, 'n_estimators': 10}
2.792422093416169 {'max_features': 4, 'n_estimators': 20}
3.0856965842743924 {'max_features': 6, 'n_estimators': 3}
2.858039526862237 {'max_features': 6, 'n_estimators': 10}
2.807609339705671 {'max_features': 6, 'n_estimators': 20}
3.044250288381176 {'max_features': 8, 'n_estimators': 3}
2.7683944555708133 {'max_features': 8, 'n_estimators': 10}
2.7444422827591453 {'max_features': 8, 'n_estimators': 20}
3.1021728819969634 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.8865625571302647 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
2.9713346310820037 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.794668610320271 {'bootstrap': False, 'max_features': 3, 'n_estimat

In [76]:
feature_importance=grid_search.best_estimator_.feature_importances_
feature_importance 

array([0.11904722, 0.41804123, 0.07144328, 0.16355269, 0.01211871,
       0.13407222, 0.03067987, 0.04675871, 0.00137187, 0.00116865,
       0.00174554])

In [78]:
extra_attrs=["acc_on_power","acc_on_cycle"] 
numerics=["float64",'int64']
num_attrs=list(data.select_dtypes(include=numerics))
attrs=num_attrs+extra_attrs
sorted(zip(attrs,feature_importance),reverse=True)

[('acc_on_power', 0.030679867649752383),
 ('acc_on_cycle', 0.04675870997381959),
 ('Weight', 0.16355269416644022),
 ('Model Year', 0.13407221798464355),
 ('Horsepower', 0.0714432834987222),
 ('Displacement', 0.41804123157127576),
 ('Cylinders', 0.11904721703481577),
 ('Acceleration', 0.012118710451786058)]

In [80]:
final_model=grid_search.best_estimator_ 

X_test=strat_test_set.drop("MPG",axis=1)
y_test=strat_test_set["MPG"].copy()

X_test_preprocessed=preprocess_origin_cols(X_test)
X_test_prepared=pipeline_transformer(X_test_preprocessed)

final_predictions=final_model.predict(X_test_prepared)
final_mse=mean_squared_error(y_test,final_predictions)
final_rmse=np.sqrt(final_mse)
final_rmse

3.0070866299460017

In [90]:
def predict_mpg(config,model):
    if type(config)==dict:
        df=pd.DataFrame(config) 
    else:
        df=config 
    
    preproc_df=preprocess_origin_cols(df) 
    prepared_df=pipeline_transformer(preproc_df)
  ##  print(prepared_df)
    y_predict=model.predict(prepared_df)
    return y_predict


In [91]:
##random sample data

vehicle_config={
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

predict_mpg(vehicle_config,final_model)

array([33.45 , 18.285, 19.345])

In [92]:
##save model
import pickle

In [95]:
with open("model.bin","wb") as f_out:
    pickle.dump(final_model,f_out)
    f_out.close()

In [96]:
with open("model.bin","rb") as f_in:
    model=pickle.load(f_in)
    
predict_mpg(vehicle_config,model)    

array([33.45 , 18.285, 19.345])