In [1]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

In [2]:
#reading the .ldata file using pandas

columns = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year', 'Origin']

df = pd.read_csv('./auto-mpg.data', names = columns, na_values = "?", comment = '\t', sep = " ", skipinitialspace = True)

data = df.copy()

split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)

for train_index, test_index, in split.split(data, data["Cylinders"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [3]:
#separate feature and target variable
data = strat_train_set.drop("MPG", axis = 1)
data_labels = strat_train_set["MPG"].copy()

data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,2
156,8,400.0,170.0,4668.0,11.5,75,1
395,4,135.0,84.0,2295.0,11.6,82,1
14,4,113.0,95.0,2372.0,15.0,70,3


In [4]:
#Preprocess the Origin column in data
def preprocess_origin_column(df):
    df["Origin"] = df["Origin"].map(
        {
            1: "India",
            2: "USA",
            3: "Germany"
        }
    )
    return df

In [5]:
#creating custom attribute class 

acceleration_index, horsepower_index, cylinder_index = 4, 2, 0

class CustomAttributeAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acceleration_on_power = True):
        self.acceleration_on_power = acceleration_on_power
    def fit(self, X, y = None):
        return self
    def transform (self, X):
        acceleration_on_cylinder = X[:, acceleration_index] / X[:, cylinder_index]
        if self.acceleration_on_power == True:
            acceleration_on_power = X[:, acceleration_index] / X[:, horsepower_index]
            return np.c_[X, acceleration_on_power, acceleration_on_cylinder]
        return np.c[X, acceleration_on_cylinder]

In [6]:
#creating the pipeline

def numerical_pipeline_transformer(data):
    '''
    Processes numerical transformations of the dataframe and calls the pipeline class and creates a pipeline.
    Imputes missing values with the median value
    Adds custom attribute acceleration_on_power and acceleration_on_cylinder
    Scales numerical attributes with StandardScaler

    '''
    numerics = ['float64', 'int64']

    numerical_attributes = data.select_dtypes(include = numerics)

    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy= "median")),
        ("attrs_adder", CustomAttributeAdder()),
        ('std_scaler', StandardScaler()),
        ])
    return numerical_attributes, numerical_pipeline

In [7]:
def pipeline_transformer (data):
    '''
    Completes transformation pipeline for numerical and categorical data.
    One-hot encodes categorical data
    '''
    categorical_attributes = ["Origin"]
    numerical_attributes, numerical_pipeline = numerical_pipeline_transformer(data)

    full_pipeline = ColumnTransformer([
        ("num", numerical_pipeline, list(numerical_attributes)),
        ("cat", OneHotEncoder(), categorical_attributes),
    ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

In [8]:
#from raw data to processed data
preprocessed_df = preprocess_origin_column(data)
prepared_data = pipeline_transformer(preprocessed_df)

In [9]:
prepared_data

array([[-0.85657842, -1.07804475, -1.15192977, ...,  1.        ,
         0.        ,  0.        ],
       [-0.85657842, -1.1174582 , -0.9900351 , ...,  0.        ,
         0.        ,  1.        ],
       [-0.85657842, -0.3587492 , -0.31547399, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.85657842, -0.56566984, -0.53133355, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85657842, -0.78244384, -0.23452666, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.32260746, -0.45728283,  0.44003446, ...,  1.        ,
         0.        ,  0.        ]])

In [10]:
prepared_data[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])

In [11]:
data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,Germany
151,4,79.0,67.0,2000.0,16.0,74,USA
388,4,156.0,92.0,2585.0,14.5,82,India
48,6,250.0,88.0,3139.0,14.5,71,India
114,4,98.0,90.0,2265.0,15.5,73,USA
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,USA
156,8,400.0,170.0,4668.0,11.5,75,India
395,4,135.0,84.0,2295.0,11.6,82,India
14,4,113.0,95.0,2372.0,15.0,70,Germany


In [12]:
#selecting and training models (Linear Regression, Decision Tree, Random Forest, SVM regressor)

In [13]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [14]:
#testing predictions with the Linear Regression Model
sample_data = data.iloc[:5]
sample_labels = data_labels.iloc[:5]

sample_data_prepared = pipeline_transformer(sample_data)

print("Prediction of samples: ", lin_reg.predict(sample_data_prepared))

Prediction of samples:  [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]


In [15]:
#ctual label of the samples
print("Actual label of samples: ", list(sample_labels))

Actual label of samples:  [32.0, 31.0, 26.0, 18.0, 26.0]


In [16]:
#Root Mean Squared Error
from sklearn.metrics import mean_squared_error

mpg_predictions = lin_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_predictions) #these are the means squared error
lin_rmse = np.sqrt(lin_mse) #this finds the root meansquare error
lin_rmse

2.9590402225760872

In [17]:
#Decision Tree
from sklearn.tree import DecisionTreeRegressor

In [18]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(prepared_data, data_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [19]:
mpg_predictions = tree_reg.predict(prepared_data)
tree_mse = mean_squared_error(data_labels, mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse 

0.0

In [20]:
#The zero indicates the model overfits the data

In [21]:
#Model evaluation using cross-validation
from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    tree_reg,
    prepared_data,
    data_labels,
    scoring = "neg_mean_squared_error",
    cv = 10
)

tree_reg_rmse_scores = np.sqrt(-scores)

In [22]:
tree_reg_rmse_scores

array([2.90032326, 3.12179836, 3.26668793, 3.32387199, 2.26460537,
       3.13119386, 3.57368857, 4.99096058, 4.2603574 , 2.59403783])

In [23]:
tree_reg_rmse_scores.mean()

3.3427525144561456

In [24]:
scores = cross_val_score( 
     lin_reg,
     prepared_data,
     data_labels,
     scoring = "neg_mean_squared_error",
     cv = 10
)

lin_reg_rmse_scores = np.sqrt(-scores)
lin_reg_rmse_scores

array([3.43254597, 3.45157629, 3.6621715 , 2.59652976, 2.48023405,
       2.74798115, 3.32524647, 2.42208917, 3.78133275, 2.8573747 ])

In [25]:
lin_reg_rmse_scores.mean()

3.075708179370933

In [26]:
#Random Forest Model
from sklearn.ensemble import RandomForestRegressor

In [27]:
forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores = cross_val_score( 
    forest_reg,
    prepared_data,
    data_labels,
    scoring = 'neg_mean_squared_error',
    cv = 10
)

forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)


In [28]:
forest_reg_rmse_scores.mean()

2.7289837027131165

In [29]:
#Support vector machine regressor

from sklearn.svm import SVR


In [30]:
svm_reg = SVR(kernel = 'linear')
svm_reg.fit(prepared_data, data_labels)
svm_cv_scores = cross_val_score(svm_reg, prepared_data, data_labels, scoring = 'neg_mean_squared_error', cv = 10)

svm_rmse_scores = np.sqrt(-svm_cv_scores)


In [31]:
svm_rmse_scores.mean()

3.08659162080283

In [32]:
#So far Random Forest performs the best due to lowest rmse mean score 

In [33]:
#Hyperparameter tuning  of the Random Forest using GridSearchCV

In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
param_grid = [ 
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3,10], 'max_features': [2, 3,4]},
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, scoring = 'neg_mean_squared_error', return_train_score=True, cv = 10)

grid_search.fit(prepared_data, data_labels)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_features': [2, 4, 6, 8],

In [36]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [37]:
cv_scores = grid_search.cv_results_

for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores["params"]):
    print(np.sqrt(-mean_score),params)

3.51465286563239 {'max_features': 2, 'n_estimators': 3}
3.0816848389380898 {'max_features': 2, 'n_estimators': 10}
2.8664177894012988 {'max_features': 2, 'n_estimators': 30}
3.0843080220436425 {'max_features': 4, 'n_estimators': 3}
2.901622838057976 {'max_features': 4, 'n_estimators': 10}
2.73033841810104 {'max_features': 4, 'n_estimators': 30}
3.2641194664082573 {'max_features': 6, 'n_estimators': 3}
2.913702177189441 {'max_features': 6, 'n_estimators': 10}
2.72471972670558 {'max_features': 6, 'n_estimators': 30}
3.0581546026524995 {'max_features': 8, 'n_estimators': 3}
2.7057133820233763 {'max_features': 8, 'n_estimators': 10}
2.7039400232246957 {'max_features': 8, 'n_estimators': 30}
3.0509057435530065 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.8247267914863694 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.0431092038956447 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.9410906931627943 {'bootstrap': False, 'max_features': 3, 'n_estimat

In [38]:
#note: Max-Features = 8, n_estimators 30 is lowest

In [39]:
#checking feature importance

In [40]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.13477121, 0.26346519, 0.15892413, 0.22154053, 0.01517216,
       0.12192137, 0.02209112, 0.05590248, 0.00211967, 0.00250131,
       0.00159084])

In [41]:
extra_attributes = ["acceleration_on_power", "acceleration_on_cylinder"]
numerics = ['float64', 'int64']
numerical_attributes = list(data.select_dtypes(include = numerics))

attributes = numerical_attributes + extra_attributes
sorted(zip(attributes, feature_importances), reverse=True)

[('acceleration_on_power', 0.022091120537969914),
 ('acceleration_on_cylinder', 0.05590247907289417),
 ('Weight', 0.22154052681729497),
 ('Model Year', 0.12192136739918782),
 ('Horsepower', 0.15892413014460505),
 ('Displacement', 0.2634651852000586),
 ('Cylinders', 0.13477121256514996),
 ('Acceleration', 0.015172157884237644)]

In [42]:
#Evaluating the entire system on test data
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("MPG", axis = 1)
y_test = strat_test_set["MPG"].copy()

X_test_preprocessed = preprocess_origin_column(X_test)
X_test_prepared = pipeline_transformer(X_test_preprocessed)

final_prediction = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_prediction)
final_rmse = np.sqrt(final_mse)



In [43]:
final_rmse

2.9152637816613898

In [46]:
#creating a funciton to cover this entire flow 

def predict_mpg(config, model):
    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config

    preproc_df = preprocess_origin_column(df)
    prepared_df = pipeline_transformer(preproc_df)
    print(prepared_df)
    y_pred = model.predict(prepared_df)
    return y_pred 

In [47]:
#checking in on a random sample
vehicle_config = { 
    'Cylinders' : [4, 6, 8],
    'Displacement' : [155.0, 160.0, 165.5],
    'Horsepower' : [93.0, 130.0, 98.0],
    'Weight' : [2500.0, 3150.0,  2600.0],
    'Acceleration' : [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin' : [3, 2, 1]
}

predict_mpg(vehicle_config, final_model)

[[-1.22474487 -1.20484922 -0.85412443 -0.87481777  0.          1.06904497
   0.6684025   1.39127885  1.          0.          0.        ]
 [ 0.         -0.0388661   1.40320441  1.39970842 -1.22474487  0.26726124
  -1.41351982 -0.47596382  0.          0.          1.        ]
 [ 1.22474487  1.24371532 -0.54907999 -0.52489066  1.22474487 -1.33630621
   0.74511732 -0.91531503  0.          1.          0.        ]]


array([36.07      , 16.90333333, 20.32      ])

In [48]:
#Save the Model

In [49]:
import pickle

In [50]:
with open("model.bin", "wb") as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [51]:
#loading model from the saved file
with open('model.bin', 'rb') as f_in:
    model = pickle.load(f_in)

    

In [52]:
predict_mpg(vehicle_config, model)

[[-1.22474487 -1.20484922 -0.85412443 -0.87481777  0.          1.06904497
   0.6684025   1.39127885  1.          0.          0.        ]
 [ 0.         -0.0388661   1.40320441  1.39970842 -1.22474487  0.26726124
  -1.41351982 -0.47596382  0.          0.          1.        ]
 [ 1.22474487  1.24371532 -0.54907999 -0.52489066  1.22474487 -1.33630621
   0.74511732 -0.91531503  0.          1.          0.        ]]


array([36.07      , 16.90333333, 20.32      ])