# Project Overview

 - *Objective:* Predict Miles Per Gallon of a given vehicle based on other relevant attributes provided
 - The data used for the project comes from the UCI Machine Learning Repository and can be found in the link below:
   http://archive.ics.uci.edu/ml/datasets/Auto+MPG



In [14]:
#declaring dependancies
import numpy as np
import seaborn as sns
import pandas as pd
import pickle
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error

In [16]:
# defining the column names
cols = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']
# reading the .data file using pandas
df = pd.read_csv('./auto-mpg.data', names=cols, na_values = "?",
                comment = '\t',
                sep= " ",
                skipinitialspace=True)
#making a copy of the dataframe
data = df.copy()

In [None]:
df.head()

# Data Exploratory Analysis

In [None]:
df.describe()

In [None]:
df.isnull().sum()

## Checking outliers in the Horsepower column

In [None]:
#checking the horsepower
sns.boxplot(df['Horsepower'])

## Handling the outliers problem

In [None]:
median = df['Horsepower'].median()
df['Horsepower'] = df['Horsepower'].fillna(median) 

## Categorical columns distribution

In [None]:
#2 columns are categorical cylinders and origin
df['Cylinders'].value_counts() / len(df['Cylinders'])

In [None]:
df['Origin'].value_counts() / len(df['Origin'])

# Correlation plots

In [None]:
sns.pairplot(data[["MPG", "Cylinders", "Displacement", "Weight", "Horsepower"]], diag_kind="kde")

## Train and Test data split
 - In order to ensure that the dataset is properely split, we use stratified sampeling. Here is a definition of the method: "Stratified Sampling — We create homogeneous subgroups called strata from the overall population and sample the right number of instances to each stratum to ensure that the test set is representative of the overall population"

In [None]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df, df['Cylinders']):
    train_set = df.loc[train_index]
    test_set = df.loc[test_index]

In [None]:
train_set['Cylinders'].value_counts() / len(train_set['Cylinders'])

In [None]:
test_set['Cylinders'].value_counts() / len(test_set['Cylinders'])

## One-hot-encoding

In [None]:
#replace number by country to then create a one-hot-encoding
train_set['Origin'] = train_set['Origin'].map({1: 'India', 2: 'USA', 3 : 'Germany'})
train_set = get_dummies(train_set, prefix ='', prefix_sep='')
train_set.head(10)

## Testing new variables

In [None]:
#creating new variables for testing
df['displacement_on_power'] = df['Displacement'] / df['Horsepower']
df['weight_on_cylinder'] = df['Weight'] / df['Cylinders']
df['acceleration_on_power'] = df['Acceleration'] / df['Horsepower']
df['acceleration_on_cyl'] = df['Acceleration'] / df['Cylinders']

In [None]:
correlation_matrix = df.corr()
correlation_matrix['MPG'].sort_values(ascending=False)

# Data Preparation
- This step performs the following steps:
 - adjust the origin column
 - handle the missing values 
 - add the feature engineering variables
 - adjust the ctaegorial data

In [89]:
origin_dic = {1: 'India', 2: 'USA', 3 : 'Germany'}

#standard scalar parameters
std_scalar_parameters = {'mean':np.array([5.46250000e+00, 1.97468750e+02, 1.06687500e+02, 2.94396250e+03,
       1.52875000e+01, 7.60625000e+01, 3.13020833e+00, 1.69925810e-01]), 'std':np.array([2.92359375e+00, 1.29909209e+04, 1.78433359e+03, 8.13062411e+05,
       7.26084375e+00, 1.43585938e+01, 1.33059136e+00, 6.18225592e-03])}

def _encode_origin_column(df):
    df['Origin'] = df['Origin'].map(origin_dic)
    return df

def _adjust_missing_values(df):
    numeric_columns = df.select_dtypes(['float64','int64']).columns
    df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())
    return df

def _add_custom_variables(df):
    df = df.assign(acceleration_on_cyl=df['Acceleration']/df['Cylinders'],
                   acceleration_on_power=df['Acceleration'] / df['Horsepower'])
    return df

def prepare_data(df):
    
    df = (
        df.pipe(_encode_origin_column).
        pipe(_adjust_missing_values).
        pipe(_add_custom_variables)
    )
    
    return df

def transform_columns(df):
    numeric_columns = df.select_dtypes(['float64','int64']).columns
    category_columns = ['Origin']
    
    transformer = ColumnTransformer([
                    ('standard_scalar',StandardScaler(with_mean=std_scalar_parameters['mean'],with_std=std_scalar_parameters['std']), list(numeric_columns)),
                    ('one_hot_encoding',OneHotEncoder(),category_columns)
                ])
    return transformer.fit_transform(df)

# Evaluating models

In [90]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df, df['Cylinders']):
    train_set = df.loc[train_index]
    test_set = df.loc[test_index]
    
train_data = train_set.drop('MPG',axis=1)
train_data_labels = train_set['MPG']
test_data = test_set.drop('MPG',axis=1)
test_data_labels = test_set['MPG']

In [91]:
df_train_data = prepare_data(train_data)
prepared_train_data  = transform_columns(df_train_data)
df_test_data = prepare_data(test_data)
prepared_test_data  = transform_columns(df_test_data)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [60]:
from unicodedata import category


enc = OneHotEncoder(categories=[["India", "USA", "Germany"]])
dt = enc.fit_transform(df_train_data[['Origin']])

In [61]:
enc.categories

[['India', 'USA', 'Germany']]

In [82]:
df_train_data 
numeric_columns = df_train_data .select_dtypes(['float64','int64']).columns
transformed_data = StandardScaler().fit(df_test_data[numeric_columns])
transformed_data.mean_


array([5.46250000e+00, 1.97468750e+02, 1.06687500e+02, 2.94396250e+03,
       1.52875000e+01, 7.60625000e+01, 3.13020833e+00, 1.69925810e-01])

In [85]:
transformed_data.var_

array([2.92359375e+00, 1.29909209e+04, 1.78433359e+03, 8.13062411e+05,
       7.26084375e+00, 1.43585938e+01, 1.33059136e+00, 6.18225592e-03])

In [66]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [67]:
models_dic = {'Linear Regression':LinearRegression(),
              'Decicion Tree': DecisionTreeRegressor(),
              'SVR':SVR(),
              'Random Forest':RandomForestRegressor()}

for k, v in models_dic.items():
    scores = cross_val_score(v, prepared_train_data, train_data_labels, scoring="neg_mean_squared_error", cv = 10)
    rmse_score = np.sqrt(-scores).mean()
    print('{model} rmse mean score is {score}'.format(model=k,score=rmse_score))

Linear Regression rmse mean score is 3.0757081793709324
Decicion Tree rmse mean score is 3.2343984612835825
SVR rmse mean score is 3.164117785619293
Random Forest rmse mean score is 2.5566970807567366


# Hyperparameters tunning

In [68]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid,
                           scoring='neg_mean_squared_error',
                           return_train_score=True,
                           cv=10,
                          )
grid_search.fit(prepared_train_data, train_data_labels)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [69]:
#best pararmets
final_model = grid_search.best_estimator_

In [70]:
grid_search.best_estimator_

RandomForestRegressor(max_features=8, n_estimators=30)

# Testing model

In [72]:
final_predictions = final_model.predict(prepared_test_data)
final_mse = mean_squared_error(test_data_labels, final_predictions)
final_rmse = np.sqrt(final_mse)

In [73]:
final_rmse

2.98649694923296

# Final model function

In [74]:
def predict_mpg(data, model):
    
    if type(data) == dict:
        df = pd.DataFrame(data)
    else:
        df = data
        
    prepared_data = prepare_data(df)
    prepared_data = transform_columns(prepared_data)
    
    return model.predict(prepared_data)

In [87]:
##checking it on a random sample
vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

vehicle_config = {
    'Cylinders': [4],
    'Displacement': [155.0],
    'Horsepower': [93.0],
    'Weight': [2500.0],
    'Acceleration': [15.0],
    'Model Year': [81],
    'Origin': [3]
}

In [88]:
predict_mpg(vehicle_config,final_model)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [41]:
with open('model.bin', 'wb') as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [43]:
with open('model.bin', 'rb') as f_in:
    model = pickle.load(f_in)

predict_mpg(vehicle_config, model)

array([33.16666667, 17.01      , 20.41666667])

In [37]:
import requests
vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

url = 'https://ds-miles-per-gallon-prediction.herokuapp.com/predict'
r = requests.post(url, json = vehicle_config)
r.text.strip()

'{"mpg_prediction":[33.16666666666666,17.009999999999998,20.416666666666668]}'