In [130]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

In [131]:
#reading the .ldata file using pandas

columns = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year', 'Origin']

df = pd.read_csv('./auto-mpg.data', names = columns, na_values = "?", comment = '\t', sep = " ", skipinitialspace = True)

data = df.copy()

split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)

for train_index, test_index, in split.split(data, data["Cylinders"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [132]:
#separate feature and target variable
data = strat_train_set.drop("MPG", axis = 1)
data_labels = strat_train_set["MPG"].copy()

data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,2
156,8,400.0,170.0,4668.0,11.5,75,1
395,4,135.0,84.0,2295.0,11.6,82,1
14,4,113.0,95.0,2372.0,15.0,70,3


In [133]:
#Preprocess the Origin column in data
def preprocess_origin_column(df):
    df["Origin"] = df["Origin"].map(
        {
            1: "India",
            2: "USA",
            3: "Germany"
        }
    )
    return df

In [134]:
#creating custom attribute class 

acceleration_index, horsepower_index, cylinder_index = 4, 2, 0

class CustomAttributeAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acceleration_on_power = True):
        self.acceleration_on_power = acceleration_on_power
    def fit(self, X, y = None):
        return self
    def transform (self, X):
        acceleration_on_cylinder = X[:, acceleration_index] / X[:, cylinder_index]
        if self.acceleration_on_power == True:
            acceleration_on_power = X[:, acceleration_index] / X[:, horsepower_index]
            return np.c_[X, acceleration_on_power, acceleration_on_cylinder]
        return np.c[X, acceleration_on_cylinder]

In [135]:
#creating the pipeline

def numerical_pipeline_transformer(data):
    '''
    Processes numerical transformations of the dataframe and calls the pipeline class and creates a pipeline.
    Imputes missing values with the median value
    Adds custom attribute acceleration_on_power and acceleration_on_cylinder
    Scales numerical attributes with StandardScaler

    '''
    numerics = ['float64', 'int64']

    numerical_attributes = data.select_dtypes(include = numerics)

    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy= "median")),
        ("attrs_adder", CustomAttributeAdder()),
        ('std_scaler', StandardScaler()),
        ])
    return numerical_attributes, numerical_pipeline

In [136]:
def pipeline_transformer (data):
    '''
    Completes transformation pipeline for numerical and categorical data.
    One-hot encodes categorical data
    '''
    categorical_attributes = ["Origin"]
    numerical_attributes, numerical_pipeline = numerical_pipeline_transformer(data)

    full_pipeline = ColumnTransformer([
        ("num", numerical_pipeline, list(numerical_attributes)),
        ("cat", OneHotEncoder(), categorical_attributes),
    ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

In [137]:
#from raw data to processed data
preprocessed_df = preprocess_origin_column(data)
prepared_data = pipeline_transformer(preprocessed_df)

In [138]:
prepared_data

array([[-0.85657842, -1.07804475, -1.15192977, ...,  1.        ,
         0.        ,  0.        ],
       [-0.85657842, -1.1174582 , -0.9900351 , ...,  0.        ,
         0.        ,  1.        ],
       [-0.85657842, -0.3587492 , -0.31547399, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.85657842, -0.56566984, -0.53133355, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85657842, -0.78244384, -0.23452666, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.32260746, -0.45728283,  0.44003446, ...,  1.        ,
         0.        ,  0.        ]])

In [139]:
prepared_data[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])

In [140]:
data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,Germany
151,4,79.0,67.0,2000.0,16.0,74,USA
388,4,156.0,92.0,2585.0,14.5,82,India
48,6,250.0,88.0,3139.0,14.5,71,India
114,4,98.0,90.0,2265.0,15.5,73,USA
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,USA
156,8,400.0,170.0,4668.0,11.5,75,India
395,4,135.0,84.0,2295.0,11.6,82,India
14,4,113.0,95.0,2372.0,15.0,70,Germany


In [141]:
#selecting and training models (Linear Regression, Decision Tree, Random Forest, SVM regressor)

In [142]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [143]:
#testing predictions with the Linear Regression Model
sample_data = data.iloc[:5]
sample_labels = data_labels.iloc[:5]

sample_data_prepared = pipeline_transformer(sample_data)

print("Prediction of samples: ", lin_reg.predict(sample_data_prepared))

Prediction of samples:  [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]


In [144]:
#ctual label of the samples
print("Actual label of samples: ", list(sample_labels))

Actual label of samples:  [32.0, 31.0, 26.0, 18.0, 26.0]


In [145]:
#Root Mean Squared Error
from sklearn.metrics import mean_squared_error

mpg_predictions = lin_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_predictions) #these are the means squared error
lin_rmse = np.sqrt(lin_mse) #this finds the root meansquare error
lin_rmse

2.9590402225760872

In [146]:
#Decision Tree
from sklearn.tree import DecisionTreeRegressor

In [147]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(prepared_data, data_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [148]:
mpg_predictions = tree_reg.predict(prepared_data)
tree_mse = mean_squared_error(data_labels, mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse 

0.0

In [149]:
#The zero indicates the model overfits the data

In [150]:
#Model evaluation using cross-validation
from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    tree_reg,
    prepared_data,
    data_labels,
    scoring = "neg_mean_squared_error",
    cv = 10
)

tree_reg_rmse_scores = np.sqrt(-scores)

In [151]:
tree_reg_rmse_scores

array([3.12594986, 3.09798321, 3.42253561, 3.54533144, 2.7427746 ,
       3.0288199 , 3.05511866, 4.21084908, 4.07114155, 2.62930139])

In [152]:
tree_reg_rmse_scores.mean()

3.292980530207692

In [153]:
scores = cross_val_score( 
     lin_reg,
     prepared_data,
     data_labels,
     scoring = "neg_mean_squared_error",
     cv = 10
)

lin_reg_rmse_scores = np.sqrt(-scores)
lin_reg_rmse_scores

array([3.43254597, 3.45157629, 3.6621715 , 2.59652976, 2.48023405,
       2.74798115, 3.32524647, 2.42208917, 3.78133275, 2.8573747 ])

In [154]:
lin_reg_rmse_scores.mean()

3.075708179370933

In [155]:
#Random Forest Model
from sklearn.ensemble import RandomForestRegressor

In [156]:
forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores = cross_val_score( 
    forest_reg,
    prepared_data,
    data_labels,
    scoring = 'neg_mean_squared_error',
    cv = 10
)

forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)


In [157]:
forest_reg_rmse_scores.mean()

2.7250097614457736

In [None]:
#Support vector machine regressor