In [3]:
import pandas as pd 

In [5]:
automobile = pd.read_csv('datasets/cars_processed.csv')

automobile.head(5)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Origin,Age
0,18.0,8,307.0,130,3504,12.0,US,49
1,16.0,8,304.0,150,3433,12.0,US,49
2,17.0,8,302.0,140,3449,10.5,US,49
3,14.0,8,454.0,220,4354,9.0,US,49
4,23.551429,8,440.0,215,4312,8.5,US,49


In [6]:
automobile.describe()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Age
count,387.0,387.0,387.0,387.0,387.0,387.0,387.0
mean,23.672514,5.410853,192.184755,103.645995,2965.387597,15.573643,42.917313
std,7.736579,1.667795,103.703706,38.128651,846.332848,2.74626,3.668715
min,9.0,3.0,68.0,46.0,1613.0,8.0,37.0
25%,17.6,4.0,102.5,75.0,2221.5,13.9,40.0
50%,23.2,4.0,146.0,92.0,2790.0,15.5,43.0
75%,29.0,6.0,260.0,121.0,3589.5,17.05,46.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,49.0


In [7]:
from sklearn import preprocessing

# Standarize input features
automobile[['Cylinders']] = preprocessing.scale(automobile[['Cylinders']].astype('float64'))
automobile[['Displacement']] = preprocessing.scale(automobile[['Displacement']].astype('float64'))
automobile[['Horsepower']] = preprocessing.scale(automobile[['Horsepower']].astype('float64'))
automobile[['Weight']] = preprocessing.scale(automobile[['Weight']].astype('float64'))
automobile[['Acceleration']] = preprocessing.scale(automobile[['Acceleration']].astype('float64'))
automobile[['Age']] = preprocessing.scale(automobile[['Age']].astype('float64'))

In [8]:
automobile.describe()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Age
count,387.0,387.0,387.0,387.0,387.0,387.0,387.0
mean,23.672514,3.672055e-17,1.101617e-16,1.101617e-16,1.101617e-16,2.203233e-16,8.078522e-16
std,7.736579,1.001294,1.001294,1.001294,1.001294,1.001294,1.001294
min,9.0,-1.447404,-1.199046,-1.513838,-1.600007,-2.761372,-1.615
25%,17.6,-0.847034,-0.8659368,-0.752271,-0.8800918,-0.6102152,-0.796216
50%,23.2,-0.847034,-0.4459295,-0.3058349,-0.2075007,-0.0268506,0.02256768
75%,29.0,0.3537065,0.6547792,0.4557326,0.738386,0.5382839,0.8413513
max,46.6,1.554447,2.53757,3.318176,2.572779,3.363956,1.660135


In [9]:
automobile.shape

(387, 8)

In [10]:
from sklearn.model_selection import train_test_split

X = automobile.drop(['MPG', 'Origin'], axis=1)
Y = automobile['MPG']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [11]:
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression(normalize=True).fit(x_train, y_train)

In [12]:
print("Training Score: ", linear_model.score(x_train, y_train))

Training Score:  0.7787976798539803


In [13]:
y_pred = linear_model.predict(x_test)

In [14]:
from sklearn.metrics import r2_score

print("Training Score: ", r2_score(y_test, y_pred))

Training Score:  0.8129028088720152


In [15]:
# For multiple predictor or multiple feature in a regression model 
# A mesaure to calculate how well the model performs is the adjustable r square
# Calculated base on the r2 score
### Is a goodness of fit measure ###
def adjusted_r2(r_square, labels, features):
    adj_r_square = 1 - ((1 - r_square) * (len(labels) - 1)) / (len(labels) - features.shape[1] - 1) 
    return adj_r_square

In [16]:
print("Adjusted_r2_score :" , adjusted_r2(r2_score(y_test, y_pred), y_test, x_test))

Adjusted_r2_score : 0.7970917786358475


In [17]:
features_corr = X.corr()

features_corr

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Age
Cylinders,1.0,0.922633,0.811466,0.873029,-0.458161,0.32185
Displacement,0.922633,1.0,0.894199,0.932822,-0.526901,0.357047
Horsepower,0.811466,0.894199,1.0,0.863388,-0.67092,0.404458
Weight,0.873029,0.932822,0.863388,1.0,-0.397181,0.299049
Acceleration,-0.458161,-0.526901,-0.67092,-0.397181,1.0,-0.292705
Age,0.32185,0.357047,0.404458,0.299049,-0.292705,1.0


In [18]:
abs(features_corr) > 0.8 

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Age
Cylinders,True,True,True,True,False,False
Displacement,True,True,True,True,False,False
Horsepower,True,True,True,True,False,False
Weight,True,True,True,True,False,False
Acceleration,False,False,False,False,True,False
Age,False,False,False,False,False,True


In [19]:
trimmed_features_df = X.drop(['Cylinders', 'Displacement', 'Weight'], axis=1)

In [20]:
trimmed_features_corr = trimmed_features_df.corr()

trimmed_features_corr

Unnamed: 0,Horsepower,Acceleration,Age
Horsepower,1.0,-0.67092,0.404458
Acceleration,-0.67092,1.0,-0.292705
Age,0.404458,-0.292705,1.0


In [21]:
abs(trimmed_features_corr) > 0.8 

Unnamed: 0,Horsepower,Acceleration,Age
Horsepower,True,False,False
Acceleration,False,True,False
Age,False,False,True


In [22]:
# Select feature that are not collinear

# Variance_inflation_factor calculates the severity of multicollinearity in an ordinary least squares regression analysis.

In [23]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [24]:
vif = pd.DataFrame()

vif['VIF Factor'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

In [25]:
vif['features'] = X.columns

In [26]:
vif.round(2)
1# 1: Not correlated, 1-5: modertely correlated, >5 highly correlated

Unnamed: 0,VIF Factor,features
0,6.84,Cylinders
1,16.1,Displacement
2,8.82,Horsepower
3,10.69,Weight
4,2.49,Acceleration
5,1.22,Age


In [27]:
X = X.drop(['Displacement', 'Weight'], axis=1)

In [29]:
vif = pd.DataFrame()

vif['VIF Factor'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

In [30]:
vif['features'] = X.columns

In [31]:
vif.round(2)

Unnamed: 0,VIF Factor,features
0,3.05,Cylinders
1,4.56,Horsepower
2,1.9,Acceleration
3,1.2,Age


In [32]:
X = automobile.drop(['MPG', 'Displacement', 'Weight', 'Origin'], axis=1)
Y = automobile['MPG']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [33]:
linear_model = LinearRegression(normalize=True).fit(x_train, y_train)

In [34]:
print("Training Score: ", linear_model.score(x_train, y_train))

Training Score:  0.7245601599810705


In [35]:
y_pred = linear_model.predict(x_test)

In [36]:
print("Training Score: ", r2_score(y_test, y_pred))

Training Score:  0.7150554810668306


In [39]:
print("Adjusted_r2_score :" , adjusted_r2(r2_score(y_test, y_pred), y_test, x_test))

Adjusted_r2_score : 0.6994420827691227
