In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_regression
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from sklearn import metrics
# Default size (width, height) applied to every matplotlib figure in this notebook.
plt.rcParams.update({'figure.figsize': (15, 5)})

# Compatibility shim (per the original note, needed for viewing summary()):
# expose `chisqprob` on scipy.stats as the chi-square survival function.
from scipy import stats


def _chisqprob(chisq, df):
    """Return the upper-tail chi-square probability of `chisq` with `df` degrees of freedom."""
    return stats.chi2.sf(chisq, df)


stats.chisqprob = _chisqprob

In [2]:
# Load the auto dataset from a path relative to the notebook's working directory
# and preview the first rows.
auto_csv_path = 'Data/Auto model.csv'
raw_data = pd.read_csv(auto_csv_path)
raw_data.head()

Unnamed: 0,Year,Year70To81,MPG,GallonsPer100Miles,GallonsPer100MilesTo1981,Cylinders,Displacement100ci,Horsepower100,Weight1000lb,Seconds0to60,Origin,Origin.Eq.1,Origin.Eq.2,Origin.Eq.3,Name
0,70,1,18.0,5.6,5.6,8,3.07,1.3,3.504,12.0,1,1,0,0,chevrolet chevelle malibu
1,70,1,15.0,6.7,6.7,8,3.5,1.65,3.693,11.5,1,1,0,0,buick skylark 320
2,70,1,18.0,5.6,5.6,8,3.18,1.5,3.436,11.0,1,1,0,0,plymouth satellite
3,70,1,16.0,6.3,6.3,8,3.04,1.5,3.433,12.0,1,1,0,0,amc rebel sst
4,70,1,17.0,5.9,5.9,8,3.02,1.4,3.449,10.5,1,1,0,0,ford torino


In [3]:
# Candidate predictors to screen for multicollinearity.
X = raw_data[['Year','MPG','Cylinders','Displacement100ci','Horsepower100','Weight1000lb','Seconds0to60','Origin','Origin.Eq.1','Origin.Eq.2','Origin.Eq.3']]

# variance_inflation_factor regresses one column on all the other columns exactly
# as given; it does not add an intercept on its own. Build the same kind of design
# matrix the OLS fit uses (predictors plus a constant) so the VIFs are centered,
# then report only the real predictors (the `const` column itself is not listed).
vif_design = sm.add_constant(X)

# NOTE(review): 'Origin' together with all three 'Origin.Eq.*' columns is presumably
# linearly dependent (dummy encoding of the same variable), so infinite VIFs and a
# divide-by-zero RuntimeWarning are expected for those rows - confirm and drop the
# redundant columns before modelling.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(vif_design.values, vif_design.columns.get_loc(col))
        for col in X.columns
    ],
})

print(vif_data)

              feature        VIF
0                Year   2.066410
1                 MPG   5.688265
2           Cylinders  10.802928
3   Displacement100ci  23.525860
4       Horsepower100  10.003006
5        Weight1000lb  14.107926
6        Seconds0to60   2.630353
7              Origin        inf
8         Origin.Eq.1        inf
9         Origin.Eq.2        inf
10        Origin.Eq.3        inf


  vif = 1. / (1. - r_squared_i)


In [4]:
# Reduced predictor set: same screening as before, with 'Displacement100ci' left out.
X1 = raw_data[['Year','MPG','Cylinders','Horsepower100','Weight1000lb','Seconds0to60','Origin','Origin.Eq.1','Origin.Eq.2','Origin.Eq.3']]

# variance_inflation_factor regresses one column on all the other columns exactly
# as given; it does not add an intercept on its own. Include a constant in the
# design matrix (as the OLS fit does) so the VIFs are centered, and report only
# the real predictors (the `const` column itself is not listed).
vif_design_reduced = sm.add_constant(X1)

# NOTE(review): the 'Origin' / 'Origin.Eq.*' columns are presumably linearly
# dependent, so infinite VIFs are expected for them - confirm against the data.
vif_data = pd.DataFrame({
    "feature": X1.columns,
    "VIF": [
        variance_inflation_factor(
            vif_design_reduced.values, vif_design_reduced.columns.get_loc(col)
        )
        for col in X1.columns
    ],
})

print(vif_data)

         feature        VIF
0           Year   1.998087
1            MPG   5.546115
2      Cylinders   6.256659
3  Horsepower100   9.100488
4   Weight1000lb  11.531695
5   Seconds0to60   2.602127
6         Origin        inf
7    Origin.Eq.1        inf
8    Origin.Eq.2        inf
9    Origin.Eq.3        inf


  vif = 1. / (1. - r_squared_i)


In [5]:
# Predictor columns for the regression ('Displacement100ci' is not included).
# NOTE(review): 'MPG' is used as a predictor while the response looks like a direct
# transform of it (first row: MPG 18.0 vs GallonsPer100Miles 5.6) - confirm this is
# intended, since it would largely explain the very high R-squared.
feature_columns = [
    'Year', 'MPG', 'Cylinders', 'Horsepower100', 'Weight1000lb',
    'Seconds0to60', 'Origin', 'Origin.Eq.1', 'Origin.Eq.2', 'Origin.Eq.3',
]
x = raw_data[feature_columns]

# Response variable: the 'GallonsPer100Miles' column.
y = raw_data['GallonsPer100Miles']

In [6]:
# Fit ordinary least squares with an explicit intercept column.
# `x` is rebound here: from this cell on it carries the extra `const` column,
# which every later cell that reads `x` will see.
x = sm.add_constant(x)
ols_model = sm.OLS(endog=y, exog=x)
results = ols_model.fit()

# NOTE(review): the recorded summary reports Cond. No. 3.87e+17, which indicates a
# numerically singular design matrix - worth checking for redundant columns.
results.summary()

0,1,2,3
Dep. Variable:,GallonsPer100Miles,R-squared:,0.933
Model:,OLS,Adj. R-squared:,0.932
Method:,Least Squares,F-statistic:,671.7
Date:,"Tue, 03 Jan 2023",Prob (F-statistic):,3.97e-220
Time:,19:29:22,Log-Likelihood:,-225.25
No. Observations:,392,AIC:,468.5
Df Residuals:,383,BIC:,504.2
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.7679,0.234,11.830,0.000,2.308,3.228
Year,-0.0440,0.008,-5.212,0.000,-0.061,-0.027
MPG,-0.1110,0.007,-16.735,0.000,-0.124,-0.098
Cylinders,0.0825,0.032,2.557,0.011,0.019,0.146
Horsepower100,1.0165,0.172,5.897,0.000,0.678,1.355
Weight1000lb,0.3695,0.088,4.203,0.000,0.197,0.542
Seconds0to60,0.0432,0.013,3.356,0.001,0.018,0.068
Origin,1.8861,0.155,12.194,0.000,1.582,2.190
Origin.Eq.1,2.7669,0.241,11.503,0.000,2.294,3.240

0,1,2,3
Omnibus:,120.911,Durbin-Watson:,1.348
Prob(Omnibus):,0.0,Jarque-Bera (JB):,497.972
Skew:,1.297,Prob(JB):,7.36e-109
Kurtosis:,7.874,Cond. No.,3.87e+17


In [7]:
# Learn the per-column standardisation parameters from `x`.
# NOTE(review): the scaler is fitted on the full data set before the train/test
# split, so test rows influence the scaling; consider fitting on the training part only.
# NOTE(review): `x` still contains the `const` column added for the OLS fit; the
# recorded outputs show a 0.0 coefficient and a NaN F-statistic for it further down.
scaler = StandardScaler().fit(x)
scaler

StandardScaler()

In [8]:
# Apply the fitted scaler to every column of `x` (including `const`).
input_scaled = scaler.transform(x)

In [9]:
# Hold out 20% of the rows for testing; the shuffle is seeded so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    input_scaled,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=33,
)

In [10]:
# Linear regression estimator with default settings.
reg = LinearRegression()

In [11]:
# Fit on the training portion of the scaled inputs.
reg.fit(x_train, y_train)

LinearRegression()

In [12]:
# Summary table, one row per column of `x` (this includes `const`).
sum_tab = pd.DataFrame({'Features': x.columns.values})

In [13]:
# Attach the fitted coefficients; they line up with `x.columns` because the model
# was trained on the scaled version of `x` with the same column order.
sum_tab = sum_tab.assign(Coefficients=reg.coef_)
sum_tab

Unnamed: 0,Features,Coefficients
0,const,0.0
1,Year,-0.164623
2,MPG,-0.833412
3,Cylinders,0.180121
4,Horsepower100,0.417321
5,Weight1000lb,0.32092
6,Seconds0to60,0.150684
7,Origin,0.01299
8,Origin.Eq.1,-0.006384
9,Origin.Eq.2,-0.011275


In [14]:
# Intercept learned by the scikit-learn model (the scaled `const` column gets a 0.0 coefficient instead).
reg.intercept_

4.797999657342747

In [15]:
# R-squared on the training data only.
# NOTE(review): x_test / y_test are never scored in the cells visible here - consider
# reporting the held-out score as well.
reg.score(x_train,y_train)

0.9350411605852447

In [16]:
# Univariate F-test of each feature against the target; returns a pair of arrays
# (F-statistics, p-values), one entry per column of x_train.
# The first entry is NaN in the recorded output (with a "corr /= X_norms" warning);
# it corresponds to the `const` column, presumably because that column has no variance.
f_regression(x_train,y_train)

  corr /= X_norms


(array([          nan,  120.60976558, 2139.72442445,  824.98317311,
         829.67303525, 1167.091533  ,   77.12081132,  131.48629753,
         146.29892202,   26.52136865,   64.84542615]),
 array([            nan, 6.25326555e-024, 1.87683733e-141, 1.73006709e-089,
        9.10955220e-090, 2.76581928e-107, 1.09790408e-016, 1.26565524e-025,
        7.29292440e-028, 4.63422159e-007, 1.74042826e-014]))

In [17]:
# Run the univariate F-test once and reuse both returned arrays, instead of
# recomputing it for each column (which also emitted the runtime warning twice).
f_stats, p_vals = f_regression(x_train, y_train)
sum_tab['F-statistics'] = f_stats
# Rounded to 4 decimals for display; very small p-values therefore show as 0.0.
sum_tab['P-Values'] = p_vals.round(4)
sum_tab

  corr /= X_norms
  corr /= X_norms


Unnamed: 0,Features,Coefficients,F-statistics,P-Values
0,const,0.0,,
1,Year,-0.164623,120.609766,0.0
2,MPG,-0.833412,2139.724424,0.0
3,Cylinders,0.180121,824.983173,0.0
4,Horsepower100,0.417321,829.673035,0.0
5,Weight1000lb,0.32092,1167.091533,0.0
6,Seconds0to60,0.150684,77.120811,0.0
7,Origin,0.01299,131.486298,0.0
8,Origin.Eq.1,-0.006384,146.298922,0.0
9,Origin.Eq.2,-0.011275,26.521369,0.0


In [18]:
# Per-feature p-values from the univariate F-test, rounded to 3 decimals.
raw_pvalues = f_regression(x_train, y_train)[1]
pvalues = raw_pvalues.round(3)

  corr /= X_norms
