# Multicollinearity in linear regression

##### EXAMPLE-1 --> WHERE NO MULTICOLLINEARITY IS PRESENT IN THE DATASET

In [1]:
import pandas as pd
import statsmodels.api as sm

In [7]:
## Load the advertising data, using the CSV's first column as the row index.
## NOTE(review): this is an absolute, machine-specific path; the cell only runs where that file exists.
path = '/home/ashish/projects/Multicollinearity_LinearRegression/Advertising.csv'
dataset = pd.read_csv(filepath_or_buffer=path, index_col=0)
dataset.head(n=5)

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [9]:
## Split the frame: every column except the last is an independent feature,
## and the last column is the dependent variable.
x, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]
print(x)
print(y)

        TV  radio  newspaper
1    230.1   37.8       69.2
2     44.5   39.3       45.1
3     17.2   45.9       69.3
4    151.5   41.3       58.5
5    180.8   10.8       58.4
..     ...    ...        ...
196   38.2    3.7       13.8
197   94.2    4.9        8.1
198  177.0    9.3        6.4
199  283.6   42.0       66.2
200  232.1    8.6        8.7

[200 rows x 3 columns]
1      22.1
2      10.4
3       9.3
4      18.5
5      12.9
       ... 
196     7.6
197     9.7
198    12.8
199    25.5
200    13.4
Name: sales, Length: 200, dtype: float64


In [11]:
## add_constant prepends a column of ones named 'const' to the features,
## so that the fitted model includes an intercept (B0) term.
x = sm.add_constant(x, prepend=True)
x

Unnamed: 0,const,TV,radio,newspaper
1,1.0,230.1,37.8,69.2
2,1.0,44.5,39.3,45.1
3,1.0,17.2,45.9,69.3
4,1.0,151.5,41.3,58.5
5,1.0,180.8,10.8,58.4
...,...,...,...,...
196,1.0,38.2,3.7,13.8
197,1.0,94.2,4.9,8.1
198,1.0,177.0,9.3,6.4
199,1.0,283.6,42.0,66.2


In [13]:
## Fit an ordinary least squares (OLS) regression of y on x and show the fit summary

model = sm.OLS(endog=y, exog=x).fit()
model.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,570.3
Date:,"Wed, 13 Jul 2022",Prob (F-statistic):,1.58e-96
Time:,12:57:08,Log-Likelihood:,-386.18
No. Observations:,200,AIC:,780.4
Df Residuals:,196,BIC:,793.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.9389,0.312,9.422,0.000,2.324,3.554
TV,0.0458,0.001,32.809,0.000,0.043,0.049
radio,0.1885,0.009,21.893,0.000,0.172,0.206
newspaper,-0.0010,0.006,-0.177,0.860,-0.013,0.011

0,1,2,3
Omnibus:,60.414,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.241
Skew:,-1.327,Prob(JB):,1.44e-33
Kurtosis:,6.332,Cond. No.,454.0


In [14]:
## Correlation matrix of the features (the first column, the constant, is skipped)
import matplotlib.pyplot as plt

x.iloc[:, 1:].corr(method='pearson')

Unnamed: 0,TV,radio,newspaper
TV,1.0,0.054809,0.056648
radio,0.054809,1.0,0.354104
newspaper,0.056648,0.354104,1.0


## In the correlation matrix above, it can be observed that none of the features are highly correlated with each other. ===> MULTICOLLINEARITY ABSENT

    1. TV and radio ==> correlation coefficient ≈ 0.055 (about 5.5%).
    2. TV and newspaper ==> correlation coefficient ≈ 0.057 (about 5.7%).
    3. newspaper and radio ==> correlation coefficient ≈ 0.354 (about 35.4%).

#### Example-2 --> WHERE MULTICOLLINEARITY CAN BE SEEN IN THE DATASET

In [15]:
import pandas as pd
import statsmodels.api as sm

In [16]:
## Load the salary data (no index column is given, so pandas assigns the default integer index).
## NOTE(review): this is an absolute, machine-specific path; the cell only runs where that file exists.
path = '/home/ashish/projects/Multicollinearity_LinearRegression/Salary_Data.csv'
dataset2 = pd.read_csv(filepath_or_buffer=path)
dataset2.head(n=5)

Unnamed: 0,YearsExperience,Age,Salary
0,1.1,21.0,39343
1,1.3,21.5,46205
2,1.5,21.7,37731
3,2.0,22.0,43525
4,2.2,22.2,39891


In [17]:
## Split into independent features (all columns but the last) and the dependent variable (last column)
x, y = dataset2.iloc[:, :-1], dataset2.iloc[:, -1]

## APPROACH-2: the same selection written with explicit column names
## x = dataset2[['YearsExperience','Age']]
## y = dataset2['Salary']

In [19]:
## Preview the first five rows of the feature columns
x.head(n=5)

Unnamed: 0,YearsExperience,Age
0,1.1,21.0
1,1.3,21.5
2,1.5,21.7
3,2.0,22.0
4,2.2,22.2


In [20]:
## Preview the first five values of the target
y.head(n=5)

0    39343
1    46205
2    37731
3    43525
4    39891
Name: Salary, dtype: int64

In [21]:
## Prepend the constant column (for the intercept), then fit ordinary least squares (OLS)
x = sm.add_constant(x, prepend=True)
model = sm.OLS(endog=y, exog=x).fit()
model.summary()

0,1,2,3
Dep. Variable:,Salary,R-squared:,0.96
Model:,OLS,Adj. R-squared:,0.957
Method:,Least Squares,F-statistic:,323.9
Date:,"Wed, 13 Jul 2022",Prob (F-statistic):,1.35e-19
Time:,13:14:54,Log-Likelihood:,-300.35
No. Observations:,30,AIC:,606.7
Df Residuals:,27,BIC:,610.9
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-6661.9872,2.28e+04,-0.292,0.773,-5.35e+04,4.02e+04
YearsExperience,6153.3533,2337.092,2.633,0.014,1358.037,1.09e+04
Age,1836.0136,1285.034,1.429,0.165,-800.659,4472.686

0,1,2,3
Omnibus:,2.695,Durbin-Watson:,1.711
Prob(Omnibus):,0.26,Jarque-Bera (JB):,1.975
Skew:,0.456,Prob(JB):,0.372
Kurtosis:,2.135,Cond. No.,626.0


## HERE
    1. HIGH STANDARD ERRORS INDICATE THAT THERE MAY BE MULTICOLLINEARITY PRESENT AMONG SOME FEATURES.
    2. INSIGNIFICANT p-values (here Age, with p = 0.165) indicate which feature to consider dropping in order to remove the multicollinearity.

In [25]:
## Correlation matrix of the features (the first column, the constant, is skipped)

import matplotlib.pyplot as plt

x.iloc[:, 1:].corr(method='pearson')

Unnamed: 0,YearsExperience,Age
YearsExperience,1.0,0.987258
Age,0.987258,1.0


## SO
    We can see that Age and YearsExperience have a correlation coefficient of 0.987 (about 98.7%), which confirms the presence of multicollinearity.