### Multicollinearity In Linear Regression

In [1]:
import pandas as pd
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

In [2]:
df= pd.read_csv('Advertising.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,TV,radio,newspaper,sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [3]:
x = df[['TV', 'radio','newspaper']]
y = df['sales']
print(x.shape)
print(y.shape)

(200, 3)
(200,)


In [4]:
X=sm.add_constant(x)
print(x)

        TV  radio  newspaper
0    230.1   37.8       69.2
1     44.5   39.3       45.1
2     17.2   45.9       69.3
3    151.5   41.3       58.5
4    180.8   10.8       58.4
..     ...    ...        ...
195   38.2    3.7       13.8
196   94.2    4.9        8.1
197  177.0    9.3        6.4
198  283.6   42.0       66.2
199  232.1    8.6        8.7

[200 rows x 3 columns]


In [5]:
## fit a OLS(ordinary lease squared) model with intercept on TV and Radio

model = sm.OLS(y,X).fit()

In [6]:
model.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,570.3
Date:,"Fri, 22 Apr 2022",Prob (F-statistic):,1.58e-96
Time:,11:29:11,Log-Likelihood:,-386.18
No. Observations:,200,AIC:,780.4
Df Residuals:,196,BIC:,793.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.9389,0.312,9.422,0.000,2.324,3.554
TV,0.0458,0.001,32.809,0.000,0.043,0.049
radio,0.1885,0.009,21.893,0.000,0.172,0.206
newspaper,-0.0010,0.006,-0.177,0.860,-0.013,0.011

0,1,2,3
Omnibus:,60.414,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.241
Skew:,-1.327,Prob(JB):,1.44e-33
Kurtosis:,6.332,Cond. No.,454.0


In [7]:
import matplotlib.pyplot as plt
x.corr()

Unnamed: 0,TV,radio,newspaper
TV,1.0,0.054809,0.056648
radio,0.054809,1.0,0.354104
newspaper,0.056648,0.354104,1.0


**since that all after checking correlation with features columns every values are less than 90% so we can say that there are not multicolliniarity present here**

In [8]:
import pandas as pd
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

In [9]:
df_salary=pd.read_csv('salary.csv')
df_salary.head()

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
0,Prof,B,19,18,Male,139750
1,Prof,B,20,16,Male,173200
2,AsstProf,B,4,3,Male,79750
3,Prof,B,45,39,Male,115000
4,Prof,B,40,41,Male,141500


In [10]:
x=df_salary[['yrs.since.phd','yrs.service']]
y=df_salary['salary']
print(x.shape)
print(y.shape)

(397, 2)
(397,)


In [14]:
cat_column=[]
for i in df_salary.columns:
    if df_salary[i].dtype == 'object':
        cat_column.append(i)
print(cat_column)

['rank', 'discipline', 'sex']


In [16]:
X= sm.add_constant(x)
print(x)

     yrs.since.phd  yrs.service
0               19           18
1               20           16
2                4            3
3               45           39
4               40           41
..             ...          ...
392             33           30
393             31           19
394             42           25
395             25           15
396              8            4

[397 rows x 2 columns]


In [17]:
model = sm.OLS(y,X).fit()

In [18]:
model.summary()

0,1,2,3
Dep. Variable:,salary,R-squared:,0.188
Model:,OLS,Adj. R-squared:,0.184
Method:,Least Squares,F-statistic:,45.71
Date:,"Fri, 22 Apr 2022",Prob (F-statistic):,1.4e-18
Time:,11:33:19,Log-Likelihood:,-4617.9
No. Observations:,397,AIC:,9242.0
Df Residuals:,394,BIC:,9254.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,8.991e+04,2843.560,31.620,0.000,8.43e+04,9.55e+04
yrs.since.phd,1562.8889,256.820,6.086,0.000,1057.981,2067.797
yrs.service,-629.1014,254.469,-2.472,0.014,-1129.389,-128.814

0,1,2,3
Omnibus:,14.927,Durbin-Watson:,1.867
Prob(Omnibus):,0.001,Jarque-Bera (JB):,15.947
Skew:,0.429,Prob(JB):,0.000344
Kurtosis:,3.478,Cond. No.,69.6


In [19]:
import seaborn as sns
x.corr()

Unnamed: 0,yrs.since.phd,yrs.service
yrs.since.phd,1.0,0.909649
yrs.service,0.909649,1.0


**see here the correlation between yrs.since.phd and yrs.service are above 90 percent .so we can drop one columns.because one feature information is being captured by another feature. droping one feature may be a good option.**