## Assumptions in Linear Regression

### 1. Normally distributed residuals
+ residuals should be normally distributed

### 2. Little to no Multi-collinearity
+ multiple regression assumes the independent variables are not highly correlated with each other
    + This assumption is tested using Variance Inflation Factor (VIF) values
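    + One way to reduce multicollinearity is to center the variables (subtract the mean); this mainly helps with multicollinearity introduced by interaction or polynomial terms.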
    
### 3. Homoscedasticity
+ the variance of the error terms is similar across the values of the independent variables


### Dummy variable trap
+ this occurs when one-hot encoding produces redundant information (one dummy column can be predicted exactly from the others)
#### the dummy variable trap can be avoided by dropping one feature from every set of dummy variables

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Create a synthetic regression dataset: 100 samples, 20 features.
# With the default effective_rank=None, make_regression generates a
# well-conditioned feature matrix, which serves as the baseline here.
# random_state is fixed so the data (and every score computed from it) is
# identical on each Restart & Run All; without it a new random dataset is
# drawn on every run.
X, y = make_regression(n_samples=100, n_features=20, noise=0.95, random_state=42)

# Wrap the feature matrix in a DataFrame purely for a readable preview.
df = pd.DataFrame(X)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.853071,0.720807,0.826154,0.533065,-0.638515,-1.157354,2.254914,0.967134,-0.774395,0.377532,-0.120622,0.473265,1.365419,-0.126193,0.294963,-0.595122,0.720997,0.903397,-0.439863,0.409086
1,1.410129,1.041244,0.091753,0.805112,0.4471,-0.538493,0.110147,0.4589,-0.046058,0.365092,0.34842,1.479309,-0.41908,-1.283318,0.022853,-0.450748,-1.064205,-0.924653,0.745271,-0.267975
2,-0.740166,0.235217,1.395934,-0.92934,-1.116039,0.720455,0.750695,0.022273,1.864272,0.479582,0.763163,-0.892739,0.245497,-0.776675,0.386774,-1.469187,0.527167,-1.043011,0.588204,-0.278618
3,0.435785,-1.352781,-0.633021,0.595461,0.453337,0.001892,0.712503,-1.071495,-0.191208,1.08715,-0.181555,1.662098,0.314844,0.585712,0.989343,0.831048,0.279382,0.516801,0.210804,-0.393984
4,-0.317913,-1.412985,1.084717,-0.261119,0.722704,0.915681,0.246882,-0.417033,-0.635791,-0.857519,-0.490107,-0.018462,0.286349,1.857462,0.07953,-0.286662,-0.697396,0.564845,0.247666,0.045705


## Results on a dataset with no multicollinearity


In [2]:
# Fit and score the regressor with 10-fold cross-validation:
# cross_val_score returns one score per fold (R^2 for LinearRegression).
cv = cross_val_score(estimator=LinearRegression(), X=X, y=y, cv=10)
print("Mean: {}".format(np.mean(cv)))
print("Values: {}".format(cv))

Mean: 0.9999427579418008
Values: [0.99987127 0.99994648 0.99994584 0.99996578 0.99996048 0.99997269
 0.99993631 0.99992213 0.99994144 0.99996517]


In [3]:
# Create the dataset again, this time with highly collinear features:
# effective_rank=1 makes the feature matrix approximately low-rank, so the
# columns are strongly correlated with each other.
# random_state is fixed so the same data is generated on every run.
# NOTE: this rebinds X, y and df, replacing the dataset from the earlier cell.
X, y = make_regression(
    n_samples=100,
    n_features=20,
    noise=0.95,
    effective_rank=1,
    random_state=42,
)
df = pd.DataFrame(X)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.024678,0.003725,-0.004385,0.005077,-0.001021,0.012764,-0.029182,-0.001212,0.01268,-0.017124,0.045635,-0.049328,0.002428,-0.01992,0.015971,-0.01865,-0.036631,-0.000878,0.013993,0.001105
1,0.015394,-0.04886,0.040701,0.050707,0.051627,0.027217,0.002266,-0.000831,-0.006741,-0.00792,0.075074,-0.009928,-0.006201,0.02796,-0.012349,0.07476,-0.056316,0.015182,-0.006946,0.018606
2,-0.028301,-0.006186,-0.002087,0.02542,0.009137,-0.035355,-0.061808,-0.020314,0.037305,-0.035198,-0.000306,-0.00043,0.024393,0.001994,-0.022815,-0.092783,-0.008826,0.005324,-0.030679,0.024349
3,-0.025618,0.00851,-0.048866,0.012913,-0.029878,-0.059766,0.050001,0.023067,-0.027618,0.001469,-0.045392,0.016945,-0.042711,-0.0171,0.034405,0.020176,-0.016785,-0.045649,-0.020356,-0.035665
4,0.011367,-0.03795,0.022344,0.015386,-0.000734,0.015546,0.046502,0.049933,-0.02132,0.022309,0.020697,-0.023467,-0.044146,-0.01278,-0.019484,0.049951,0.024478,0.007754,-0.027891,-0.02919


## Results on dataset with high multi-collinearity

In [4]:
# 10-fold cross-validation of LinearRegression on the X, y currently bound
# (the collinear dataset); one score per fold is returned.
cv = cross_val_score(estimator=LinearRegression(), X=X, y=y, cv=10)
print("Mean: {}".format(np.mean(cv)))
print("Values: {}".format(cv))

Mean: 0.9559077823126835
Values: [0.95700133 0.91871759 0.98837935 0.97550788 0.99101634 0.98642816
 0.92807295 0.98231432 0.84386175 0.98777814]
