# Checking Assumptions of Linear Regression

## Importing Relevant Libraries!

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sy
from scipy import stats  
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

## __Importing dataset and creating initial regression plot__

In [None]:
# Read WatershedSalt.csv into a DataFrame; the bare name on the last line
# makes the notebook render the table.
Salt = pd.read_csv(filepath_or_buffer="WatershedSalt.csv")
Salt

In [None]:
# Scatter of Salt_Conc against Roadway_Area with seaborn's fitted line;
# ci=None switches off the confidence band around the line.
plotSalt = sns.regplot(
    data=Salt,
    x="Roadway_Area",
    y="Salt_Conc",
    ci=None,
)

## __Calculating correlation and covariance__

In [None]:
# Pearson correlation between the two columns; the displayed result holds
# the correlation coefficient and its p-value.
stats.pearsonr(Salt["Salt_Conc"], Salt["Roadway_Area"])

In [None]:
# Pairwise covariance matrix of the numeric columns.
# Numeric dtypes are selected explicitly because pandas >= 2.0 no longer
# drops non-numeric columns silently and would raise if the CSV contains any
# (e.g. an ID/name column -- not visible from here, so this is a safeguard).
Salt.select_dtypes(include="number").cov()

In [None]:
# Pairwise (Pearson, the pandas default) correlation matrix of the numeric
# columns. Numeric dtypes are selected explicitly because pandas >= 2.0 no
# longer drops non-numeric columns silently and would raise if the CSV
# contains any (not visible from here, so this is a safeguard).
Salt.select_dtypes(include="number").corr()

## __Computing Regression & ANOVA Table__

We want to compute the regression model and then we can use the ANOVA table to evaluate statistical significance AND to estimate errors.

`axvline` adds a vertical line at a given x-coordinate, and `axhline` adds a horizontal line at a given y-coordinate.

In [None]:
# Simple linear regression specified with a formula:
# Salt_Conc is the response, Roadway_Area the single predictor.
model1 = ols(formula='Salt_Conc~Roadway_Area', data=Salt)

# Fit the model, then print the text summary table of the fit.
results1 = model1.fit()
print(results1.summary2())

In [None]:
# Plotting regression line
# The annotation text and the dotted reference lines are derived from the
# fitted model and the data instead of being typed in by hand, so they stay
# correct if the CSV or the model changes.
intercept = results1.params["Intercept"]
slope = results1.params["Roadway_Area"]
slope_sign = "+" if slope >= 0 else "-"

plotSalt = sns.regplot(x="Roadway_Area", y="Salt_Conc", data=Salt, ci=None)
plotSalt.annotate(
    text=f"y fitted = {intercept:.2f} {slope_sign} {abs(slope):.2f}x",
    xy=(.2, 30),
    xycoords="data",
)

# Dotted red lines at the sample means of x and y; an OLS line with an
# intercept always passes through that point.
# NOTE(review): the previous hard-coded 0.824 / 17.135 lie on the fitted
# line and are presumably these means -- confirm against the data.
plotSalt.axvline(x=Salt["Roadway_Area"].mean(), c="r", linestyle=":")
plotSalt.axhline(y=Salt["Salt_Conc"].mean(), c="r", linestyle=":")

In [None]:
# ANOVA table for the fitted regression model.
# anova_lm takes the fitted results object (results1) and returns the table,
# which is kept in anova_results and printed below.
anova_results = anova_lm(results1)
print(anova_results)

## __Examining residuals of model__
fittedvalues gives us the fitted values from the regression model. resid gives us the residuals from the regression model.

In [None]:
# Display the fitted values of the regression model, one per observation used
# in the fit.
results1.fittedvalues

In [None]:
# Residuals against fitted values. The axes are labelled explicitly so the
# plot says what each axis shows.
sns.scatterplot(x=results1.fittedvalues, y=results1.resid)
plt.xlabel("fitted values")
plt.ylabel("residuals")

In [None]:
# Let seaborn regress Salt_Conc on Roadway_Area itself and draw the
# resulting residuals against the predictor.
sns.residplot(
    data=Salt,
    x="Roadway_Area",
    y="Salt_Conc",
)

In [None]:
# For SLR the fitted values are a linear function of the single predictor,
# so the residual-vs-fitted plot shows the same pattern as this
# residual-vs-predictor plot.
# The x-axis here is the predictor (Roadway_Area), so it is labelled as such
# rather than as "fitted values".
sns.residplot(x=Salt["Roadway_Area"], y=results1.resid)
plt.xlabel("Roadway_Area")
plt.ylabel("residuals")

## __Generating all residual plots__

graphics.plot_regress_exog generates four plots that you can use to assess your model. For this course, the top two will be the focus but the other two can be used as well.

Residual plots should not depart from 0 in a systematic fashion.
Top left is comparing the independent vs. dependent variables - it should show a linear trend.
The top-right chart shows the residuals against the independent variable. The residuals should scatter around zero with roughly constant spread; they should NOT fan out, narrow, or curve as the independent variable changes.

The bottom-left chart shows the partial regression plot, and the bottom-right chart shows the component and component-plus-residual (CCPR) plot. For the assumptions of linear regression to be met, the data points should lie close to the line.

In [None]:
# Create a 12x8 inch figure and hand it to statsmodels, which draws the
# regression diagnostic panels for Roadway_Area onto it; `fig` ends up bound
# to the figure returned by plot_regress_exog.
fig = sm.graphics.plot_regress_exog(
    results1,
    "Roadway_Area",
    fig=plt.figure(figsize=(12, 8)),
)

## __Examining Normality of Residuals__
To check the normality assumption for the residuals, you can use a standard Q-Q plot and the Shapiro-Wilk test.

In [None]:
# Q-Q plot of the model residuals; line="s" adds the standardized
# reference line to compare the points against.
sm.qqplot(data=results1.resid, line="s")
plt.show()

In [None]:
# Shapiro-Wilk normality test on the model residuals; the displayed result
# holds the test statistic and its p-value.
stats.shapiro(x=results1.resid)