# Check 5 Regression Assumptions

In [8]:
#Assumption 1 - linear relationship

def assumption1LinearRelationship(df, label):
  """List the numeric predictors that are only weakly correlated with the label.

  Every numeric column other than ``label`` that contains no missing values is
  passed to ``numericToNumericStats``; element ``[1][0]`` of its result is
  treated as the correlation coefficient r (presumably Pearson's r -- confirm
  against that helper). For each column with ``|r| < 0.5`` a scatter plot is
  drawn through ``createScatterPlot`` and r, rounded to two decimals, is kept.

  Args:
    df: DataFrame holding the predictors and the label column.
    label: Name of the label column.

  Returns:
    A DataFrame indexed by column name with a single float ``'r-value'``
    column, sorted from the highest to the lowest r. It is empty when no
    column qualifies.
  """
  import pandas as pd

  weak = {}
  for col in df:
    if col == label or not pd.api.types.is_numeric_dtype(df[col]):
      continue
    # Columns with missing values are skipped rather than imputed.
    if df[col].isnull().any():
      continue
    r = numericToNumericStats(df, col, label)[1][0]
    if abs(r) < 0.5:
      # Called for its plotting side effect only; the return value is unused.
      createScatterPlot(df, col, label)
      weak[col] = round(r, 2)

  # Build the frame once with an explicit float dtype instead of growing an
  # object-dtype frame one row at a time.
  new_df = pd.DataFrame({'r-value': pd.Series(weak, dtype='float64')})
  return new_df.sort_values(by=['r-value'], ascending=False)

In [9]:
#Assumption 2 - Multicollinearity

def assumption2Multicollinearity(df, label):
  """Compute the variance inflation factor (VIF) of each numeric predictor.

  Each numeric column other than ``label`` is regressed on the remaining
  numeric columns and its VIF is ``1 / (1 - R^2)`` of that regression.

  Args:
    df: DataFrame holding the predictors and the label column.
    label: Name of the label column; it must be one of the numeric columns
      (dropping it raises ``KeyError`` otherwise).

  Returns:
    A DataFrame indexed by column name with a single ``'VIF'`` column (rounded
    to four decimals), sorted from the highest to the lowest VIF. A lone
    predictor gets a VIF of 1.0; a predictor that is perfectly explained by
    the others gets ``inf``.
  """
  import pandas as pd

  lst = ['int16','int32','int64','float16','float32','float64']
  ndf = df.select_dtypes(include=lst).drop([label], axis=1)

  vif_dict = {}
  if ndf.shape[1] == 1:
    # A single predictor has nothing to be collinear with; fitting a
    # regression on zero features would raise instead.
    vif_dict[ndf.columns[0]] = 1.0
  elif ndf.shape[1] > 1:
    # Imported here because it is only needed when there is something to fit.
    from sklearn.linear_model import LinearRegression

    for col in ndf:
      y = ndf[col]
      x = ndf.drop(columns=[col])
      r2 = LinearRegression().fit(x, y).score(x, y)
      # R^2 == 1 means perfect collinearity: report an infinite VIF rather
      # than dividing by zero.
      vif = float('inf') if r2 >= 1 else 1 / (1 - r2)
      vif_dict[col] = round(vif, 4)

  fin = pd.DataFrame({'VIF': vif_dict})
  return fin.sort_values(by='VIF', ascending=False)


In [10]:
#Assumption 3 - Independence
def assumption3Independence(df, label):
  """Check residual independence with the Durbin-Watson statistic.

  The model is fitted by ``mlr(df, label)`` (defined elsewhere; its result is
  expected to expose ``.resid``). The assumption counts as met when the
  statistic lies in the closed interval [1.5, 2.5].

  Args:
    df: DataFrame holding the predictors and the label column.
    label: Name of the label column.

  Returns:
    A two-line string: the verdict, then the statistic rounded to three
    decimals.
  """
  from statsmodels.stats.stattools import durbin_watson

  results = mlr(df, label)
  dw = durbin_watson(results.resid)
  # Written as an in-range test so that a NaN statistic is reported as
  # "NOT met" instead of slipping through both out-of-range comparisons.
  if 1.5 <= dw <= 2.5:
    text = 'The Assumption IS met'
  else:
    text = 'The Assumption is NOT met'
  return text + f'\nDurbin Watson: {round(dw,3)}'

In [11]:
#Assumption 4 - Homoscedasticity

def assumption4Homoscedasticity(df, label):
  """Check homoscedasticity of the residuals with the Breusch-Pagan test.

  The model is fitted by ``mlr(df, label)`` (defined elsewhere; its result is
  expected to expose ``.resid`` and ``.model.exog``). The verdict is printed,
  not returned: the assumption counts as NOT met when the Lagrange-multiplier
  p-value is below 0.05.

  Args:
    df: DataFrame holding the predictors and the label column.
    label: Name of the label column.

  Returns:
    A one-row DataFrame (index ``'Breusch-Pagan Values'``) holding the
    Lagrange multiplier statistic and its p-value, both rounded to four
    decimals.
  """
  import pandas as pd
  import statsmodels.stats.api as sms

  model = mlr(df, label)
  bp_data = sms.het_breuschpagan(model.resid, model.model.exog)
  lm_stat, lm_pvalue = bp_data[0], bp_data[1]

  bp_df = pd.DataFrame(
    {
      'Lagrange multiplier statistic': round(lm_stat, 4),
      'p-value': round(lm_pvalue, 4),
    },
    index=['Breusch-Pagan Values'],
  )

  # Decide on the unrounded p-value: rounding first would turn e.g. 0.04996
  # into 0.05 and flip the verdict.
  if lm_pvalue < .05:
    text = "The Homoscedasticity Assumption is NOT met"
  else:
    text = "The Homoscedasticity Assumption IS met"
  print(text)
  return bp_df

In [12]:
#Assumption 5 - Multivariate Normality

def assumption5MultivariateNormality(df, label):
  """Check normality of the residuals with a Jarque-Bera test and a Q-Q plot.

  The model is fitted by ``mlr(df, label)`` (defined elsewhere; its result is
  expected to expose ``.resid``). The Jarque-Bera statistic and p-value are
  printed for information only. The verdict comes from the probability plot:
  the residuals are plotted on a new matplotlib axes and the assumption counts
  as met when the R^2 of the fitted line, rounded to one decimal, is at least
  0.9. The R^2 (four decimals) is printed as well.

  Args:
    df: DataFrame holding the predictors and the label column.
    label: Name of the label column.

  Returns:
    The verdict as a string.
  """
  import pandas as pd
  import matplotlib.pyplot as plt
  from scipy import stats
  from statsmodels.stats.stattools import jarque_bera

  results = mlr(df, label)

  # Skewness and kurtosis are returned too but are not reported.
  jb, p, _skew, _kurt = jarque_bera(results.resid)
  jb_df = pd.DataFrame(
    {'Jarque-Bera test statistic': round(jb, 4), 'p-value': p},
    index=['Jarque-Bera Values'],
  )
  print(jb_df)

  _, ax = plt.subplots()
  _, (_, _, r) = stats.probplot(results.resid, plot=ax, fit=True)
  r_squared = r**2
  # NOTE(review): rounding to one decimal before comparing makes the effective
  # cut-off roughly 0.85 rather than 0.9 -- kept as is; confirm it is intended.
  if round(r_squared, 1) >= 0.9:
    text = "The Multivariate Normality Assumption IS met"
  else:
    text = "The Multivariate Normality Assumption is NOT met"
  print(f'R2 = {round(r_squared,4)}')
  return text

In [13]:
#parent function 

def assumptions(df, label):
  """Run the five regression-assumption checks in order and print each result.

  For every check the heading line(s) are printed first, then the check is
  called with ``(df, label)`` and its return value is printed. Some checks
  also print on their own while running.

  Args:
    df: DataFrame holding the predictors and the label column.
    label: Name of the label column.
  """
  sections = (
    (
      (
        "Assumption #1: Linear Relationship\n",
        f'\nVariables that don\'t have a linear relationship with {label}\n',
      ),
      assumption1LinearRelationship,
    ),
    (("\nAssumption #2: Multicollinearity\n",), assumption2Multicollinearity),
    (("\nAssumption #3: Independence\n",), assumption3Independence),
    (("\nAssumption #4: Homoscedasticity\n",), assumption4Homoscedasticity),
    (("\nAssumption #5: Multivariate Normality\n",), assumption5MultivariateNormality),
  )
  for headings, check in sections:
    # Headings go out before the check runs so that anything the check prints
    # itself appears underneath them.
    for heading in headings:
      print(heading)
    print(check(df, label))