# Univariate Analysis

In [2]:
def calculateUnivariateStatsViz(df):
  """Plot every column of ``df`` and return a table of univariate statistics.

  Numeric columns get a box plot stacked above a histogram plus the full set
  of descriptive statistics. All other columns (including booleans) get a
  count plot and only the count / unique / dtype / missing statistics, with
  'NA' in the remaining cells. The statistics are also written as text next
  to each figure.

  Args:
    df: pandas DataFrame to summarise; one figure is shown per column.

  Returns:
    pandas DataFrame indexed by column name with the columns 'Count',
    'Unique', 'Data Type', 'Missing', 'Mode', 'Min', '25%', 'Median', '75%',
    'Max', 'STD Dev', 'Mean', 'Skew' and 'Kurt'.
  """
  import pandas as pd
  import matplotlib.pyplot as plt
  import seaborn as sns

  new_df = pd.DataFrame(columns=['Count', 'Unique', 'Data Type', 'Missing', 'Mode', 'Min', '25%', 'Median', '75%', 'Max', 'STD Dev', 'Mean', 'Skew', 'Kurt'])

  # Set the theme before any figure exists so the first plot is styled like the rest.
  sns.set(style='ticks')
  flierprops = dict(marker='o', markersize=4, markerfacecolor='none', linestyle='none', markeredgecolor='gray')

  for col in df:
    series = df[col]

    # Statistics shared by both branches, as (label, value) pairs so the
    # table row and the figure text are built from one computation.
    summary = [
      ('Count', series.count()),
      ('Unique', round(series.nunique(), 2)),
      ('Data Type', str(series.dtype)),
      ('Missing', round(series.isnull().sum(), 2)),
    ]

    # Booleans count as numeric for pandas, but quantiles/skew are not
    # meaningful for them, so they are summarised like categories instead.
    is_numeric = pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series)

    if is_numeric:
      modes = series.mode()
      summary += [
        # mode() is empty when the column holds no values at all.
        ('Mode', modes.values[0] if len(modes) else float('nan')),
        ('Min', round(series.min(), 2)),
        ('25%', round(series.quantile(.25), 2)),
        ('Median', round(series.median(), 2)),
        ('75%', round(series.quantile(.75), 2)),
        ('Max', round(series.max(), 2)),
        ('Std Dev', round(series.std(), 2)),
        ('Mean', round(series.mean(), 2)),
        ('Skew', round(series.skew(), 2)),
        ('Kurt', round(series.kurt(), 2)),
      ]
      row = [value for _, value in summary]

      f, (ax_box, ax) = plt.subplots(2, sharex=True, gridspec_kw={'height_ratios': (.15, .65)})
      sns.boxplot(x=series, ax=ax_box, fliersize=4, width=.50, linewidth=1, flierprops=flierprops)
      # Target the lower axes explicitly instead of relying on the current axes.
      sns.histplot(x=series, ax=ax)
      sns.despine(ax=ax)
      sns.despine(ax=ax_box, left=True, bottom=True)
      ax_box.set_title(col, fontsize=14)
      text_x, text_y = .9, .25
    else:
      row = [value for _, value in summary] + ['NA'] * 10

      # Own figure per column so the bars never land on a leftover axes.
      f, ax = plt.subplots()
      sns.countplot(x=col, data=df, order=series.value_counts().index, palette=sns.color_palette('RdBu_r', series.nunique()), ax=ax)
      sns.despine(ax=ax)
      ax.set_title(col)
      ax.set_xlabel(col)
      ax.set_ylabel('')
      text_x, text_y = .9, .5

    new_df.loc[col] = row
    text = ''.join(name + ': ' + str(value) + '\n' for name, value in summary)
    # Figure coordinates: the text sits at the right-hand edge of the figure.
    ax.text(text_x, text_y, text, fontsize=10, transform=f.transFigure)
    plt.show()

  return new_df


# Bivariate Analysis

In [3]:
#calculate TTest

def calculateTTest(df,feature,label):
  """Run an independent two-sample t-test of ``label`` between two groups.

  Rows where either ``feature`` or ``label`` is missing are ignored. The two
  groups compared are the first two distinct values of ``feature`` in order
  of appearance; any further values are not tested.

  Args:
    df: pandas DataFrame holding both columns.
    feature: name of the grouping column.
    label: name of the numeric column being compared.

  Returns:
    Tuple ``(oString, t, p)``: a printable summary with both values rounded
    to 2 decimals, the t statistic and the p value. ``t`` and ``p`` are NaN
    when fewer than two groups remain after dropping missing rows.
  """
  from scipy import stats

  # A missing feature would show up as its own (empty) group and a missing
  # label would turn the statistic into NaN, so drop both up front.
  clean = df[df[feature].notna() & df[label].notna()]
  feats = clean[feature].unique()

  if len(feats) < 2:
    t = p = float('nan')
  else:
    first = clean.loc[clean[feature] == feats[0], label]
    second = clean.loc[clean[feature] == feats[1], label]
    t, p = stats.ttest_ind(first, second)

  oString = 'T-TEST \nt stat: ' + str(round(t, 2))
  oString += '\np value: ' + str(round(p, 2))
  return oString, t, p

In [4]:
#Anova

def calculateANOVA(df, feature, label):
  """Run a one-way ANOVA of ``label`` across the groups of ``feature``.

  Rows where either ``feature`` or ``label`` is missing are ignored for both
  the F test and the Tukey HSD pairwise comparison.

  Args:
    df: pandas DataFrame holding both columns.
    feature: name of the grouping column.
    label: name of the numeric column being compared.

  Returns:
    Tuple ``(oString, tukey, f, p)``: a printable summary with F and p
    rounded to 2 decimals, the object returned by ``pairwise_tukeyhsd``,
    the F statistic and the p value.
  """
  from scipy import stats
  from statsmodels.stats.multicomp import pairwise_tukeyhsd

  # Missing features would form an empty group and missing labels would make
  # the F statistic NaN, so both tests run on complete rows only.
  clean = df[df[feature].notna() & df[label].notna()]
  groups = [clean.loc[clean[feature] == level, label] for level in clean[feature].unique()]

  f, p = stats.f_oneway(*groups)
  tukey = pairwise_tukeyhsd(endog=clean[label], groups=clean[feature])
  oString = 'ANOVA \nF stat: ' + str(round(f,2)) + '\np value: ' + str(round(p,2))
  return oString, tukey, f, p

In [5]:
#BarChart

def createBarChart(df, feature, label):
  """Draw a bar chart of ``label`` by ``feature`` annotated with a group test.

  With more than two groups the annotation comes from ``calculateANOVA``;
  with exactly two it comes from ``calculateTTest``. Missing values of
  ``feature`` are not counted as a group. The Tukey HSD result (an empty
  string when no ANOVA was run) is printed before the chart is shown.

  Args:
    df: pandas DataFrame holding both columns.
    feature: name of the categorical column placed on the x axis.
    label: name of the numeric column placed on the y axis.

  Returns:
    Tuple ``(plot, groups, stat, p)``: the object returned by
    ``sns.barplot``, the distinct non-missing values of ``feature``, the
    test statistic (F or t) and its p value. ``stat`` and ``p`` are NaN when
    there are fewer than two groups and no test can be run.
  """
  import seaborn as sns
  import matplotlib.pyplot as plt

  groups = df[feature].dropna().unique()

  # Each test is run exactly once and its results unpacked together.
  if len(groups) > 2:
    result, tukey, stat, p = calculateANOVA(df, feature, label)
  elif len(groups) == 2:
    result, stat, p = calculateTTest(df, feature, label)
    tukey = ''
  else:
    # Nothing to compare; still draw the chart instead of failing on
    # unassigned names further down.
    result = 'Fewer than two groups: no test run'
    tukey = ''
    stat = p = float('nan')

  print(tukey)
  plot = sns.barplot(data=df, x=feature, y=label)
  # Figure coordinates: x=1 puts the text at the right edge of the figure.
  plot.text(1,0.8,result,fontsize=12,transform=plot.figure.transFigure)
  plt.show()
  return plot, groups, stat, p

In [6]:
#ScatterPlot

def numericToNumericStats(df, feature, label):
  """Compute Pearson correlation and a least-squares line between two columns.

  Only rows where both ``feature`` and ``label`` are present are used
  (pairwise deletion), so missing values no longer break the correlation or
  the line fit.

  Args:
    df: pandas DataFrame holding both columns.
    feature: name of the numeric column used as x.
    label: name of the numeric column used as y.

  Returns:
    Two-item list ``[oString, [r, p, r2, equation]]``: a printable summary
    with values rounded to 2 decimals, followed by the unrounded Pearson r,
    its p value, r squared and the fitted line formatted as a string.
  """
  from scipy import stats
  import numpy as np

  complete = df[feature].notna() & df[label].notna()
  x = df.loc[complete, feature]
  y = df.loc[complete, label]

  r, p = stats.pearsonr(x, y)
  # Degree-1 polyfit returns the coefficients highest power first.
  slope, intercept = np.polyfit(x, y, 1)
  r2 = r**2

  equation = 'y = ' + str(round(slope,2)) + 'x + ' + str(round(intercept,2))
  oString = 'r value: ' + str(round(r,2)) + '\np value: ' + str(round(p,2))
  oString += '\nLinear Regression Equation: ' + equation + '\nr squared: ' + str(round(r2,2))
  return [oString, [r,p,r2,equation]]

def createScatterPlot(df,feature,label):
  """Show a regression joint plot of two columns with their statistics as text.

  Args:
    df: pandas DataFrame holding both columns.
    feature: name of the column placed on the x axis.
    label: name of the column placed on the y axis.

  Returns:
    The object returned by ``sns.jointplot``.
  """
  import seaborn as sns
  import matplotlib.pyplot as plt

  # First element of the helper's result is the printable summary.
  annotation = numericToNumericStats(df, feature, label)[0]

  grid = sns.jointplot(x=df[feature], y=df[label], kind='reg')
  # Figure coordinates: x=1 puts the text at the right edge of the figure.
  figure_coords = plt.gcf().transFigure
  grid.fig.text(1, 0.8, annotation, fontsize=12, transform=figure_coords)
  plt.show()
  return grid

In [7]:
def calculateBivariateStatsViz(df, label):
  """Plot and test every column of ``df`` against ``label``.

  Numeric columns are handled with ``numericToNumericStats`` and
  ``createScatterPlot``; all other columns with ``createBarChart``.

  Args:
    df: pandas DataFrame containing ``label`` and the feature columns.
    label: name of the column every other column is compared against.

  Returns:
    pandas DataFrame indexed by feature name with the columns 'Stat'
    ('r', 'F' or 'T'), '+/-' (direction of the correlation, blank for
    group tests and for zero or undefined correlations), 'Effect Size'
    (|r| or the test statistic, rounded to 2 decimals) and 'p-value'.
  """
  import pandas as pd
  import matplotlib.pyplot as plt

  new_df = pd.DataFrame(columns = ['Stat', '+/-', 'Effect Size', 'p-value'])
  # Stays None when df holds nothing but the label, so the final print
  # cannot hit an unassigned name.
  plot = None

  for col in df:
    if col == label:
      continue

    if pd.api.types.is_numeric_dtype(df[col]):
      r, p, r2, eq = numericToNumericStats(df, col, label)[1]
      if r > 0:
        sign = '+'
      elif r < 0:
        sign = '-'
      else:
        # Zero or NaN correlation has no direction.
        sign = ' '
      new_df.loc[col] = ['r', sign, round(abs(r),2), round(p,2)]
      plot = createScatterPlot(df, col, label)
      # NOTE(review): the plotting helpers in this file call plt.show()
      # before returning, so titles set afterwards may not be displayed - confirm.
      plot.fig.suptitle(col)
    else:
      plot, groups, val, p = createBarChart(df, col, label)
      plot.set_title(col)
      stat = 'F' if len(groups) > 2 else 'T'
      new_df.loc[col] = [stat, ' ', round(val,2) , round(p,2)]

  plt.show()
  if plot is not None:
    print(plot)
  return new_df

# Check 5 Regression Assumptions

In [8]:
#Assumption 1 - linear relationship

def assumption1LinearRelationship(df, label):
  """List numeric features whose correlation with ``label`` is weak.

  Every numeric column other than ``label`` that has no missing values is
  correlated with ``label``; those with |r| below 0.5 are shown as scatter
  plots and collected in the result.

  Args:
    df: pandas DataFrame containing ``label`` and the feature columns.
    label: name of the target column.

  Returns:
    pandas DataFrame indexed by feature name with a single 'r-value' column
    (rounded to 2 decimals), sorted from highest to lowest.
  """
  import pandas as pd

  weak = pd.DataFrame(columns=['r-value'])

  for name in df:
    column = df[name]
    if name == label or not pd.api.types.is_numeric_dtype(column):
      continue
    if column.isnull().sum() != 0:
      continue

    r = numericToNumericStats(df, name, label)[1][0]
    # Written as a negated "<" so an undefined (NaN) r is skipped as well.
    if not abs(r) < 0.5:
      continue

    createScatterPlot(df, name, label)
    weak.loc[name] = [round(r,2)]

  return weak.sort_values(by=['r-value'], ascending=False)

In [9]:
#Assumption 2 - Multicollinearity

def assumption2Multicollinearity(df, label):
  """Compute the variance inflation factor (VIF) of every numeric feature.

  Each numeric column except ``label`` is regressed on the remaining numeric
  columns; its VIF is ``1 / (1 - R^2)`` of that regression.

  Args:
    df: pandas DataFrame containing ``label`` and the feature columns.
      Only int16/32/64 and float16/32/64 columns are considered.
      NOTE(review): missing values are passed straight to the regression,
      which is expected to reject them - clean the data first.
    label: name of the target column, excluded from the calculation.

  Returns:
    pandas DataFrame indexed by feature name with a single 'VIF' column
    (rounded to 4 decimals), sorted from highest to lowest. A perfectly
    collinear feature gets ``inf``; a lone feature gets 1.0.
  """
  from sklearn.linear_model import LinearRegression
  import pandas as pd

  lst = ['int16','int32','int64','float16','float32','float64']
  ndf = df.select_dtypes(include=lst)
  # A non-numeric label is already gone after select_dtypes, so a missing
  # column here is not an error.
  ndf = ndf.drop(columns=[label], errors='ignore')

  vif_dict = {}
  for col in ndf:
    y = ndf[col]
    x = ndf.drop(columns=[col])
    if x.shape[1] == 0:
      # With no other predictors nothing can be explained: R^2 = 0, VIF = 1.
      r2 = 0.0
    else:
      r2 = LinearRegression().fit(x,y).score(x,y)
    # R^2 of exactly 1 means perfect collinearity; report it as infinite
    # inflation instead of dividing by zero.
    vif = float('inf') if r2 >= 1 else 1/(1-r2)
    vif_dict[col] = round(vif,4)

  return pd.DataFrame({'VIF': vif_dict}).sort_values(by='VIF', ascending=False)


In [10]:
#Assumption 3 - Independence
def assumption3Independence(df, label):
  """Check independence of residuals with the Durbin-Watson statistic.

  The assumption is reported as met when the statistic is not below 1.5
  and not above 2.5.

  Args:
    df: data passed on to ``mlr``.
    label: target column name passed on to ``mlr``.
      NOTE(review): ``mlr`` is defined outside this section; it is assumed
      to return a fitted model exposing ``.resid`` - confirm.

  Returns:
    String with the verdict on the first line and the Durbin-Watson value
    (rounded to 3 decimals) on the second.
  """
  from statsmodels.stats.stattools import durbin_watson

  fitted = mlr(df, label)
  dw = durbin_watson(fitted.resid)

  outside_band = dw > 2.5 or dw < 1.5
  verdict = 'The Assumption is NOT met' if outside_band else 'The Assumption IS met'
  return verdict + f'\nDurbin Watson: {round(dw,3)}'

In [11]:
#Assumption 4 - Homoscedasticity

def assumption4Homoscedasticity(df, label):
  """Check homoscedasticity of residuals with the Breusch-Pagan test.

  Prints whether the assumption is met (p-value of the Lagrange multiplier
  test not below 0.05) and returns the test values.

  Args:
    df: data passed on to ``mlr``.
    label: target column name passed on to ``mlr``.
      NOTE(review): ``mlr`` is defined outside this section; it is assumed
      to return a fitted statsmodels result exposing ``.resid`` and
      ``.model.exog`` - confirm.

  Returns:
    One-row pandas DataFrame (index 'Breusch-Pagan Values') with the columns
    'Lagrange multiplier statistic' and 'p-value', both rounded to 4 decimals.
  """
  import statsmodels.stats.api as sms
  import pandas as pd

  model = mlr(df, label)
  # Only the first two values (LM statistic and its p-value) are used.
  lm_stat, lm_pvalue = sms.het_breuschpagan(model.resid, model.model.exog)[:2]

  bp_df = pd.DataFrame(
    {'Lagrange multiplier statistic': round(lm_stat, 4), 'p-value': round(lm_pvalue, 4)},
    index = ['Breusch-Pagan Values'])

  # Decide on the unrounded p-value: rounding first would turn e.g. 0.04996
  # into 0.05 and flip the verdict.
  if lm_pvalue < .05:
    text = "The Homoscedasticity Assumption is NOT met"
  else:
    text = "The Homoscedasticity Assumption IS met"
  print(text)
  return bp_df

In [12]:
#Assumption 5 - Multivariate Normality

def assumption5MultivariateNormality(df, label):
  """Check normality of residuals with Jarque-Bera and a probability plot.

  Prints the Jarque-Bera statistic and p-value, draws a probability plot of
  the residuals and prints the R2 of its fitted line. The verdict is based
  on that R2 only: it must be at least 0.9 after rounding to one decimal.

  Args:
    df: data passed on to ``mlr``.
    label: target column name passed on to ``mlr``.
      NOTE(review): ``mlr`` is defined outside this section; it is assumed
      to return a fitted model exposing ``.resid`` - confirm.

  Returns:
    String stating whether the assumption is met.
  """
  import pandas as pd
  import matplotlib.pyplot as plt
  from scipy import stats
  from statsmodels.stats.stattools import jarque_bera

  fitted = mlr(df,label)

  # Skew and kurtosis are returned as well but not reported.
  jb_stat, jb_p, _skew, _kurt = jarque_bera(fitted.resid)
  jb_df = pd.DataFrame(
    {'Jarque-Bera test statistic': round(jb_stat, 4), 'p-value': jb_p},
    index = ['Jarque-Bera Values'])
  print(jb_df)

  _, ax = plt.subplots()
  # With fit=True the second element is (slope, intercept, r).
  _, line_fit = stats.probplot(fitted.resid, plot=ax, fit = True)
  r_squared = line_fit[2]**2

  is_normal = round(r_squared,1) >= 0.9
  print(f'R2 = {round(r_squared,4)}')
  if is_normal:
    return "The Multivariate Normality Assumption IS met"
  return "The Multivariate Normality Assumption is NOT met"

In [13]:
#parent function 

def assumptions(df, label) :
  """Run all five regression-assumption checks and print their results.

  Each check is preceded by its heading; the value the check returns is
  printed after it.

  Args:
    df: data passed on to every check.
    label: target column name passed on to every check.
  """
  steps = (
    (("Assumption #1: Linear Relationship\n",
      f"\nVariables that don't have a linear relationship with {label}\n"),
     assumption1LinearRelationship),
    (("\nAssumption #2: Multicollinearity\n",), assumption2Multicollinearity),
    (("\nAssumption #3: Independence\n",), assumption3Independence),
    (("\nAssumption #4: Homoscedasticity\n",), assumption4Homoscedasticity),
    (("\nAssumption #5: Multivariate Normality\n",), assumption5MultivariateNormality),
  )
  for headings, check in steps:
    for heading in headings:
      print(heading)
    print(check(df, label))