In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
%matplotlib inline
from statsmodels.stats.diagnostic import linear_rainbow, het_breuschpagan
from statsmodels.stats.outliers_influence import variance_inflation_factor
sns.set_style('darkgrid')

# Load the dataset used throughout the notebook; the file's first column
# becomes the row index.
df = pd.read_csv(
    '../asp_original',
    index_col=0,
)

def check_for_assumptions(modelname):
    """Print fit diagnostics for a fitted model and plot residual checks.

    Prints the model's R-squared, its coefficients, and the statistic and
    p-value returned by ``linear_rainbow``. Then shows a two-panel figure:
    a QQ-plot of the residuals against a normal distribution (left) and
    the residuals plotted against the model's predictions with a red
    reference line at zero (right).

    Parameters
    ----------
    modelname : fitted regression results object
        Must expose ``rsquared``, ``params``, ``resid`` and ``predict()``
        and be accepted by ``linear_rainbow`` (e.g. a statsmodels OLS fit).

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """
    print(f'Rsquared of Model: {modelname.rsquared}')
    print('----------')
    print('Beta values of Model:')
    print(modelname.params)

    rainbow_statistic, rainbow_p_value = linear_rainbow(modelname)
    print("Rainbow statistic:", rainbow_statistic)
    print("Rainbow p-value:", rainbow_p_value)

    residuals = modelname.resid
    predictions = modelname.predict()

    fig, ax = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: residual quantiles against a fitted normal distribution.
    sm.graphics.qqplot(residuals, dist=stats.norm, line='45', fit=True, ax=ax[0])
    ax[0].set_title('QQ-Plot of Residuals')

    # Right panel: residuals against predictions.
    ax[1].scatter(predictions, residuals)
    # NOTE(review): sns.set() changes seaborn/matplotlib settings globally;
    # kept at its original position so fonts render as before.
    sns.set(font_scale=1)
    ax[1].set_title('Homoscadasicity Assumption')
    # Label the right-hand axes explicitly instead of relying on pyplot's
    # implicit "current axes".
    ax[1].set_xlabel('Model Predictions')
    ax[1].set_ylabel('Model Residuals')
    # Size the zero line from the model's own predictions, not the global
    # ``df``, so it works when the model was fit on a different row count.
    ax[1].plot(predictions, np.zeros(len(predictions)), color='red')

    fig.tight_layout()
    plt.show()
def check_for_assumptions_with_kde(modelname):
    """Print fit diagnostics for a fitted model and plot residual checks.

    Prints the model's R-squared, its coefficients, and the statistic and
    p-value returned by ``linear_rainbow``. Then shows a two-panel figure:
    a kernel density estimate of the residuals (left) and the residuals
    plotted against the model's predictions with a red reference line at
    zero (right).

    Parameters
    ----------
    modelname : fitted regression results object
        Must expose ``rsquared``, ``params``, ``resid`` and ``predict()``
        and be accepted by ``linear_rainbow`` (e.g. a statsmodels OLS fit).

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """
    print(f'Rsquared of Model: {modelname.rsquared}')
    print('----------')
    print('Beta values of Model:')
    print(modelname.params)

    rainbow_statistic, rainbow_p_value = linear_rainbow(modelname)
    print("Rainbow statistic:", rainbow_statistic)
    print("Rainbow p-value:", rainbow_p_value)

    residuals = modelname.resid
    predictions = modelname.predict()

    fig, ax = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: density of the residuals.
    # NOTE(review): newer seaborn releases deprecate ``shade=`` in favour of
    # ``fill=``; switch once the minimum supported seaborn version is known.
    sns.kdeplot(residuals, shade=True, ax=ax[0])
    ax[0].set_title('Normality Assumption of Residuals')

    # Right panel: residuals against predictions.
    ax[1].scatter(predictions, residuals)
    # NOTE(review): sns.set() changes seaborn/matplotlib settings globally;
    # kept at its original position so fonts render as before.
    sns.set(font_scale=1)
    ax[1].set_title('Homoscadasicity Assumption')
    # Label the right-hand axes explicitly instead of relying on pyplot's
    # implicit "current axes".
    ax[1].set_xlabel('Model Predictions')
    ax[1].set_ylabel('Model Residuals')
    # Size the zero line from the model's own predictions, not the global
    # ``df``, so it works when the model was fit on a different row count.
    ax[1].plot(predictions, np.zeros(len(predictions)), color='red')

    fig.tight_layout()
    plt.show()

In [3]:
# Visualise a one-sided t-test: the t-distribution density, the critical
# value, the observed test statistic and the shaded rejection region.
# Relies on df_porch, df_noporch and test_result from an earlier cell.
significance_level = 0.05
# two-sample degrees of freedom, computed once and reused below
deg_freedom = len(df_porch) + len(df_noporch) - 2

xs = np.linspace(-4, 4, 200)
# use stats.t.pdf to get values on the probability density function for the t-distribution
# the second argument is the degrees of freedom
ys = stats.t.pdf(xs, deg_freedom, 0, 1)
# upper-tail critical value, rounded to three decimals
t_crit = np.round(stats.t.ppf(1 - significance_level, df=deg_freedom), 3)

fig = plt.figure(figsize=(15,8))
ax = fig.gca()

# plot the lines using matplotlib's plot function:
ax.plot(xs, ys, linewidth=3, color='darkblue', alpha=.75)

ax.axvline(t_crit, color='green', linestyle='--', lw=4, label='critical t-value', alpha=.75)
ax.axvline(test_result[0], color='red', linestyle='--', lw=4, label='test t-stat', alpha=.75)
# shade the region between the critical value and the density curve
ax.fill_betweenx(ys, xs, t_crit, where=xs > t_crit, label='rejection region')

ax.set_title('t-distribution with critical value and observed test statistic')
ax.set_xlabel('t')
ax.set_ylabel('Probability density')
# build the legend last so the shaded region is included
ax.legend()
plt.show()

In [None]:
def Cohen_d(group1, group2):
    """Compute Cohen's d effect size between two samples.

    d = (mean(group1) - mean(group2)) / s_pooled, with

        s_pooled**2 = ((n1 - 1) * s1**2 + (n2 - 1) * s2**2) / (n1 + n2 - 2)

    where s1**2 and s2**2 are the unbiased (``ddof=1``) sample variances.
    The ``ddof`` is passed explicitly so pandas objects and NumPy arrays
    (whose ``var()`` defaults differ) give the same result.

    Parameters
    ----------
    group1, group2 : sized objects exposing ``mean()`` and ``var(ddof=...)``
        For example pandas Series or NumPy arrays.

    Returns
    -------
    float
        The effect size; its sign follows ``group1.mean() - group2.mean()``.
        NaN when ``n1 + n2 - 2 <= 0``, because the pooled variance is
        undefined there.
    """
    n1, n2 = len(group1), len(group2)
    pooled_dof = n1 + n2 - 2
    if pooled_dof <= 0:
        # Too few observations to estimate a pooled variance.
        return np.nan

    diff = group1.mean() - group2.mean()

    # Unbiased sample variances, each weighted by its degrees of freedom.
    var1 = group1.var(ddof=1)
    var2 = group2.var(ddof=1)
    pooled_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / pooled_dof

    return diff / np.sqrt(pooled_var)

# Report Cohen's d for the porch vs. no-porch groups.
print(
    "Effect Size for difference in Home Prices for the two groups (Cohen's d): ",
    Cohen_d(df_porch, df_noporch),
)

In [None]:
# Overlay the densities of the two groups and mark each group's mean.
f, ax = plt.subplots(figsize=(12,6))
# Explicit colours keep each dashed mean line matched to its density curve;
# otherwise the curves take colours from the default cycle while the mean
# lines are hard-coded.
# NOTE(review): newer seaborn releases deprecate ``shade=`` in favour of
# ``fill=``; switch once the minimum supported seaborn version is known.
sns.kdeplot(df_porch, label='Homes with a Porch', shade=True, color='blue', ax=ax)
sns.kdeplot(df_noporch, label='Homes without a Porch', shade=True, color='green', ax=ax)
ax.axvline(df_porch.mean(), color='blue', linestyle='--', lw=1, alpha=.75)
ax.axvline(df_noporch.mean(), color='green', linestyle='--', lw=1, alpha=.75)
ax.set_title('Home Prices: Homes with vs. without a Porch')
ax.set_xlabel('Home Price')
ax.set_ylabel('Density')
ax.legend(fontsize='large')
plt.show()

In [None]:
# Columns used for model 2; 'saleprice' is listed first.
features2 = [
    'saleprice',
    'sqrt_totlivingsqft',
    'bathtotcount',
    'sqrt_porch_size',
]
# x-features: every listed column except 'saleprice'.
x_feats2 = [column for column in features2 if column != 'saleprice']
# NOTE(review): `mf` is not defined in the visible cells; presumably a helper
# module imported elsewhere - confirm before running on a fresh kernel.
mf.heatmap_multi(x_feats2, df)