## Check Normality Assumption

In [None]:
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
import scipy.stats as stats


def jarque_bera(model):
    """Run the Jarque-Bera normality test on a model's residuals.

    A histogram of the residuals is drawn as a visual companion to the test.

    Parameters
    ----------
    model : object
        Fitted regression results exposing a ``resid`` attribute.

    Returns
    -------
    list of tuple
        ``(label, value)`` pairs for the Jarque-Bera statistic, its p-value,
        the skew and the kurtosis, in that order.
    """
    name = ['Jarque-Bera', 'Prob', 'Skew', 'Kurtosis']
    test = sms.jarque_bera(model.resid)
    output = list(zip(name, test))

    # ``distplot`` is deprecated since seaborn 0.11 in favour of ``histplot``.
    # Prefer the replacement when it exists, but keep working on older
    # seaborn releases that only ship ``distplot``.
    histplot = getattr(sns, 'histplot', None)
    if histplot is not None:
        histplot(model.resid)
    else:
        sns.distplot(model.resid, kde=False)
    return output

def qq_plot(model):
    """Draw a Q-Q plot of the model residuals against a normal distribution.

    The residuals are read from ``model.resid``; the plot is drawn with a
    45-degree reference line and ``fit=True``. Nothing is returned.
    """
    residuals = model.resid
    sm.graphics.qqplot(
        residuals,
        line='45',
        fit=True,
        dist=stats.norm,
    )

In [None]:
# Q-Q plot of model.resid against a normal distribution.
qq_plot(model)

In [None]:
# Jarque-Bera test on model.resid, plus a histogram of the residuals.
jarque_bera(model)

## Check For Homoskedasticity

**Note: the Goldfeld-Quandt test used below assumes normally distributed errors!**

In [None]:
# Frame passed to gq_test: df without its 'date' and 'id' columns.
test_data = df.drop(columns=['date', 'id'])

def gq_test(data, model):
    """Plot residuals against fitted values and run the Goldfeld-Quandt test.

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide a ``'price'`` column whose ``.values`` line up
        positionally with ``model.fittedvalues``.
    model : object
        Fitted regression results exposing ``fittedvalues``, ``resid`` and
        ``model.exog``.

    Returns
    -------
    list of tuple
        ``[('F statistic', ...), ('p-value', ...)]`` from
        ``sms.het_goldfeldquandt``.
    """
    # Graph heteroskedasticity
    pred_val = model.fittedvalues.copy()
    true_val = data['price'].values.copy()
    # NOTE(review): the plotted residual is recomputed from data['price'],
    # while the test below uses model.resid. These agree only if the model
    # was fitted on the untransformed price column -- confirm.
    residual = true_val - pred_val

    # Fitted values go on the x-axis and residuals on the y-axis, so that a
    # change in residual spread across the fitted range is visible.
    fig, ax = plt.subplots(figsize=(6, 2.5))
    ax.scatter(pred_val, residual)
    ax.set_xlabel('Fitted values')
    ax.set_ylabel('Residuals')

    # Run Goldfeld Quandt test
    name = ['F statistic', 'p-value']
    test = sms.het_goldfeldquandt(model.resid, model.model.exog)
    # zip stops after the two labels, so the trailing 'ordering' value
    # returned by het_goldfeldquandt is intentionally dropped.
    return list(zip(name, test))

In [None]:
# Cannot reject the null hypothesis of homoscedasticity
gq_test(test_data, model)

## Check For Multicollinearity

In [None]:
# NOTE: this is an alias of df, not a copy; nothing below mutates it.
copy_df = df

# Restrict to numeric/bool columns before correlating. Older pandas dropped
# non-numeric columns (such as 'date') silently, but pandas >= 2.0 raises on
# them, so the selection is made explicit here.
numeric_df = copy_df.select_dtypes(include=['number', 'bool'])

# save absolute value of correlation matrix as a data frame
# converts all values to absolute value
# stacks the row:column pairs into a multiindex
# reset the index to set the multiindex to separate columns
# sort values. 0 is the column automatically generated by the stacking
pair_df = numeric_df.corr().abs().stack().reset_index().sort_values(0, ascending=False)

# zip the variable name columns (which were only named level_0 and level_1 by default) in a new column named "pairs"
pair_df['pairs'] = list(zip(pair_df.level_0, pair_df.level_1))

# set index to pairs
pair_df.set_index(['pairs'], inplace=True)

# drop level columns
pair_df.drop(columns=['level_1', 'level_0'], inplace=True)

# rename correlation column as cc rather than 0
pair_df.columns = ['cc']

# drop duplicates. This could be dangerous if you have variables perfectly correlated with variables other than themselves.
# The call is left disabled below, so every pair is listed twice: as (a, b) and as (b, a).
# df.drop_duplicates(inplace=True)

# Show strongly correlated pairs, excluding correlations of exactly 1
# (which includes each variable paired with itself).
pair_df[(pair_df.cc > .75) & (pair_df.cc < 1)]