In [6]:
#IMPORTS
from matplotlib import pyplot as plt
import statsmodels.api as sm
import scipy

In [8]:
#LINEARITY FUNCTION
# THE RELATIONSHIP BETWEEN THE TARGET AND PREDICTOR SHOULD BE LINEAR
def linearity_test(endog, exog):
    """Draw a residuals-vs-fitted scatter plot for an OLS fit.

    An ordinary least squares model of ``endog`` on ``exog`` is fitted, the
    in-sample predictions are computed, and the residuals
    (``endog - predictions``) are plotted against those predictions. If the
    linearity assumption holds, the points should scatter around zero with
    no visible curve or trend.

    Parameters:
        endog: the observed target values.
        exog: the predictor values, passed to ``sm.OLS`` exactly as given
            (no column is added or removed here).

    Returns:
        None. A new matplotlib figure is created as a side effect.
    """
    fitted_model = sm.OLS(endog=endog, exog=exog).fit()
    y_hat = fitted_model.predict(exog)
    errors = endog - y_hat

    figure, axes = plt.subplots()
    axes.scatter(y_hat, errors)
    axes.set_ylabel('residual values')
    axes.set_xlabel('predicted Y')
    # The figure just created is also pyplot's current figure, so titling it
    # directly is the same as going through plt.suptitle.
    figure.suptitle('Residuals Vs. Predictions')

In [9]:
# INDEPENDENCE FUNCTION
def independence_dest_DW(endog, exog):
    """Return the Durbin-Watson statistic for the residuals of an OLS fit.

    Parameters:
        endog: the observed target values.
        exog: the predictor values, passed to ``sm.OLS`` exactly as given.

    Returns:
        The Durbin-Watson test statistic computed on ``endog - predictions``.

    Details:
        The Durbin-Watson statistic tests the null hypothesis that there is
        no correlation among the errors. It ranges from 0 to 4: a value of 2
        indicates no correlation, a value below 2 indicates positive
        correlation, and a value above 2 indicates negative correlation.
    """
    fitted_model = sm.OLS(endog=endog, exog=exog).fit()
    errors = endog - fitted_model.predict(exog)
    return sm.stats.stattools.durbin_watson(errors)

In [10]:
def normality_test(endog, exog):
    """Return the diagnostics table of an OLS fit to check error normality.

    An OLS model of ``endog`` on ``exog`` is fitted and the third table of
    its ``summary()`` is returned. In the statsmodels OLS summary that table
    holds the residual diagnostics, including the Omnibus and Jarque-Bera
    normality tests with their p-values, plus skew and kurtosis.

    Reading the table:
        * The Jarque-Bera statistic is always non-negative; a value far from
          zero indicates the residuals are not normally distributed.
        * Prob(Omnibus) and Prob(JB) are p-values under the null hypothesis
          that the residuals are normal; small values argue against it.

    Parameters:
        endog: the observed target values.
        exog: the predictor values, passed to ``sm.OLS`` exactly as given.

    Returns:
        The summary table at index 2 of the fitted model's summary.
    """
    fitted_model = sm.OLS(endog=endog, exog=exog).fit()
    report = fitted_model.summary()
    return report.tables[2]

In [11]:
def homoskedasticity_test(endog, exog):
    """Draw a residuals-vs-target scatter plot to eyeball heteroskedasticity.

    An OLS model of ``endog`` on ``exog`` is fitted and the residuals
    (``endog - predictions``) are plotted against the observed target values.

    Often when errors are heteroskedastic they will be greater for greater
    values of the target. If the target has an exponential-like distribution,
    with lots of small values and few large values, the model will tend to
    focus on the smaller values when estimating its betas, producing
    volatility at the higher end of the spectrum. The plot then shows the
    residuals fanning out for larger values of the target.

    Parameters:
        endog: the observed target values (plotted on the x-axis).
        exog: the predictor values, passed to ``sm.OLS`` exactly as given.

    Returns:
        None. A new matplotlib figure is created as a side effect.
    """
    predictions = sm.OLS(endog=endog, exog=exog).fit().predict(exog)
    residuals = endog - predictions
    fig, ax = plt.subplots()
    ax.scatter(endog, residuals)
    ax.set_xlabel('Y values')
    ax.set_ylabel('Residual Values')
    # The x-axis is the observed target, not the predictions, so the title
    # must say so (the previous title was copied from the linearity plot).
    fig.suptitle('Residuals Vs. Y Values')
    
    