# In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import scipy

def model_selection(data, response, alpha=0.05):
    """Select a linear model by forward stepwise selection and summarize it.

    Candidate predictors are the columns reported by ``data.corr()``, ranked
    by the absolute value of their correlation with ``response`` (strongest
    first).  The strongest predictor always enters the model.  Each remaining
    candidate is then added in turn and kept only when a partial F-test of
    the enlarged model against the current model is significant at ``alpha``;
    a rejected candidate is skipped and the search continues with the next.

    Parameters
    ----------
    data : pandas.DataFrame
        Dependent and independent variables, one per column.
    response : str
        Column name of the dependent variable in ``data``.
    alpha : float, optional
        Significance level of the partial F-test (default 0.05).

    Returns
    -------
    The ``summary()`` of the fitted OLS model that was selected.

    Raises
    ------
    KeyError
        If ``response`` is not a column of ``data.corr()``.
    ValueError
        If no column has a defined correlation with ``response``.

    Author : Bikas C. Bhattarai
    """
    strength = data.corr()[response].abs()

    # Remove the response by label (not by position) so that a predictor that
    # is perfectly correlated with it cannot be dropped by mistake, and
    # discard columns whose correlation is undefined (NaN).
    ranked = (
        strength.drop(labels=response)
        .dropna()
        .sort_values(ascending=False)
    )
    candidates = list(ranked.index)
    if not candidates:
        raise ValueError(
            "no candidate predictors correlated with %r" % (response,)
        )

    # The strongest single predictor is the baseline, so a model is always
    # returned even when no further term is significant.
    best_formula = response + '~' + candidates[0]
    best_fit = smf.ols(formula=best_formula, data=data).fit()

    for name in candidates[1:]:
        trial_formula = best_formula + '+' + name
        trial_fit = smf.ols(formula=trial_formula, data=data).fit()

        # Skip terms that add no model degrees of freedom; the nested-model
        # comparison below would otherwise divide by a zero df difference.
        if trial_fit.df_model <= best_fit.df_model:
            continue

        # Partial F-test of the larger model against the nested current one.
        _, p_value, _ = trial_fit.compare_f_test(best_fit)
        if p_value < alpha:
            best_formula, best_fit = trial_formula, trial_fit

    return best_fit.summary()
