### Multiple Linear Regression on Boston dataset

#### Multiple Linear Regression on Boston dataset

#### Importing the required libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

#### Loading the data

In [2]:
boston = load_boston()

In [3]:
boston.data.shape

(506, 13)

In [4]:
X = pd.DataFrame(boston.data)

In [5]:
y = pd.DataFrame(boston.target)
y.columns = ['PRICE']

#### Inspecting the data

In [6]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [7]:
X.columns = boston.feature_names

In [8]:
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [9]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null float64
TAX        506 non-null float64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
dtypes: float64(13)
memory usage: 51.5 KB


In [10]:
y.head()

Unnamed: 0,PRICE
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [12]:
print(X_train.shape, X_test.shape)

(354, 13) (152, 13)


In [13]:
data = X_train.join(y_train)
corr_matrix = data.corr()
corr_matrix[['PRICE']].abs().sort_values(by = 'PRICE', ascending = False)

Unnamed: 0,PRICE
PRICE,1.0
LSTAT,0.743289
RM,0.708978
PTRATIO,0.564733
INDUS,0.504361
TAX,0.481319
NOX,0.42669
AGE,0.390942
RAD,0.389533
ZN,0.386115


In [14]:
corr_matrix_new = corr_matrix[['PRICE']].abs()
imp_var = corr_matrix_new[(corr_matrix_new['PRICE'] >= 0.35) & (corr_matrix_new['PRICE'] != 1)].index
not_imp_var = corr_matrix_new[(corr_matrix_new['PRICE'] < 0.35) & (corr_matrix_new['PRICE'] != 1)].index

In [15]:
imp_var

Index(['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'RAD', 'TAX', 'PTRATIO',
       'LSTAT'],
      dtype='object')

In [16]:
not_imp_var

Index(['CHAS', 'DIS', 'B'], dtype='object')

In [17]:
def back_var_sel(x_train, y_train):
    """a simple function to perform backward variable selection. 
    Input arguments are :  1) a dataframe with independent variables and 2) a df with target variable
    (basically x_train and y_train)2018-08-26 00:50:27 """
    imp_features = list(X_train.columns)
    not_imp_features = list()
    model = sm.OLS(y_train, sm.add_constant(x_train[imp_features])).fit()
    p_values = pd.DataFrame(model.pvalues[1:])

    while max(p_values[0]) > 0.05:
        not_imp_features.append(
            list(p_values[p_values[0] == max(p_values[0])].index)[0])
        imp_features = list(p_values[p_values[0] != max(p_values[0])].index)
        model = sm.OLS(y_train, sm.add_constant(x_train[imp_features])).fit()
        p_values = pd.DataFrame(round(model.pvalues[1:], 4))

    print('Important features are', imp_features, '\nRedundant features are',
          not_imp_features)
    return imp_features, not_imp_features,model

In [18]:
imp_features, not_imp_features,model = back_var_sel(X_train, y_train)

Important features are ['CRIM', 'ZN', 'CHAS', 'NOX', 'RM', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'] 
Redundant features are ['INDUS', 'AGE']


***

***

In [19]:
def variable_interactions(x) : 
    interactions = list()
    for i in x : 
        for j in x[x.index(i) + 1 : len(x)]: 
            interactions.append('%s:%s' %(i,j))
    return interactions

In [20]:
interactions = variable_interactions(imp_features)

In [21]:
def into_formula(target_var, interactions) :
    formula = ''
    for inter in interactions : 
        if len(formula) == 0 : 
            formula = '%s ~ %s' % (target_var, inter)
        else : 
            formula = formula + ' + %s' %inter
    return formula

In [22]:
formula = into_formula('PRICE', interactions)

In [23]:
formula

'PRICE ~ CRIM:ZN + CRIM:CHAS + CRIM:NOX + CRIM:RM + CRIM:DIS + CRIM:RAD + CRIM:TAX + CRIM:PTRATIO + CRIM:B + CRIM:LSTAT + ZN:CHAS + ZN:NOX + ZN:RM + ZN:DIS + ZN:RAD + ZN:TAX + ZN:PTRATIO + ZN:B + ZN:LSTAT + CHAS:NOX + CHAS:RM + CHAS:DIS + CHAS:RAD + CHAS:TAX + CHAS:PTRATIO + CHAS:B + CHAS:LSTAT + NOX:RM + NOX:DIS + NOX:RAD + NOX:TAX + NOX:PTRATIO + NOX:B + NOX:LSTAT + RM:DIS + RM:RAD + RM:TAX + RM:PTRATIO + RM:B + RM:LSTAT + DIS:RAD + DIS:TAX + DIS:PTRATIO + DIS:B + DIS:LSTAT + RAD:TAX + RAD:PTRATIO + RAD:B + RAD:LSTAT + TAX:PTRATIO + TAX:B + TAX:LSTAT + PTRATIO:B + PTRATIO:LSTAT + B:LSTAT'

In [24]:
X_train.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
141,1.62864,0.0,21.89,0.0,0.624,5.019,100.0,1.4394,4.0,437.0,21.2,396.9,34.41
272,0.1146,20.0,6.96,0.0,0.464,6.538,58.7,3.9175,3.0,223.0,18.6,394.96,7.73
135,0.55778,0.0,21.89,0.0,0.624,6.335,98.2,2.1107,4.0,437.0,21.2,394.67,16.96
298,0.06466,70.0,2.24,0.0,0.4,6.345,20.1,7.8278,5.0,358.0,14.8,368.24,4.97
122,0.09299,0.0,25.65,0.0,0.581,5.961,92.9,2.0869,2.0,188.0,19.1,378.09,17.93


In [25]:
y_train.head()

Unnamed: 0,PRICE
141,14.4
272,24.4
135,18.1
298,22.5
122,20.5


In [26]:
X_train.join(y_train)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
141,1.62864,0.0,21.89,0.0,0.6240,5.019,100.0,1.4394,4.0,437.0,21.2,396.90,34.41,14.4
272,0.11460,20.0,6.96,0.0,0.4640,6.538,58.7,3.9175,3.0,223.0,18.6,394.96,7.73,24.4
135,0.55778,0.0,21.89,0.0,0.6240,6.335,98.2,2.1107,4.0,437.0,21.2,394.67,16.96,18.1
298,0.06466,70.0,2.24,0.0,0.4000,6.345,20.1,7.8278,5.0,358.0,14.8,368.24,4.97,22.5
122,0.09299,0.0,25.65,0.0,0.5810,5.961,92.9,2.0869,2.0,188.0,19.1,378.09,17.93,20.5
22,1.23247,0.0,8.14,0.0,0.5380,6.142,91.7,3.9769,4.0,307.0,21.0,396.90,18.72,15.2
68,0.13554,12.5,6.07,0.0,0.4090,5.594,36.8,6.4980,4.0,345.0,18.9,396.90,13.09,17.4
20,1.25179,0.0,8.14,0.0,0.5380,5.570,98.1,3.7979,4.0,307.0,21.0,376.57,21.02,13.6
437,15.17720,0.0,18.10,0.0,0.7400,6.152,100.0,1.9142,24.0,666.0,20.2,9.32,26.45,8.7
14,0.63796,0.0,8.14,0.0,0.5380,6.096,84.5,4.4619,4.0,307.0,21.0,380.02,10.26,18.2


In [27]:
import statsmodels.formula.api as smf

In [28]:
mf = smf.ols(formula = 'PRICE ~%s' % b, data = X_train.join(y_train)).fit()

NameError: name 'b' is not defined

In [29]:
b

NameError: name 'b' is not defined

In [30]:
imp_features = list(X_train.join(y_train).columns)

In [31]:
imp_features

['CRIM',
 'ZN',
 'INDUS',
 'CHAS',
 'NOX',
 'RM',
 'AGE',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'B',
 'LSTAT',
 'PRICE']

In [32]:
print(mf.summary())

NameError: name 'mf' is not defined

In [201]:
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns

In [535]:
vif

Unnamed: 0,VIF Factor,features
0,585.42521,const
1,1.773321,CRIM
2,2.298641,ZN
3,3.991194,INDUS
4,1.073943,CHAS
5,4.395064,NOX
6,1.934161,RM
7,3.10086,AGE
8,3.956551,DIS
9,7.480539,RAD


### In-sample predictions & calculating the metrics manually

In [467]:
y_train_pred = m.predict(X_train[imp_features])

Calculating $R^2$

In [468]:
print(y_train.shape, y_train_pred.shape)

(354, 1) (354,)


In [469]:
y_train.head()

Unnamed: 0,PRICE
141,14.4
272,24.4
135,18.1
298,22.5
122,20.5


In [470]:
y_train_pred.columns = ['PRED_PRICE']

In [472]:
y_train_pred.head()

141     4.432201
272    28.395465
135    17.263390
298    29.404862
122    20.423404
dtype: float64

In [474]:
y_train['PRED_PRICE'] = y_train_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [475]:
y_train.head()

Unnamed: 0,PRICE,PRED_PRICE
141,14.4,4.432201
272,24.4,28.395465
135,18.1,17.26339
298,22.5,29.404862
122,20.5,20.423404


#### $R^2$ using pearson correlation

In [476]:
print('Manually calculated R^2 from pearson correlation for train_data is', 
      round(np.square(y_train.corr()[['PRICE']]).iloc[1,0], 3))

Manually calculated R^2 from pearson correlation for train_data is 0.764


#### $R^2$ from model summary

In [478]:
round(m.rsquared, 3)

0.764

#### $R^2$ using formula

In [485]:
1 - sum((y_train['PRICE'] - y_train['PRED_PRICE'])**2) / sum((y_train['PRICE'] - np.mean(y_train['PRICE']))**2)

0.7641573206614352

In [None]:
y_test_pred = m.predict(X_test[imp_features])

In [None]:
y_test['PRED_PRICE'] = y_test_pred

In [None]:
y_test.head()

In [None]:
print('Manually calculated R^2 from pearson correlation for test_data is', 
      round(np.square(y_test.corr()[['PRICE']]).iloc[1,0], 3))

In [None]:
m.rsquared

In [None]:
m_test = sm.OLS(y_test, X_test[imp_features]).fit()

In [None]:
def get_predictions( y_test, model ):
    y_pred_df = pd.DataFrame( { 'actual': y_test,
                               "predicted_prob": lg.predict( sm.add_constant( x_test ) ) } )
    return y_pred_df

In [203]:
data = X.join(y)

In [204]:
data.head()

Unnamed: 0,const,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,1.0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,1.0,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,1.0,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,1.0,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,1.0,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [205]:
corr_matrix = data.corr()
corr_matrix[['PRICE']].abs().sort_values(by = 'PRICE', ascending = False)

Unnamed: 0,PRICE
PRICE,1.0
LSTAT,0.737663
RM,0.69536
PTRATIO,0.507787
INDUS,0.483725
TAX,0.468536
NOX,0.427321
CRIM,0.385832
RAD,0.381626
AGE,0.376955


In [None]:
p_values[p_values[0] > 0.05].index

In [531]:
a = corr_matrix[['PRICE']].abs()
a[a['PRICE'] > 0.35].index

Index(['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'RAD', 'TAX', 'PTRATIO',
       'LSTAT', 'PRICE'],
      dtype='object')

In [511]:
corr_matrix.drop('PRICE', axis = 1)[corr_matrix.drop('PRICE', axis = 1) > 0.5][corr_matrix.drop('PRICE', axis = 1) != 1]

Unnamed: 0,const,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
const,,,,,,,,,,,,,,
CRIM,,,,,,,,,,0.622029,0.579564,,,
ZN,,,,,,,,,0.664408,,,,,
INDUS,,,,,,0.763651,,0.644779,,0.595129,0.72076,,,0.6038
CHAS,,,,,,,,,,,,,,
NOX,,,,0.763651,,,,0.73147,,0.611441,0.668023,,,0.590879
RM,,,,,,,,,,,,,,
AGE,,,,0.644779,,0.73147,,,,,0.506456,,,0.602339
DIS,,,0.664408,,,,,,,,,,,
RAD,,0.622029,,0.595129,,0.611441,,,,,0.910228,,,


In [207]:
imp_features

['CRIM', 'CHAS', 'RM', 'PTRATIO', 'B', 'LSTAT']

In [208]:
nimp_features

['NOX', 'ZN', 'AGE', 'RAD', 'TAX', 'const', 'DIS', 'INDUS']

In [228]:
corr_matrix[corr_matrix.abs() > 0.5]

Unnamed: 0,const,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
const,,,,,,,,,,,,,,,
CRIM,,1.0,,,,,,,,0.622029,0.579564,,,,
ZN,,,1.0,-0.533828,,-0.516604,,-0.569537,0.664408,,,,,,
INDUS,,,-0.533828,1.0,,0.763651,,0.644779,-0.708027,0.595129,0.72076,,,0.6038,
CHAS,,,,,1.0,,,,,,,,,,
NOX,,,-0.516604,0.763651,,1.0,,0.73147,-0.76923,0.611441,0.668023,,,0.590879,
RM,,,,,,,1.0,,,,,,,-0.613808,0.69536
AGE,,,-0.569537,0.644779,,0.73147,,1.0,-0.747881,,0.506456,,,0.602339,
DIS,,,0.664408,-0.708027,,-0.76923,,-0.747881,1.0,,-0.534432,,,,
RAD,,0.622029,,0.595129,,0.611441,,,,1.0,0.910228,,,,


In [547]:
imp_features.pop(6)
imp_features

['const', 'CRIM', 'RM', 'RAD', 'TAX', 'PTRATIO', 'LSTAT']

In [229]:
mm = sm.OLS(y_train, X_train[['CRIM', 'CHAS', 'PTRATIO', 'B', 'LSTAT']]).fit()

In [230]:
print(mm.summary())

                            OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.903
Model:                            OLS   Adj. R-squared:                  0.902
Method:                 Least Squares   F-statistic:                     649.7
Date:                Thu, 23 Aug 2018   Prob (F-statistic):          2.37e-174
Time:                        23:54:53   Log-Likelihood:                -1222.2
No. Observations:                 354   AIC:                             2454.
Df Residuals:                     349   BIC:                             2474.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
CRIM          -0.0714      0.055     -1.288      0.1

In [214]:
m_test = sm.OLS(y_test, X_test[imp_features]).fit()

In [216]:
print(m_test.summary())

                            OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     459.5
Date:                Thu, 23 Aug 2018   Prob (F-statistic):           4.09e-92
Time:                        23:38:11   Log-Likelihood:                -470.58
No. Observations:                 152   AIC:                             953.2
Df Residuals:                     146   BIC:                             971.3
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
CRIM           0.0558      0.063      0.887      0.3

In [439]:
y_test.head()

Unnamed: 0,PRICE
329,22.6
371,50.0
219,23.0
403,8.3
78,21.2


In [440]:
y_pred_df = pd.DataFrame({'actual' : y_test['PRICE'], 'predicted' : m.predict(X_test[features])})

In [441]:
y_pred_df.head()

Unnamed: 0,actual,predicted
329,22.6,24.686849
371,50.0,23.930927
219,23.0,29.486155
403,8.3,12.055391
78,21.2,21.311147


In [442]:
1 - sum((y_pred_df['actual'] - y_pred_df['predicted'])**2) / sum((y_pred_df['actual'] - np.mean(y_pred_df['actual']))**2)

0.6752825042482071

In [262]:
import os
os.getcwd()

'C:\\Users\\sathwikkoushal\\Desktop\\ML_P'

In [280]:
data = pd.read_csv('Heart.csv', header = 0, index_col = 0)

In [281]:
data['AHD'] = data.AHD.replace(['Yes', 'No'], [1, 0])

In [445]:
y = data.dropna()['AHD']

In [291]:
y.shape

(297,)

In [269]:
heart = pd.read_csv('Heart.csv', header = 0, index_col = 0)

In [276]:
print(heart.shape)

(297, 20)


In [271]:
heart = heart.dropna()

In [275]:
heart.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD,Sex_Cat,Fbs_Cat,RestECG_Cat,ExAng_Cat,Slope_Cat,Ca_Cat
1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,0,1,1,2,0,3,0.0
2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,1,1,0,2,1,2,3.0
3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,1,1,0,2,1,2,2.0
4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,0,1,0,0,0,3,0.0
5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,0,0,0,2,0,1,0.0


In [273]:
heart['Sex_Cat']     = heart.Sex.astype("category")
heart['Fbs_Cat']     = heart.Fbs.astype("category")
heart['RestECG_Cat'] = heart.RestECG.astype("category")
heart['ExAng_Cat']   = heart.ExAng.astype("category")
heart['Slope_Cat']   = heart.Slope.astype("category")
heart['Ca_Cat']      = heart.Ca.astype("category")

In [274]:
heart['AHD'] = heart.AHD.replace(['Yes', 'No'], [1, 0])

In [303]:
X_features = list(heart.columns )
X_features.remove('AHD')
X_features.remove('Sex')
X_features.remove('Fbs')
X_features.remove('RestECG')
X_features.remove('ExAng')
X_features.remove('Slope')
X_features.remove('Ca')
X_features

ValueError: list.remove(x): x not in list

In [459]:
X_features

['Age',
 'RestBP',
 'Chol',
 'MaxHR',
 'Oldpeak',
 'ChestPain_nonanginal',
 'ChestPain_nontypical',
 'ChestPain_typical',
 'Thal_normal',
 'Thal_reversable',
 'Sex_Cat_1',
 'Fbs_Cat_1',
 'RestECG_Cat_1',
 'RestECG_Cat_2',
 'ExAng_Cat_1',
 'Slope_Cat_2',
 'Slope_Cat_3',
 'Ca_Cat_1.0',
 'Ca_Cat_2.0',
 'Ca_Cat_3.0']

In [278]:
heart = pd.get_dummies(heart[X_features], drop_first = True)

In [279]:
len(heart.columns)

20

In [283]:
x = heart

In [460]:
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size = 0.3, random_state = 56 )

In [529]:
logit = sm.Logit( y_train, sm.add_constant( x_train ) )
lg = logit.fit()

Optimization terminated successfully.
         Current function value: 0.239217
         Iterations 9


In [530]:
from scipy import stats
stats.chisqprob = lambda chisq, df : stats.chi2.sf(chisq, df)

In [531]:
print(lg.summary())

                           Logit Regression Results                           
Dep. Variable:                    AHD   No. Observations:                  207
Model:                          Logit   Df Residuals:                      186
Method:                           MLE   Df Model:                           20
Date:                Fri, 24 Aug 2018   Pseudo R-squ.:                  0.6539
Time:                        02:54:33   Log-Likelihood:                -49.518
converged:                       True   LL-Null:                       -143.07
                                        LLR p-value:                 3.915e-29
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                  -10.1254      4.180     -2.422      0.015     -18.319      -1.932
Age                     -0.0081      0.033     -0.242      0.809      -0.074       0.057
RestBP      

In [451]:
x = sm.add_constant(x)

In [532]:
x_features = list(x_train.columns)

In [533]:
p_values = pd.DataFrame(lg.pvalues)

In [534]:
p_values

Unnamed: 0,0
const,0.01543
Age,0.808922
RestBP,0.00217
Chol,0.439231
MaxHR,0.193643
Oldpeak,0.026501
ChestPain_nonanginal,0.000118
ChestPain_nontypical,0.183532
ChestPain_typical,0.000258
Thal_normal,0.277325


In [535]:
n_features = list()

In [536]:
while max(p_values[0]) > 0.05 :
    n_features.append(list(p_values[p_values[0] == max(p_values[0])].index)[0])
    x_features = list(p_values[p_values[0] != max(p_values[0])].index)
    temp_df = x_train[x_features]
    logit = sm.Logit(y_train, temp_df)
    lg = logit.fit()
    p_values = pd.DataFrame(round(lg.pvalues, 4))

print('Important features are', x_features, '\nRedundant features are', n_features)

Optimization terminated successfully.
         Current function value: 0.239225
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.239371
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.239589
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.240799
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.242396
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.244685
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.249158
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.254841
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.261389
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.269155
  

In [537]:
print(lg.summary())

                           Logit Regression Results                           
Dep. Variable:                    AHD   No. Observations:                  207
Model:                          Logit   Df Residuals:                      196
Method:                           MLE   Df Model:                           10
Date:                Fri, 24 Aug 2018   Pseudo R-squ.:                  0.6106
Time:                        02:56:16   Log-Likelihood:                -55.715
converged:                       True   LL-Null:                       -143.07
                                        LLR p-value:                 2.925e-32
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   -8.6249      2.260     -3.816      0.000     -13.055      -4.195
RestBP                   0.0397      0.015      2.565      0.010       0.009       0.070
Oldpeak     

In [307]:
imp_features

['Oldpeak',
 'ChestPain_nonanginal',
 'ChestPain_nontypical',
 'ChestPain_typical',
 'Thal_reversable',
 'Ca_Cat_1.0']

In [308]:
nimp_features

['Age',
 'RestBP',
 'Chol',
 'MaxHR',
 'Thal_normal',
 'Sex_Cat_1',
 'Fbs_Cat_1',
 'RestECG_Cat_1',
 'RestECG_Cat_2',
 'ExAng_Cat_1',
 'Slope_Cat_2',
 'Slope_Cat_3',
 'Ca_Cat_2.0',
 'Ca_Cat_3.0']

In [309]:
print(lg1.summary())

                           Logit Regression Results                           
Dep. Variable:                    AHD   No. Observations:                  207
Model:                          Logit   Df Residuals:                      201
Method:                           MLE   Df Model:                            5
Date:                Fri, 24 Aug 2018   Pseudo R-squ.:                  0.4744
Time:                        01:07:26   Log-Likelihood:                -75.199
converged:                       True   LL-Null:                       -143.07
                                        LLR p-value:                 1.432e-27
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
Oldpeak                  0.7774      0.212      3.675      0.000       0.363       1.192
ChestPain_nonanginal    -3.4568      0.546     -6.337      0.000      -4.526      -2.388
ChestPain_no