<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-the-modules" data-toc-modified-id="Import-the-modules-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import the modules</a></span></li><li><span><a href="#Load-the-data" data-toc-modified-id="Load-the-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load the data</a></span></li><li><span><a href="#Statsmodels-linear-regression" data-toc-modified-id="Statsmodels-linear-regression-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Statsmodels linear regression</a></span></li><li><span><a href="#Statsmodels-logistic-regression" data-toc-modified-id="Statsmodels-logistic-regression-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Statsmodels logistic regression</a></span></li><li><span><a href="#Get-pandas-dataframe-from-summary-tables" data-toc-modified-id="Get-pandas-dataframe-from-summary-tables-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Get pandas dataframe from summary tables</a></span></li><li><span><a href="#Detail-explanation-of-Summary" data-toc-modified-id="Detail-explanation-of-Summary-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Detail explanation of Summary</a></span></li><li><span><a href="#Time-Taken" data-toc-modified-id="Time-Taken-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Time Taken</a></span></li></ul></div>

# Import the modules

In [1]:
import time
# Wall-clock start of the run; the final "Time Taken" cell subtracts this
# from the current time to report how long the whole notebook took.
time_start_notebook = time.time()

In [2]:
import numpy as np
import pandas as pd
import os,sys,time
import io

import copy
import inspect

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import statsmodels
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import statsmodels.robust as smrb # smrb.mad() etc
import patsy # y,X1 = patsy.dmatrices(formula, df, return_type='dataframe')


from IPython.display import display, HTML
# settings
# Seed constant; no cell visible in this notebook passes it to anything.
SEED = 100
# Let pandas display up to 100 columns before truncating wide frames.
pd.options.display.max_columns = 100

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the watermark extension and print the versions of the imported
# modules (the version listing is the output of this cell).
%load_ext watermark
%watermark -iv

pandas     : 1.5.3
seaborn    : 0.13.0
patsy      : 0.5.6
matplotlib : 3.8.2
statsmodels: 0.14.0
numpy      : 1.26.3
sys        : 3.11.7 (tags/v3.11.7:fa7a6f2, Dec  4 2023, 19:24:49) [MSC v.1937 64 bit (AMD64)]



In [3]:
# my local library
import sys
from pathlib import Path
import platform

# Per-OS location of the directory that contains the personal `bhishan`
# package. Only Windows and macOS ('Darwin') have a known location.
system_name = platform.system()
if system_name == 'Windows':
    p = Path("~/OneDrive - AmerisourceBergen(ABC)/bhishan").expanduser()
elif system_name == 'Darwin':
    p = Path.home() / "Dropbox/a00_Bhishan_Modules"
else:
    # Any other OS (e.g. Linux): no known directory. Previously `p` was left
    # undefined here and the append below failed with NameError. Leave
    # sys.path untouched so the import succeeds when `bhishan` is installed
    # in the environment, or fails with a clear ModuleNotFoundError.
    p = None

# Append only once so that re-running this cell does not keep growing sys.path.
if p is not None and str(p) not in sys.path:
    sys.path.append(str(p))
from bhishan import bp

# Load the data

In [4]:
# Build the modelling frame in one chain, starting from the raw seaborn
# titanic dataset so the cell gives the same result every time it is run:
#   1. encode `sex` numerically (male -> 1, female -> 2) and cast the
#      `alone` / `adult_male` flags to int,
#   2. drop the columns that are not used as model inputs,
#   3. drop every row that still has a missing value.
df = (
    sns.load_dataset('titanic')
    .assign(
        sex=lambda d: d['sex'].map({'male':1,'female':2}),
        alone=lambda d: d['alone'].astype(int),
        adult_male=lambda d: d['adult_male'].astype(int),
    )
    .drop(columns=['embarked','class','who','deck','embark_town','alive'])
    .dropna()
)

# Name of the binary outcome column.
target = 'survived'
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,adult_male,alone
0,0,3,1,22.0,1,0,7.25,1,0
1,1,1,2,38.0,1,0,71.2833,0,0
2,1,3,2,26.0,0,0,7.925,0,1
3,1,1,2,35.0,1,0,53.1,0,0
4,0,3,1,35.0,0,0,8.05,1,1


In [5]:
# Class balance of the outcome column (`target` is 'survived').
df[target].value_counts()

0    424
1    290
Name: survived, dtype: int64

# Statsmodels linear regression

In [6]:
# Show the signature and docstring of the helper used in the cells below
# to render fitted-model summaries.
help(bp.print_statsmodels_summary)

Help on function print_statsmodels_summary in module bhishan.ml_statsmodels:

print_statsmodels_summary(summary: Any, verbose: bool = True, topn: Optional[int] = None, firstn: Optional[int] = None, sort_col: Union[int, str, NoneType] = None, smallp: float = 1.0, show_const: bool = True, cmap_coef: str = 'Blues', cmap_p: str = 'Reds')
    Print statsmodels fitted model summary with some color hightlights.
    
    Parameters
    -----------
    summary: object
        Statsmodel fitted model summary.
    verbose: bool
        Whether or not to print statistic description.
    topn: int
        Top n sorted features based on p-values.
    firstn: int
        Number of first n features to show in summary.
    smallp: float
        Show features that have p-values smaller than this in summary.
    show_const: bool
        Whether or not to show constant statistics.
    cmap_coef: str
        Color map for coefficients.
    cmap_p: str
        Color map for p-values.
    
    Example
    --

In [7]:
# Regress `age` on every other column. sm.OLS does not include an intercept
# by default, so a constant column is appended to the design matrix.
ols_exog = df.drop(columns='age').assign(const=1)
model = sm.OLS(df['age'], ols_exog)

model_fit = model.fit()
summary = model_fit.summary()

# Non-verbose report, restricted to features with p-value below 0.05.
bp.print_statsmodels_summary(
    summary,
    verbose=False,
    smallp=0.05,
    sort_col='coef',
)

0,1,2,3
Dep. Variable:,age,R-squared:,0.342
Model:,OLS,Adj. R-squared:,0.334
Method:,Least Squares,F-statistic:,45.76
Date:,"Thu, 14 Mar 2024",Prob (F-statistic):,2.8600000000000003e-59
Time:,13:20:36,Log-Likelihood:,-2774.0
No. Observations:,714,AIC:,5566.0
Df Residuals:,705,BIC:,5607.0
Df Model:,8,,
Covariance Type:,nonrobust,,

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
survived,-3.5816,1.205,-2.972,0.003,-5.948,-1.216

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
pclass,-7.619,0.69,-11.048,0.0,-8.973,-6.265
const,13.1161,5.118,2.563,0.011,3.067,23.165
sex,16.9075,2.223,7.606,0.0,12.543,21.272
adult_male,20.6826,2.396,8.632,0.0,15.978,25.387

0,1,2,3
Omnibus:,32.68,Durbin-Watson:,1.889
Prob(Omnibus):,0.0,Jarque-Bera (JB):,35.963
Skew:,0.529,Prob(JB):,1.55e-08
Kurtosis:,3.301,Cond. No.,847.0


# Statsmodels logistic regression

In [8]:
# Logistic regression of `survived` on all remaining columns, with an
# explicit constant column appended to the design matrix.
logit_exog = df.drop(columns='survived').assign(const=1)
model = sm.Logit(df['survived'], logit_exog)

# NOTE: `model`, `model_fit` and `summary` are rebound here; the cells that
# follow work with this Logit fit, not the OLS fit above.
model_fit = model.fit()
summary = model_fit.summary()

# Non-verbose report, restricted to features with p-value below 0.05.
bp.print_statsmodels_summary(
    summary,
    verbose=False,
    smallp=0.05,
    sort_col='coef',
)

Optimization terminated successfully.
         Current function value: 0.418684
         Iterations 6


0,1,2,3
Dep. Variable:,survived,No. Observations:,714.0
Model:,Logit,Df Residuals:,705.0
Method:,MLE,Df Model:,8.0
Date:,"Thu, 14 Mar 2024",Pseudo R-squ.:,0.3801
Time:,13:20:36,Log-Likelihood:,-298.94
converged:,True,LL-Null:,-482.26
Covariance Type:,nonrobust,LLR p-value:,2.539e-74

Unnamed: 0,coef,std err,z,P>|z|,[0.025,0.975]
pclass,-1.1542,0.173,-6.684,0.0,-1.493,-0.816

Unnamed: 0,coef,std err,z,P>|z|,[0.025,0.975]
adult_male,-3.0302,0.546,-5.546,0.0,-4.101,-1.959
sibsp,-0.6676,0.155,-4.308,0.0,-0.971,-0.364
parch,-0.3095,0.148,-2.09,0.037,-0.6,-0.019
age,-0.0248,0.009,-2.806,0.005,-0.042,-0.007
const,5.1283,1.106,4.637,0.0,2.961,7.296


In [9]:
# List the attribute/method names available on the fitted Logit result.
# NOTE(review): the second argument looks like the number of columns in the
# listing (the rendered table has 5) - confirm against bp.show_methods.
bp.show_methods(model_fit,5)

Unnamed: 0,0,1,2,3,4
0,aic,fittedvalues,llr,predict,score_test
1,bic,get_distribution,llr_pvalue,prsquared,set_null_options
2,bse,get_influence,load,pvalues,summary
3,conf_int,get_margeff,method,remove_data,summary2
4,converged,get_prediction,mle_retvals,resid_dev,t_test
5,cov_kwds,im_ratio,mle_settings,resid_generalized,t_test_pairwise
6,cov_params,info_criteria,model,resid_pearson,tvalues
7,cov_type,initialize,nobs,resid_response,use_t
8,df_model,k_constant,normalized_cov_params,save,wald_test
9,df_resid,llf,params,scale,wald_test_terms


In [10]:
# List the attribute/method names available on the summary object
# (includes `tables` and the `as_csv` / `as_html` / `as_text` exporters).
bp.show_methods(summary)

Unnamed: 0,0,1,2
0,add_extra_txt,as_csv,as_text
1,add_table_2cols,as_html,extra_txt
2,add_table_params,as_latex,tables


# Get pandas dataframe from summary tables

In [11]:
# Display the full, unfiltered summary of the Logit fit from the previous section.
summary

0,1,2,3
Dep. Variable:,survived,No. Observations:,714.0
Model:,Logit,Df Residuals:,705.0
Method:,MLE,Df Model:,8.0
Date:,"Thu, 14 Mar 2024",Pseudo R-squ.:,0.3801
Time:,13:20:36,Log-Likelihood:,-298.94
converged:,True,LL-Null:,-482.26
Covariance Type:,nonrobust,LLR p-value:,2.539e-74

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
pclass,-1.1542,0.173,-6.684,0.000,-1.493,-0.816
sex,0.0356,0.505,0.071,0.944,-0.954,1.025
age,-0.0248,0.009,-2.806,0.005,-0.042,-0.007
sibsp,-0.6676,0.155,-4.308,0.000,-0.971,-0.364
parch,-0.3095,0.148,-2.090,0.037,-0.600,-0.019
fare,0.0030,0.003,1.131,0.258,-0.002,0.008
adult_male,-3.0302,0.546,-5.546,0.000,-4.101,-1.959
alone,-0.4593,0.313,-1.468,0.142,-1.072,0.154
const,5.1283,1.106,4.637,0.000,2.961,7.296


In [12]:
# The second table of the summary is the per-feature coefficient table
# (coef, std err, z, p-value, confidence bounds).
summary.tables[1]

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
pclass,-1.1542,0.173,-6.684,0.000,-1.493,-0.816
sex,0.0356,0.505,0.071,0.944,-0.954,1.025
age,-0.0248,0.009,-2.806,0.005,-0.042,-0.007
sibsp,-0.6676,0.155,-4.308,0.000,-0.971,-0.364
parch,-0.3095,0.148,-2.090,0.037,-0.600,-0.019
fare,0.0030,0.003,1.131,0.258,-0.002,0.008
adult_male,-3.0302,0.546,-5.546,0.000,-4.101,-1.959
alone,-0.4593,0.313,-1.468,0.142,-1.072,0.154
const,5.1283,1.106,4.637,0.000,2.961,7.296


In [13]:
# Turn the coefficient table (second table of the summary) into a DataFrame
# indexed by feature name, by round-tripping through its CSV export.
#
# The CSV export pads cell text, so the parsed column names and index labels
# can carry surrounding whitespace, and the blank top-left header cell ends
# up as the index name. Strip the labels and drop that index name so that
# lookups such as df_summary['coef'] or df_summary.loc['pclass'] work.
df_summary = (
    pd.read_csv(io.StringIO(summary.tables[1].as_csv()),index_col=0)
    .rename(columns=str.strip, index=str.strip)
    .rename_axis(index=None)
)
df_summary

Unnamed: 0,coef,std err,z,P>|z|,[0.025,0.975]
,,,,,,
pclass,-1.1542,0.173,-6.684,0.0,-1.493,-0.816
sex,0.0356,0.505,0.071,0.944,-0.954,1.025
age,-0.0248,0.009,-2.806,0.005,-0.042,-0.007
sibsp,-0.6676,0.155,-4.308,0.0,-0.971,-0.364
parch,-0.3095,0.148,-2.09,0.037,-0.6,-0.019
fare,0.003,0.003,1.131,0.258,-0.002,0.008
adult_male,-3.0302,0.546,-5.546,0.0,-4.101,-1.959
alone,-0.4593,0.313,-1.468,0.142,-1.072,0.154
const,5.1283,1.106,4.637,0.0,2.961,7.296


# Detail explanation of Summary

In [14]:
# Same report as in the logistic-regression section, but with verbose=True,
# which per the helper's docstring also prints the statistic descriptions.
# smallp=0.05 keeps only features whose p-value is below 0.05.
# NOTE(review): sort_col='coef' presumably orders the rows by coefficient -
# it is not described in the helper's docstring; confirm.
bp.print_statsmodels_summary(summary,verbose=True,smallp=0.05,sort_col='coef')

0,1,2,3
Dep. Variable:,survived,No. Observations:,714.0
Model:,Logit,Df Residuals:,705.0
Method:,MLE,Df Model:,8.0
Date:,"Thu, 14 Mar 2024",Pseudo R-squ.:,0.3801
Time:,13:20:36,Log-Likelihood:,-298.94
converged:,True,LL-Null:,-482.26
Covariance Type:,nonrobust,LLR p-value:,2.539e-74

Unnamed: 0,coef,std err,z,P>|z|,[0.025,0.975]
pclass,-1.1542,0.173,-6.684,0.0,-1.493,-0.816

Unnamed: 0,coef,std err,z,P>|z|,[0.025,0.975]
adult_male,-3.0302,0.546,-5.546,0.0,-4.101,-1.959
sibsp,-0.6676,0.155,-4.308,0.0,-0.971,-0.364
parch,-0.3095,0.148,-2.09,0.037,-0.6,-0.019
age,-0.0248,0.009,-2.806,0.005,-0.042,-0.007
const,5.1283,1.106,4.637,0.0,2.961,7.296


In [15]:
# Print the statsmodels documentation for the OLS class (parameters such as
# endog, exog, missing and hasconst).
help(sm.OLS)

Help on class OLS in module statsmodels.regression.linear_model:

class OLS(WLS)
 |  OLS(endog, exog=None, missing='none', hasconst=None, **kwargs)
 |  
 |  Ordinary Least Squares
 |  
 |  Parameters
 |  ----------
 |  endog : array_like
 |      A 1-d endogenous response variable. The dependent variable.
 |  exog : array_like
 |      A nobs x k array where `nobs` is the number of observations and `k`
 |      is the number of regressors. An intercept is not included by default
 |      and should be added by the user. See
 |      :func:`statsmodels.tools.add_constant`.
 |  missing : str
 |      Available options are 'none', 'drop', and 'raise'. If 'none', no nan
 |      checking is done. If 'drop', any observations with nans are dropped.
 |      If 'raise', an error is raised. Default is 'none'.
 |  hasconst : None or bool
 |      Indicates whether the RHS includes a user-supplied constant. If True,
 |      a constant is not checked for and k_constant is set to 1 and all
 |      result s

# Time Taken

In [16]:
# Elapsed wall-clock time since the first cell ran.
time_taken = time.time() - time_start_notebook

# Round to whole seconds once, *before* splitting into hours/minutes/seconds.
# Formatting each part separately with '{:.0f}' could print an impossible
# value such as "0 min 60 secs" for 59.6 s.
total_secs = int(round(time_taken))
h, rem_secs = divmod(total_secs, 60*60)
mins, secs = divmod(rem_secs, 60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, mins, secs))

Time taken to run whole notebook: 0 hr 0 min 3 secs
