In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
from statsmodels.stats.proportion import proportion_confint
from itertools import product

## Importing Data

In [None]:
# Load the person-level frame that every later cell reads from.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
person_df = pd.read_pickle('../../edited_fund_person_data.pkl')
person_df

In [3]:
# Province lookup table, re-indexed by its 'province_id_ihio' column.
province_id = (
    pd.read_excel('../../diabeteschronicabuserprojectihio/Test_sample/Test/province_id.xlsx')
    .set_index('province_id_ihio')
)
province_id.head()

Unnamed: 0_level_0,province_id_iso,province_name_en,province_name_fa
province_id_ihio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
26,0,Markazi,مركزي
1,1,Gilan,گيلان
2,2,Mazandaran,مازندران
3,3,"Azerbaijan, East",آذربايجان شرقي
4,4,"Azerbaijan, West",آذربايجان غربي


## Building Empty Table

In [4]:
# Show the age bins; these categories become the 'age_cat' rows of the table.
person_df['age_cat'].cat.categories

IntervalIndex([[18, 39], [40, 64], [65, 95]], dtype='interval[int64, both]')

In [5]:
# Look up the names behind four 'province_id_ihio' codes.
province_id.loc[[23, 27, 9, 10]]

Unnamed: 0_level_0,province_id_iso,province_name_en,province_name_fa
province_id_ihio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
23,23,Tehran,تهران
27,26,Qazvin,قزوين
9,8,Kerman,كرمان
10,10,Isfahan,اصفهان


In [6]:
# Second-level labels for each row group, in the order they appear in the table:
# age bins in category order, genders in order of first appearance, funds and
# provinces from most to least frequent.
level_values = {
    'age_cat': person_df['age_cat'].cat.categories,
    'gender': person_df['gender'].unique(),
    'fund': person_df['fund'].value_counts(ascending=False).index,
    'province': person_df['province'].value_counts(ascending=False).index,
}

idx_list = [
    pd.MultiIndex.from_product([[group], labels])
    for group, labels in level_values.items()
]
# Single extra row that will hold the head counts.
idx_list.append(pd.MultiIndex.from_tuples([('total', 'number')]))
idx_list

[MultiIndex([('age_cat', [18, 39]),
             ('age_cat', [40, 64]),
             ('age_cat', [65, 95])],
            ),
 MultiIndex([('gender', 1),
             ('gender', 0)],
            ),
 MultiIndex([('fund',          'Civil servants'),
             ('fund',                   'Rural'),
             ('fund', 'Iranian + Self-employed'),
             ('fund',               'Universal'),
             ('fund',                  'Others'),
             ('fund',                 'Foreign')],
            names=[None, 'fund']),
 MultiIndex([('province', 23),
             ('province',  7),
             ('province',  9),
             ('province', 10),
             ('province',  2),
             ('province',  1),
             ('province', 11),
             ('province', 27),
             ('province',  8),
             ('province',  5),
             ('province', 12),
             ('province', 13),
             ('province', 15),
             ('province', 21),
             ('province',  0),
   

In [7]:
# Flatten the per-group indexes into one list of (group, label) tuples.
m_idx = [pair for group_index in idx_list for pair in group_index.to_list()]

# One column for the whole population plus one per value of 'cat'.
cols = ['all'] + sorted(person_df['cat'].unique())

# Empty frame with the final row/column layout.
table = pd.DataFrame(
    index=pd.MultiIndex.from_tuples(m_idx),
    columns=cols,
)
table

Unnamed: 0,Unnamed: 1,all,0,1,2,3,4
age_cat,"[18, 39]",,,,,,
age_cat,"[40, 64]",,,,,,
age_cat,"[65, 95]",,,,,,
gender,1,,,,,,
gender,0,,,,,,
fund,Civil servants,,,,,,
fund,Rural,,,,,,
fund,Iranian + Self-employed,,,,,,
fund,Universal,,,,,,
fund,Others,,,,,,


## Filling in the Values

In [8]:
def stringer(val, lower, upper, decimals=2):
    """Format proportions and their bounds as 'value (lower-upper)' percentages.

    Parameters
    ----------
    val, lower, upper : pandas.Series
        Proportions on the 0-1 scale (point estimate, lower bound, upper
        bound). The three series are combined element-wise by index.
    decimals : int, default 2
        Number of decimal places to round to and to print.

    Returns
    -------
    pandas.Series
        Strings such as '49.50 (49.33-49.67)'.
    """
    # Print a fixed number of decimals: plain ``astype(str)`` drops trailing
    # zeros ('49.5', '26.1'), which gave the table mixed precision.
    places = max(int(decimals), 0)
    fmt = f'{{:.{places}f}}'.format

    def as_percent(series):
        # Scale to percent, round as before, then render with fixed width.
        return (series * 100).round(decimals).map(fmt)

    s = as_percent(val) + ' (' + as_percent(lower) + '-' + as_percent(upper) + ')'

    return s

st.t.interval(confidence=0.95,
             df=len(person_df)-1,
             loc=person_df['age'].mean(),
             scale=st.sem(person_df['age']))

In [9]:
# Row groups of the summary table. 'total' is skipped by the fill loop; its
# ('total', 'number') row is written in a separate cell.
metrics = ['age_cat', 'gender', 'fund', 'province', 'total']
cols

['all', 0, 1, 2, 3, 4]

In [10]:
# Long-format skeleton: one row per (column, group, label) combination,
# iterating columns in the outer loop and table rows in the inner loop.
normalized_counts_index = pd.MultiIndex.from_tuples(
    [(col, group, label) for col in cols for group, label in m_idx]
)
normalized_counts = pd.DataFrame(index=normalized_counts_index)
normalized_counts

Unnamed: 0,Unnamed: 1,Unnamed: 2
all,age_cat,"[18, 39]"
all,age_cat,"[40, 64]"
all,age_cat,"[65, 95]"
all,gender,1
all,gender,0
...,...,...
4,province,28
4,province,26
4,province,19
4,province,29


In [11]:


# Fill `normalized_counts` with, for every column and every level of every
# metric, the share of people in that level ('val') and the bounds of its
# 95% normal-approximation confidence interval ('lower', 'upper').
for col in cols:

    # The subset and its size depend only on `col`, so compute them once per
    # column instead of once per metric.
    if col == 'all':
        df = person_df
    else:
        df = person_df.loc[person_df['cat'] == col]

    nobs = len(df)

    for m in metrics:

        # 'total' has no shares to compute; its row is written in a later cell.
        if m == 'total':
            continue

        temp = df[m].value_counts(normalize=True)
        temp_count = df[m].value_counts()

        # Materialise the keys so the row selection is an explicit list.
        indexes = list(product([col], [m], temp.index))

        normalized_counts.loc[indexes, 'val'] = temp.values

        # Take the levels from `m_idx` (the tuples `table` was built from)
        # rather than from `table`, which a later cell reassigns.
        for idx in (label for group, label in m_idx if group == m):

            lower, upper = proportion_confint(
                # A level that never occurs in this subset has a count of 0
                # instead of raising KeyError. Its 'val' stays NaN, so the
                # later dropna() removes the row.
                count=temp_count.get(idx, 0),
                nobs=nobs,
                alpha=0.05,
                method='normal'
            )

            normalized_counts.loc[(col, m, idx), ['lower', 'upper']] = (lower, upper)
        


In [12]:
# Keep only the rows that have a value and both bounds.
normalized_counts = normalized_counts.dropna()

s = stringer(
    val=normalized_counts['val'],
    lower=normalized_counts['lower'],
    upper=normalized_counts['upper'],
)

# Move the outermost index level (the column key) into the columns, then
# select the single unnamed data column to get a flat set of column labels.
wide = pd.DataFrame(s).unstack(0)
table = wide[0]
table

Unnamed: 0,Unnamed: 1,0,1,2,3,4,all
age_cat,"[18, 39]",25.34 (25.19-25.49),26.32 (26.17-26.46),26.24 (26.1-26.39),26.81 (26.65-26.96),25.34 (25.19-25.48),26.02 (25.96-26.09)
age_cat,"[40, 64]",50.49 (50.32-50.67),49.84 (49.68-50.0),50.96 (50.8-51.12),52.03 (51.86-52.2),55.82 (55.65-55.98),51.82 (51.75-51.9)
age_cat,"[65, 95]",24.17 (24.01-24.32),23.84 (23.7-23.98),22.8 (22.66-22.94),21.17 (21.03-21.31),18.85 (18.72-18.98),22.16 (22.1-22.22)
gender,1,37.99 (37.82-38.16),34.23 (34.07-34.38),32.06 (31.9-32.21),30.18 (30.02-30.34),31.18 (31.02-31.33),33.06 (32.99-33.13)
gender,0,62.01 (61.84-62.18),65.77 (65.62-65.93),67.94 (67.79-68.1),69.82 (69.66-69.98),68.82 (68.67-68.98),66.94 (66.87-67.01)
fund,Civil servants,26.12 (25.97-26.28),37.11 (36.95-37.27),44.42 (44.26-44.58),49.5 (49.33-49.67),50.93 (50.76-51.1),41.83 (41.76-41.9)
fund,Rural,39.35 (39.18-39.52),23.6 (23.46-23.74),16.37 (16.24-16.49),11.4 (11.29-11.51),6.48 (6.4-6.56),19.14 (19.08-19.2)
fund,Iranian + Self-employed,11.45 (11.34-11.57),14.16 (14.05-14.28),14.73 (14.61-14.84),14.86 (14.74-14.98),12.31 (12.2-12.42),13.55 (13.5-13.6)
fund,Universal,16.95 (16.82-17.08),17.07 (16.95-17.2),14.51 (14.4-14.63),11.38 (11.27-11.49),6.17 (6.08-6.25),13.21 (13.16-13.26)
fund,Others,5.75 (5.67-5.83),7.85 (7.77-7.94),9.89 (9.79-9.99),12.83 (12.71-12.94),24.11 (23.96-24.25),12.13 (12.08-12.18)


In [13]:
# Head count per value of 'cat'; the 'all' column gets the size of the full frame.
temp = person_df['cat'].value_counts()

for col in cols:
    n_people = len(person_df) if col == 'all' else temp[col]
    table.loc[('total', 'number'), col] = n_people

temp

cat
1    359121
2    358920
4    341918
3    332538
0    311685
Name: count, dtype: int64

In [14]:
# Show the table again now that the ('total', 'number') row has been written.
table

Unnamed: 0,Unnamed: 1,0,1,2,3,4,all
age_cat,"[18, 39]",25.34 (25.19-25.49),26.32 (26.17-26.46),26.24 (26.1-26.39),26.81 (26.65-26.96),25.34 (25.19-25.48),26.02 (25.96-26.09)
age_cat,"[40, 64]",50.49 (50.32-50.67),49.84 (49.68-50.0),50.96 (50.8-51.12),52.03 (51.86-52.2),55.82 (55.65-55.98),51.82 (51.75-51.9)
age_cat,"[65, 95]",24.17 (24.01-24.32),23.84 (23.7-23.98),22.8 (22.66-22.94),21.17 (21.03-21.31),18.85 (18.72-18.98),22.16 (22.1-22.22)
gender,1,37.99 (37.82-38.16),34.23 (34.07-34.38),32.06 (31.9-32.21),30.18 (30.02-30.34),31.18 (31.02-31.33),33.06 (32.99-33.13)
gender,0,62.01 (61.84-62.18),65.77 (65.62-65.93),67.94 (67.79-68.1),69.82 (69.66-69.98),68.82 (68.67-68.98),66.94 (66.87-67.01)
fund,Civil servants,26.12 (25.97-26.28),37.11 (36.95-37.27),44.42 (44.26-44.58),49.5 (49.33-49.67),50.93 (50.76-51.1),41.83 (41.76-41.9)
fund,Rural,39.35 (39.18-39.52),23.6 (23.46-23.74),16.37 (16.24-16.49),11.4 (11.29-11.51),6.48 (6.4-6.56),19.14 (19.08-19.2)
fund,Iranian + Self-employed,11.45 (11.34-11.57),14.16 (14.05-14.28),14.73 (14.61-14.84),14.86 (14.74-14.98),12.31 (12.2-12.42),13.55 (13.5-13.6)
fund,Universal,16.95 (16.82-17.08),17.07 (16.95-17.2),14.51 (14.4-14.63),11.38 (11.27-11.49),6.17 (6.08-6.25),13.21 (13.16-13.26)
fund,Others,5.75 (5.67-5.83),7.85 (7.77-7.94),9.89 (9.79-9.99),12.83 (12.71-12.94),24.11 (23.96-24.25),12.13 (12.08-12.18)


In [15]:
# Inspect the numeric 'val' / 'lower' / 'upper' columns behind the formatted table.
normalized_counts

Unnamed: 0,Unnamed: 1,Unnamed: 2,val,lower,upper
all,age_cat,"[18, 39]",0.260210,0.259552,0.260869
all,age_cat,"[40, 64]",0.518215,0.517464,0.518965
all,age_cat,"[65, 95]",0.221575,0.220951,0.222198
all,gender,1,0.330574,0.329868,0.331280
all,gender,0,0.669426,0.668720,0.670132
...,...,...,...,...,...
4,province,17,0.026609,0.026069,0.027148
4,province,28,0.016568,0.016140,0.016996
4,province,26,0.009090,0.008772,0.009408
4,province,19,0.009011,0.008694,0.009328


## Saving person_table

In [17]:
# Export is currently disabled.
# NOTE(review): the CSV target below is an absolute, machine-specific path;
# switch to a path relative to the project before re-enabling.
# table.to_csv(r'E:\after_session_03_22\results\person_table.csv')
# table.to_clipboard()