In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import numpy as np
from sklearn import preprocessing
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, Normalizer
import statsmodels.stats.api as sms
import math 
import scipy.stats as stats
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.proportion import proportion_effectsize
from statsmodels.stats.proportion import proportions_chisquare



df = pd.read_csv("summary_stats_one_encounter.csv")

# Fold every Race category listed below into the single
# 'Multi-racial or other race' level; all other values are left untouched.
df['Race'] = df['Race'].replace(dict.fromkeys(
    [
        'Declined',
        'American Indian/Alaska Native',
        'Unknown/Unspecified',
        'Native Hawaiian/Pacific Islander',
        'Other (Specify)',
        'Multiracial/Two or More Races',
    ],
    'Multi-racial or other race',
))


In [None]:
def two_proprotions_confint(success_a, size_a, success_b, size_b, significance = 0.05):
    """
    Wald confidence interval for the difference between two proportions.

    Given the number of successes and the number of trials in group A and
    group B, compute the difference ``B - A`` and its two-sided confidence
    interval. This is the unpooled normal-approximation interval, i.e. the
    interval R's ``prop.test`` reports when called with ``correct = FALSE``.

    Both return values are expressed in percentage points (proportion * 100)
    and rounded to one decimal place. Rounding is applied only to the final
    values: the interval is built around the unrounded difference so that the
    bounds are not shifted by the rounding of the point estimate.

    Parameters
    ----------
    success_a, success_b : int
        Number of successes in each group

    size_a, size_b : int
        Size, or number of observations in each group

    significance : float, default 0.05
        Often denoted as alpha. Governs the chance of a false positive.
        A significance level of 0.05 means that there is a 5% chance of
        a false positive. In other words, our confidence level is
        1 - 0.05 = 0.95

    Returns
    -------
    prop_diff : float
        Difference between the two proportions (B minus A), in percentage
        points, rounded to one decimal

    confint : 1d ndarray
        ``[lower, upper]`` bounds of the confidence interval for the
        difference, in percentage points, rounded to one decimal
    """
    prop_a = success_a / size_a
    prop_b = success_b / size_b

    # unpooled standard error of the difference of two independent proportions
    se = np.sqrt(prop_a * (1 - prop_a) / size_a + prop_b * (1 - prop_b) / size_b)

    # two-sided z critical value, e.g. ~1.96 for significance = 0.05
    z = stats.norm.ppf(1 - significance / 2)

    # standard formula for the confidence interval:
    # point-estimate +- z * standard-error, scaled to percentage points
    diff_pct = (prop_b - prop_a) * 100
    confint = np.round(diff_pct + np.array([-1, 1]) * z * se * 100, 1)
    prop_diff = round(diff_pct, 1)
    return prop_diff, confint

### Comparing the distribution of each variable by the outcome  (positive DX and negative DX)

In [None]:
#### Remove 0's and All

d = {}
variables = ["Gender","Race","Dementia","Alcohol","Stroke","TIA","IntraCranial Hemorrhage", "Severe Illness","Hearing impairment",
            "Visual impairment", "Antibiotic", "Opioid", "Anticholinergic", "Benzodiazepine", "In_Hosp_Mortality"]

for i in variables:

    # Frequency table of the feature against the outcome DX; margins=True
    # appends an 'All' row and column holding the totals.
    count = pd.crosstab(index=df[i], columns=df["DX"], margins=True)
    # Column percentages: each cell as a share of its column's 'All' total.
    percent = round(100 * count / count.loc['All'], 1).add_prefix('Percent ')
    d[i] = pd.concat([count, percent], axis=1, sort=False)

    # Cohen's h effect size per level: share of negative-DX rows in that level
    # versus share of positive-DX rows in that level. Label-based .loc lookups
    # are used so the result does not depend on column position.
    total_0 = count.loc['All', 0]
    total_1 = count.loc['All', 1]
    for row in d[i].index:
        cohen_h = proportion_effectsize(count.loc[row, 0] / total_0,
                                        count.loc[row, 1] / total_1)
        d[i].at[row, 'Effect Size'] = round(cohen_h, 3)

    # Combine each count with its percentage into one "count (percent)" cell
    for col, pct_col in ((0, 'Percent 0.0'), (1, 'Percent 1.0'), ('All', 'Percent All')):
        d[i][col] = [f'{int(n)} ({pct})' for n, pct in d[i][[col, pct_col]].values]

    # The 'All' totals row and the helper percent columns are no longer needed
    d[i] = d[i].drop(index='All').drop(columns=['Percent 1.0', 'Percent All', 'Percent 0.0'])

    # Drop the row labelled 0 when it comes first (presumably the "absent"
    # level of 0/1-coded features -- confirm against the data dictionary).
    if d[i].index[0] == 0:
        d[i] = d[i].drop(0)
    # Drop the 'N' level as well, except for In_Hosp_Mortality, which keeps it.
    if i != "In_Hosp_Mortality" and 'N' in d[i].index:
        d[i] = d[i].drop('N')


# Stack the per-variable tables; the variable name becomes the outer index level
categorical_df = pd.concat([d[i] for i in variables], keys = variables)

column_titles = ['All',0,1, 'Effect Size']
categorical_df = categorical_df.reindex(columns=column_titles)

categorical_df

### Numerical variables

In [None]:
from numpy.random import randn
from numpy.random import seed
from numpy import mean
from numpy import var
from math import sqrt

def cohend(d1, d2):
    """Cohen's d for two independent samples.

    Returns ``(mean(d1) - mean(d2)) / pooled_sd``, where the pooled standard
    deviation weights each sample's unbiased variance (``ddof=1``) by its
    degrees of freedom ``len(sample) - 1``.
    """
    size_1 = len(d1)
    size_2 = len(d2)

    # difference of the sample means (numerator of d)
    mean_gap = mean(d1) - mean(d2)

    # degrees-of-freedom weighted sum of the unbiased sample variances
    weighted_var = (size_1 - 1) * var(d1, ddof=1) + (size_2 - 1) * var(d2, ddof=1)
    pooled_sd = sqrt(weighted_var / (size_1 + size_2 - 2))

    return mean_gap / pooled_sd

In [None]:
# Get means/SD of positive DX and negative DX

import scipy.stats as st
numerical_df = pd.DataFrame(columns = ['All', 'Negative DX','Positive DX', 'Difference', 'P Value', 'SMD'])
variables = ['Age', 'Barthel', 'HR', 'RR', 'Temp F', 'BMI']
for i in variables:
    row = []

    # Whole cohort, then the negative-DX and positive-DX subsets.
    # (`overall` rather than `all`, so the builtin all() is not shadowed
    # for the rest of the kernel session.)
    overall = df[i]
    neg = df[df.DX == 0][i]
    pos = df[df.DX == 1][i]

    # "mean (SD)" cell for all data
    mean_all = np.mean(overall)
    std_all = np.std(overall)
    row.append(f'{round(mean_all ,1)} ({round(std_all,1)})')

    # "mean (SD)" cell for negative DX
    mean_neg = np.mean(neg)
    std_neg = np.std(neg)
    row.append(f'{round(mean_neg ,1)} ({round(std_neg,1)})')

    # "mean (SD)" cell for positive DX
    mean_pos = np.mean(pos)
    std_pos = np.std(pos)
    row.append(f'{round(mean_pos ,1)} ({round(std_pos,1)})')

    # Difference between the means (negative minus positive) with a pooled SD.
    # NOTE(review): np.std defaults to ddof=0 while cohend() uses ddof=1, so this
    # pooled SD is not the denominator of the SMD below -- confirm which is intended.
    mean_diff = mean_neg - mean_pos
    pooled_std = np.sqrt(((std_pos**2)*(len(pos)-1)+(std_neg**2)*(len(neg)-1))/ (len(pos)+len(neg)-2))
    row.append(f'{round(mean_diff, 1)} ({round(pooled_std,1)})')

    # Two-sample t-test p-value (positive vs negative DX)
    P_value = round(stats.ttest_ind(pos, neg)[1],3)
    row.append(f'{P_value}')

    # Standardized mean difference (Cohen's d, positive minus negative)
    cohen = cohend(pos,neg)
    row.append(round(cohen, 3))

    numerical_df.loc[i] = row


# Only the three "mean (SD)" columns and the SMD are kept for display
numerical_df = numerical_df.drop(columns = ['Difference', 'P Value'])

In [None]:
# Display the numerical-variable summary table ('Difference' and 'P Value' were dropped in the cell above).
numerical_df

In [None]:
import seaborn as sns
# Scatter of DX against Age with a fitted logistic regression curve.
# `age` holds the title Text object returned by set_title, which is what
# the cell displays as its last expression.
age = (
    sns.regplot(data=df, x='Age', y='DX', logistic=True)
    .set_title("Age Log Odds Linear Plot")
)
age

In [None]:
# Scatter of DX against Barthel with a fitted logistic regression curve.
# `barthel` holds the title Text object returned by set_title.
barthel = (
    sns.regplot(data=df, x='Barthel', y='DX', logistic=True)
    .set_title("Barthel Log Odds Linear Plot")
)
barthel

In [None]:
# Scatter of DX against HR with a fitted logistic regression curve.
# `HR` holds the title Text object returned by set_title.
HR = (
    sns.regplot(data=df, x='HR', y='DX', logistic=True)
    .set_title("HR Log Odds Linear Plot")
)
HR

In [None]:
# Scatter of DX against RR with a fitted logistic regression curve.
# `RR` holds the title Text object returned by set_title.
RR = (
    sns.regplot(data=df, x='RR', y='DX', logistic=True)
    .set_title("RR Log Odds Linear Plot")
)
RR

In [None]:
# Scatter of DX against Temp F with a fitted logistic regression curve.
# The title names the plotted column ('Temp F'); it previously read "RR",
# a copy-paste slip from the RR plot cell.
Temp = sns.regplot(x = 'Temp F', y = 'DX', data = df, logistic = True).set_title("Temp F Log Odds Linear Plot")
Temp