In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from causallib.datasets import load_nhefs
%matplotlib inline
from causallib.datasets import load_nhefs
from causallib.estimation import IPW
from causallib.evaluation import PropensityEvaluator
from sklearn.linear_model import LogisticRegression
import statsmodels.stats.multitest as smm

In [None]:
# Cohort to analyse; every later cell branches on this value.
run_on = "quest"
#run_on = "doc_child"
#run_on = "doc_adult"

# Fail fast on a typo: later cells use bare `else` branches, so an unknown
# value would otherwise be processed silently as one of the doc cohorts.
if run_on not in ("quest", "doc_child", "doc_adult"):
    raise ValueError(
        "run_on must be 'quest', 'doc_child' or 'doc_adult', got %r" % (run_on,)
    )

In [None]:
# Name of the odds-ratio results file, one per cohort selected by run_on.
if run_on == "quest":
    results_fname = "odds_ratio_from_quest_final.txt"
elif run_on == "doc_child":
    results_fname = "odds_ratio_from_doc_children_final.txt"
else:
    # doc_adult: previously this branch reused the children file name, so an
    # adult run overwrote the children results.
    results_fname = "odds_ratio_from_doc_adults_final.txt"

# Directory the results file is written to (joined as folder + results_fname).
folder = 'c:/corona_segal/'

In [None]:
# The questionnaire cohort has its own input matrix; both doc cohorts share one.
matrix_path = (
    'c:/corona_segal/matrix_for_odds_ratio_quest.txt'
    if run_on == "quest"
    else 'c:/corona_segal/matrix_for_odds_ratio_doc.txt'
)
# Input files are tab-separated.
test_matrix_for_ipw = pd.read_csv(matrix_path, sep="\t")

In [None]:
# Restrict the doc matrix to the requested age group (adults: age >= 18,
# children: age < 18). The quest run is left unfiltered.
# .copy() makes the filtered frame independent of the loaded one, so the
# column assignments in later cells do not hit pandas' SettingWithCopyWarning.
if run_on == "doc_adult":
    test_matrix_for_ipw = test_matrix_for_ipw[test_matrix_for_ipw['age'] >= 18].copy()
elif run_on == "doc_child":
    test_matrix_for_ipw = test_matrix_for_ipw[test_matrix_for_ipw['age'] < 18].copy()

In [None]:
# Parse the YYYYMMDD test date into a datetime column.
test_matrix_for_ipw['corona_test_date_t'] = pd.to_datetime(
    test_matrix_for_ipw['corona_test_date'],
    format='%Y%m%d',
)

# Earliest test date in the matrix; used as day zero.
MIN_DATE = test_matrix_for_ipw['corona_test_date_t'].min()

# Test date expressed as (fractional) days elapsed since MIN_DATE.
test_matrix_for_ipw['test_date_correction'] = (
    test_matrix_for_ipw['corona_test_date_t'].sub(MIN_DATE) / np.timedelta64(1, 'D')
)

In [None]:
# Empty results table: three string columns that identify a fit, followed by
# six float columns that hold its statistics.
odds_results3 = pd.concat(
    [
        pd.DataFrame(
            columns=['Run_type', 'Signal_name', 'Signal_name_article'],
            dtype=str,
        ),
        pd.DataFrame(
            columns=['Odds Ratio', '5%', '95%', 'count_with_signal', 'p_value', 'FDR'],
            dtype=float,
        ),
    ],
    axis=1,
)

In [None]:
# Keep only rows whose corona_ind is >= 0 (a missing value compares False and
# is dropped too). Presumably these are the rows with a known test result --
# TODO confirm the encoding of corona_ind.
odds_ratio_matrix = test_matrix_for_ipw.loc[test_matrix_for_ipw['corona_ind'] >= 0]

In [None]:
# Symptom indicator columns tested one at a time by the odds-ratio functions.
# The questionnaire matrix and the doc matrix use different column names.
if run_on == "quest":
    symp_cols = [
        'chom_375_379',
        'chom_38_40',
        'chom_up_to_374',
        'symp_ayefut',
        'symp_bchilot_akahot',
        'symp_bilbul',
        'symp_godesh_nazelet',
        'symp_keev_garon',
        'symp_keev_rosh',
        'symp_keev_shririm',
        'symp_kotzer_neshima',
        'symp_none',
        'symp_other',
        'symp_shilshul',
        'symp_shiul',
        'symp_shiul_leicha',
        'symp_shiul_yavesh',
        'symp_taam_reach',
        'symp_zmarmoret',
    ]
else:
    # Spellings below must match the column headers of the doc matrix exactly.
    symp_cols = [
        'Abdominal pain',
        'Arthralgia',
        'Chest Pain or discomfort',
        'Conjunctivitis',
        'Cough',
        'Diarrhea',
        'Disturbance Of Skin Sensation',
        'Disturbances Of Sensation Of Smell And Taste',
        'Dizziness',
        'Dyspnea and or Shortness of breath',
        'Emotoional Disturbance',
        'Fatigue',
        'Fever',
        'General symptoms ( Amnesia Chills Generalized pain Hypothermia)',
        'Headache',
        'Hearing Loss',
        'Lympadenopathy',
        'Myalgia',
        'Nausea and or vomiting',
        'Neuralgia',
        'Palpitation',
        'Rash',
        'Runny nose and or nasal congestion',
        'Sleep disturbance',
        'Sore throat',
        'Speech disturbance',
        'Syncope',
        'Tachycardia',
        'Voice Disturbance',
        'Weight loss',
    ]

In [None]:
def run_odds_no_weight(name, my_list, odds_results3):
    """Fit one unweighted logistic regression per symptom and collect odds ratios.

    For every column in the module-level ``symp_cols`` a model
    ``corona_ind ~ const + symptom + my_list`` is fitted with ``sm.Logit`` on
    the module-level ``odds_ratio_matrix``. Symptoms whose sum over the rows
    with ``corona_ind == 1`` is not positive are skipped.

    Parameters
    ----------
    name : str
        Label stored in the ``Run_type`` column of every produced row.
    my_list : list of str
        Extra covariate columns added to each model (may be empty).
    odds_results3 : pandas.DataFrame
        Results collected so far; it is not modified in place.

    Returns
    -------
    pandas.DataFrame
        ``odds_results3`` with one appended row per fitted symptom and a
        fresh 0..n-1 index. Each new row holds the exponentiated coefficient
        (``Odds Ratio``), the exponentiated bounds returned by ``conf_int()``
        (``5%`` / ``95%``), the symptom's column sum over all rows
        (``count_with_signal``) and the symptom's p-value.

    Notes
    -----
    ``conf_int()`` is called with its defaults; statsmodels documents the
    default as alpha=0.05, so ``5%`` / ``95%`` are column labels kept for
    compatibility with the results table, not the actual percentiles.
    """
    outcome = odds_ratio_matrix['corona_ind']
    new_rows = []
    for curr_signal in symp_cols:
        print(curr_signal)
        # Skip symptoms that no corona_ind == 1 row reports.
        count_signal = odds_ratio_matrix.loc[outcome == 1, curr_signal].sum()
        if not count_signal > 0:
            continue

        design = sm.add_constant(odds_ratio_matrix[[curr_signal] + my_list])
        logit_res = sm.Logit(outcome, design).fit()

        # Select the symptom's row by label rather than by position, so the
        # result does not depend on where add_constant places the constant.
        lower, upper = logit_res.conf_int().loc[curr_signal]
        new_rows.append({
            '5%': np.exp(lower),
            '95%': np.exp(upper),
            'Odds Ratio': np.exp(logit_res.params[curr_signal]),
            'Signal_name': curr_signal,
            'Run_type': name,
            'count_with_signal': float(odds_ratio_matrix[curr_signal].sum()),
            'p_value': logit_res.pvalues[curr_signal],
        })

    if not new_rows:
        return odds_results3
    # DataFrame.append was removed in pandas 2.0; concatenate all rows once.
    return pd.concat([odds_results3, pd.DataFrame(new_rows)], ignore_index=True)


In [None]:
# Start from an empty table (columns kept) so re-running this cell does not
# accumulate duplicate rows.
odds_results3 = odds_results3.iloc[0:0]

# Unweighted runs, each adding covariates to the previous one.
for run_name, covariates in [
    ('Basic', []),
    ('Age_gender', ['age', 'gender']),
    ('Age_gender_disease', ['age', 'gender', 'id_with_disease']),
    ('Age_gender_disease_time', ['age', 'gender', 'id_with_disease', 'test_date_correction']),
]:
    odds_results3 = run_odds_no_weight(run_name, covariates, odds_results3)

In [None]:
# FDR correction of the p-values, applied separately within each run type.
list_of_runs = odds_results3['Run_type'].unique()
for run_type in list_of_runs:
    print(run_type)
    in_run = odds_results3['Run_type'] == run_type
    vec_p_value = odds_results3.loc[in_run, 'p_value'].astype(float)
    # Only the module alias `smm` is imported, so the function must be called
    # through it (a bare `fdrcorrection` raises NameError on a fresh kernel).
    # Element [1] of the returned tuple holds the corrected p-values.
    fdr = smm.fdrcorrection(vec_p_value)[1]
    odds_results3.loc[in_run, 'FDR'] = fdr

## Write to file

In [None]:
# Save the table collected so far; the row index is not written.
odds_results3.to_csv(f"{folder}{results_fname}", index=False)

## IPW

In [None]:
# Target of the propensity model.
y_vec = test_matrix_for_ipw['corona_test_ind']

# Columns excluded from the propensity-model features (the target itself plus
# the columns listed here); every remaining column is used as a feature.
non_feature_cols = [
    'id_with_disease',
    'ipw',
    'symp_none',
    'corona_test_ind',
    'numerator',
    'test_date_correction',
    'corona_ind',
    'corona_test_date',
    'corona_test_month',
    'corona_test_date_t',
    'recover_ind',
    'recover_diff_date',
]
x_mat = test_matrix_for_ipw[test_matrix_for_ipw.columns.difference(non_feature_cols)]

In [None]:
# Display the feature columns that remain in x_mat (inspection only; nothing is assigned).
x_mat.columns

In [None]:
# Fit causallib's IPW model, with a liblinear logistic regression as the
# learner, using x_mat as features and y_vec (corona_test_ind) as the target.
learner = LogisticRegression(solver="liblinear")
ipw = IPW(learner)
ipw.fit(x_mat, y_vec)
# Compute the weights on the same data the model was fitted on and store
# them as the 'ipw' column of the full matrix.
ipw_vec = ipw.compute_weights(x_mat, y_vec)
test_matrix_for_ipw['ipw'] = ipw_vec

In [None]:
# Import under an alias so the `metrics` dict below does not shadow the
# sklearn module while this cell is still using it.
from sklearn import metrics as sk_metrics

# Diagnostic plots requested from the evaluator.
plots = [
    "roc_curve",
    "pr_curve",
    "weight_distribution",
    "calibration",
    "covariate_balance_love",
    "covariate_balance_slope",
]

# Scoring functions passed to the evaluator, keyed by display name.
metrics = {
    "roc_auc": sk_metrics.roc_auc_score,
    "avg_precision": sk_metrics.average_precision_score,
}

evaluator = PropensityEvaluator(ipw)
# y_vec is passed as both the second and the third positional argument.
results = evaluator.evaluate_cv(
    x_mat,
    y_vec,
    y_vec,
    plots=plots,
    metrics_to_evaluate=metrics,
)

In [None]:
# Same row filter as odds_ratio_matrix (corona_ind >= 0), taken after the
# 'ipw' column was added so the weights travel with the rows.
odds_ratio_matrix_with_ipw = test_matrix_for_ipw.loc[test_matrix_for_ipw['corona_ind'] >= 0]

In [None]:
# Number of rows that enter the weighted models.
n_row = len(odds_ratio_matrix_with_ipw)

In [None]:
# Normalisation factor: sum of the 'ipw' column divided by the number of rows.
factor = odds_ratio_matrix_with_ipw['ipw'].sum()/n_row

In [None]:
# Rescale the IPW weights by `factor`; these are the weights passed to the
# weighted GLM fits.
test_vec_w3 = odds_ratio_matrix_with_ipw['ipw'].div(factor)

In [None]:
# Two-column (successes, failures) response for the binomial GLM: each row
# contributes corona_ind as its success count and 1 - corona_ind as its
# failure count.
test_vec_y = odds_ratio_matrix_with_ipw['corona_ind']
train_data = pd.DataFrame({
    'Successes': test_vec_y,
    'Failures': 1 - test_vec_y,
})

In [None]:
def run_odds_with_weight(name, my_list, odds_results3):
    """Fit one IPW-weighted binomial GLM per symptom and collect odds ratios.

    For every column in the module-level ``symp_cols`` a binomial GLM
    ``(Successes, Failures) ~ const + symptom + my_list`` is fitted. The
    design matrix comes from the module-level ``odds_ratio_matrix_with_ipw``,
    the response from ``train_data`` and the ``var_weights`` from
    ``test_vec_w3``.

    Parameters
    ----------
    name : str
        Label stored in the ``Run_type`` column of every produced row.
    my_list : list of str
        Extra covariate columns added to each model (may be empty).
    odds_results3 : pandas.DataFrame
        Results collected so far; it is not modified in place.

    Returns
    -------
    pandas.DataFrame
        ``odds_results3`` with one appended row per symptom and a fresh
        0..n-1 index. Each new row holds the exponentiated coefficient
        (``Odds Ratio``), the exponentiated bounds returned by ``conf_int()``
        (``5%`` / ``95%``), the symptom's column sum (``count_with_signal``)
        and the symptom's p-value.

    Notes
    -----
    ``conf_int()`` is called with its defaults; statsmodels documents the
    default as alpha=0.05, so ``5%`` / ``95%`` are column labels kept for
    compatibility with the results table, not the actual percentiles.

    NOTE(review): unlike ``run_odds_no_weight`` there is no skip for symptoms
    that no positive row reports -- confirm whether that is intentional.
    """
    new_rows = []
    for curr_signal in symp_cols:
        design = sm.add_constant(odds_ratio_matrix_with_ipw[[curr_signal] + my_list])

        # Binomial() uses the logit link by default. The original passed the
        # link *class* through the lowercase alias `links.logit`, which is
        # deprecated in statsmodels (TODO confirm it still exists in the
        # installed release).
        glm_res = sm.GLM(
            train_data[['Successes', 'Failures']],
            design,
            family=sm.families.Binomial(),
            var_weights=test_vec_w3,
        ).fit()

        # Select the symptom's row by label rather than by position, so the
        # result does not depend on where add_constant places the constant.
        lower, upper = glm_res.conf_int().loc[curr_signal]
        new_rows.append({
            '5%': np.exp(lower),
            '95%': np.exp(upper),
            'Odds Ratio': np.exp(glm_res.params[curr_signal]),
            'Signal_name': curr_signal,
            'Run_type': name,
            'count_with_signal': float(odds_ratio_matrix_with_ipw[curr_signal].sum()),
            'p_value': glm_res.pvalues[curr_signal],
        })

    if not new_rows:
        return odds_results3
    # DataFrame.append was removed in pandas 2.0; concatenate all rows once.
    return pd.concat([odds_results3, pd.DataFrame(new_rows)], ignore_index=True)

In [None]:
# IPW-weighted counterparts of the unweighted runs; rows are appended to the
# table that already holds the unweighted results.
for run_name, covariates in [
    ('ipw_Basic', []),
    ('ipw_Age_gender', ['age', 'gender']),
    ('ipw_Age_gender_disease', ['age', 'gender', 'id_with_disease']),
    ('ipw_Age_gender_disease_time', ['age', 'gender', 'id_with_disease', 'test_date_correction']),
]:
    odds_results3 = run_odds_with_weight(run_name, covariates, odds_results3)

## FDR

In [None]:
 # Display the run types collected so far (inspection only; nothing is assigned).
 odds_results3.Run_type.unique()

In [None]:
# FDR correction of the p-values, applied separately within each run type.
# This recomputes the 'FDR' column for every run type present in the table,
# unweighted and IPW-weighted alike.
list_of_runs = odds_results3['Run_type'].unique()
for run_type in list_of_runs:
    print(run_type)
    in_run = odds_results3['Run_type'] == run_type
    vec_p_value = odds_results3.loc[in_run, 'p_value'].astype(float)
    # Only the module alias `smm` is imported, so the function must be called
    # through it (a bare `fdrcorrection` raises NameError on a fresh kernel).
    # Element [1] of the returned tuple holds the corrected p-values.
    fdr = smm.fdrcorrection(vec_p_value)[1]
    odds_results3.loc[in_run, 'FDR'] = fdr

## rename

In [None]:
# Questionnaire column name -> label used in the article.
# NOTE(review): the 'Fever ' label carries a trailing space; it is kept as-is
# because downstream consumers may match on it -- confirm before trimming.
rename_dict = dict(
    chom_38_40='Fever ',
    symp_taam_reach='Loss of taste or smell',
    symp_bilbul='Confusion',
    symp_shilshul='Diarrhea',
    symp_zmarmoret='Chills',
    symp_ayefut='Fatigue',
    symp_bchilot_akahot='Nausea or vomiting',
    symp_godesh_nazelet='Runny nose',
    symp_keev_garon='Sore throat',
    symp_keev_rosh='Headache',
    symp_keev_shririm='Muscle pain',
    symp_kotzer_neshima='Shortness of breath',
    symp_other='Other symptom',
    symp_shiul_leicha='Moist cough',
    symp_shiul_yavesh='Dry cough',
    symp_shiul='Cough',
    chom_375_379='Body temperature 37.5-38',
    chom_up_to_374='Body temperature under 37.5',
    symp_none='No symptoms',
)
   

In [None]:
# Fill 'Signal_name_article' with the article label of each signal.
# rename_dict only holds the questionnaire column names, so indexing it with
# the doc_child / doc_adult symptom names raised KeyError; names without an
# entry now fall back to the signal name itself.
signal_names = odds_results3['Signal_name']
odds_results3['Signal_name_article'] = signal_names.map(rename_dict).fillna(signal_names)


In [None]:
# Write the final table (unweighted and IPW runs, FDR and article labels) to
# the same path used by the earlier write; the row index is not written.
odds_results3.to_csv(f"{folder}{results_fname}", index=False)