# Proportions of the decision tree for PCR testing

In [None]:
import seaborn as sns
import statsmodels.api as sm
import pandas as pd
import numpy as np
import seaborn as sns
from IPython.display import HTML
from sklearn import tree
import plotly.express as px
import matplotlib.pyplot as plt
import scipy.stats as stats
from joblib import load
from tools import enrich_survey

In [None]:
# Load the survey export; the path is resolved relative to the current
# working directory.
merged = pd.read_csv('data/shuffled.csv')

In [None]:
# Switch for the inverse-probability-of-testing weights. When False, the
# `if not WEIGHTING` cell resets every `sample_weight` to 1 and the output
# CSV files are written without the "_correct" suffix.
WEIGHTING = False

In [None]:
# Pass the raw frame through the project helper `enrich_survey` (imported
# from `tools`) and keep its return value.
# NOTE(review): presumably this derives columns used further down
# (e.g. `binned_age`, `pcr_positive`) -- confirm against tools.py.
merged = merged.pipe(enrich_survey)

In [None]:
# Symptom columns that the preprocessing cell cleans into 0/1 integer flags.
# 'sensoriel' is (re)computed there from 'anosmia' and 'ageusia' just before
# this list is used, so it is safe to list it here.
symptoms = [
    # general
    'tiredness', 'fever', 'shivers', 'cough', 'breathlessness', 'aches',
    # chest
    'chest_opression', 'chest_pain',
    # digestive
    'diarrhea', 'vomiting',
    # smell / taste
    'sensoriel', 'anosmia', 'ageusia',
    'anorexia',
    # skin
    'rash', 'frostbites',
    'conjunctivitis',
    'other_sympt',
]

In [None]:
# Preprocess
# A missing PCR result is encoded with the sentinel -1.
na_labels = {'pcr_results': -1}
main = merged.fillna(na_labels)

# Map the 'Oui'/'Non' answers to 1/0 across the whole frame.
main = main.replace({'Non': 0, 'Oui': 1})

# 'sensoriel' is 1 as soon as anosmia OR ageusia is reported (row-wise max).
main['sensoriel'] = main[['anosmia', 'ageusia']].max(axis=1)

# Unanswered symptoms count as absent; store every symptom as an int flag.
main[symptoms] = main[symptoms].fillna(0).astype('int')

# Groupings
# "Any of" composites: the flag is set when at least one member is set.
for _composite, _members in (
    ('chest', ['chest_pain', 'chest_opression']),
    ('cutan', ['rash', 'frostbites']),
    ('digest', ['vomiting', 'diarrhea']),
):
    main[_composite] = main[_members].max(axis=1)

# "All of" composites: the flag is set only when every member is set
# (row-wise min). 'breath_thorac' relies on 'chest' computed just above.
for _composite, _members in (
    ('breath_thorac', ['chest', 'breathlessness']),
    ('fever_cough', ['fever', 'cough']),
):
    main[_composite] = main[_members].min(axis=1)

# Remove duplicates, keep last events (currently disabled)
# main = main.sort_values('start_time').groupby('patient_id').agg('last').reset_index()

In [None]:
from collections import defaultdict

from sklearn.linear_model import LogisticRegression

# --- Features of the "was this patient tested?" propensity model -----------
# SYMPTOM_DICT maps a feature name to the raw columns it aggregates: single
# symptoms map to themselves, composites to several columns.
SYMPTOM_DICT = {}
for s in ['tiredness', 'fever', 'cough', 'breathlessness', 'aches',
          'anorexia', 'anosmia', 'ageusia', 'headache',
          # 'upper_respiratory' is currently disabled
          'conjunctivitis']:
    SYMPTOM_DICT[s] = [s]
SYMPTOM_DICT['cutaneous'] = ['rash', 'frostbites']
SYMPTOM_DICT['digestive'] = ['diarrhea', 'vomiting', 'abdo_pain']
SYMPTOM_DICT['cardiopulmonary'] = ['breathlessness', 'chest_opression', 'chest_pain']

# Each feature becomes a boolean "any of its source columns is set". Note
# that for single symptoms this overwrites the column of the same name.
for k, v in SYMPTOM_DICT.items():
    main[k] = np.any(main[v], axis=1)

# Single source of truth: the feature order is the insertion order of
# SYMPTOM_DICT (tiredness ... conjunctivitis, cutaneous, digestive,
# cardiopulmonary).
SYMPTOMS = list(SYMPTOM_DICT)

SEX = ['male', 'female', 'undetermined']
TOBACCO = ['smoker_current', 'no_smoker']
COMORBIDITIES = ['no_comorbidity', 'any_comorbidity', 'respiratory', 'cardio-vascular', 'diabetes', 'obesity']
HOSPITALIZED = ['hospitalized', 'non_hospitalized']
INCLUSION_REASONS = ['samu', 'urgence']  # defined but not part of X_weight
AGE = main['binned_age'].cat.categories.tolist()
X_weight = main[SEX + TOBACCO + COMORBIDITIES + AGE + SYMPTOMS + HOSPITALIZED]

# One boolean indicator, used both as the model target and as the row mask
# below. The previous code masked with `merged['test_done']` on one side and
# the raw `main['test_done']` on the other, which only works if that column
# is already boolean in both frames.
y_weight = main['test_done'].astype(bool)

# Very large C => almost no regularisation.
lr = LogisticRegression(C=10000)
lr.fit(X_weight, y_weight)

# Estimated probability of having been tested.
main['p_test'] = lr.predict_proba(X_weight)[:, 1]

# Inverse-probability weights: 1/p for tested rows, 1/(1-p) for untested
# rows. Built in one vectorised step so the column is float from the start
# (no int -> float upcast through masked assignment).
main['sample_weight'] = np.where(
    y_weight,
    1 / main['p_test'],
    1 / (1 - main['p_test']),
)

# Rescale so that the weights of the tested rows average to 1.
_tested_weights = main.loc[y_weight, 'sample_weight']
main['sample_weight'] /= _tested_weights.sum() / len(_tested_weights)

In [None]:
# Unweighted analysis: discard the weights computed in the previous cell so
# that every row counts exactly once.
if not WEIGHTING:
    main['sample_weight'] = 1

# Descriptive statistics

In [None]:
# Assign every row to one leaf of the decision tree, evaluated top-down:
#   1. sensoriel == 1      (anosmia or ageusia reported: row-wise max)
#   2. fever_cough == 1    (fever and cough both reported: row-wise min)
#   3. breath_thorac == 1  (breathlessness together with chest pain/oppression)
#   4. everything else
# The column is created with object dtype so that the string labels can be
# written without upcasting a float column; rows matching no rule stay NaN.
main['group'] = pd.Series(np.nan, index=main.index, dtype=object)

_no_sensory = main.sensoriel == 0
_no_fever_cough = main.fever_cough == 0

main.loc[main.sensoriel == 1, 'group'] = "Anosmia and Agueusia"
main.loc[_no_sensory & (main.fever_cough == 1), 'group'] = "No anosmia and agueusia - Fever and cough"
main.loc[_no_sensory & _no_fever_cough & (main.breath_thorac == 1), 'group'] = "No anosmia and agueusia - no fever and cough - breathlessness with chest pain or oppression"
main.loc[_no_sensory & _no_fever_cough & (main.breath_thorac == 0), 'group'] = "Others"

# Distinct labels, in order of first appearance; iterated by the helpers below.
groups = main.group.unique()

In [None]:
def compute_odd_ratios(main, target="pcr_results"):
    """Fit one weighted logistic regression per group and format its odds ratio.

    For every label in the module-level ``groups``, the indicator "row belongs
    to this group" (plus an intercept) is regressed on ``target`` with a
    binomial GLM, using ``sample_weight`` as variance weights. Rows whose
    target is NaN or equal to the ``-1`` sentinel are excluded from the fit.

    Parameters
    ----------
    main : pandas.DataFrame
        Must contain the ``target``, ``group`` and ``sample_weight`` columns.
        The frame is left untouched (earlier versions overwrote the outcome
        column and added ``constant`` plus one column per group label).
    target : str
        Name of the outcome column. The ``-1`` sentinel is now stripped from
        this column rather than from a hard-coded ``pcr_results``.

    Returns
    -------
    dict
        ``{group label: "OR [low-high]"}`` where ``OR = exp(coef)`` and the
        bounds are ``exp(coef - se)`` / ``exp(coef + se)``: one standard error
        either side of the estimate, which is narrower than a 95% interval.
    """
    # Outcome with the "no result" sentinel turned into a proper missing value.
    outcome = main[target].replace(-1, np.nan)
    observed = outcome.notna()

    y = outcome.loc[observed].astype(float)
    weights = main.loc[observed, 'sample_weight']
    membership = main.loc[observed, 'group']

    odd_ratios = {}

    for k in groups:
        # Design matrix: group indicator first, intercept second.
        X = pd.DataFrame(
            {k: (membership == k).astype(float), 'constant': 1.0},
            index=y.index,
        )

        logit_mod = sm.GLM(y, X, family=sm.families.Binomial(), var_weights=weights)
        logit_res = logit_mod.fit()
        mean = logit_res.params[k]
        se = logit_res.bse[k]

        # Coefficients live on the log-odds scale; exponentiate to get ratios.
        low = np.exp(mean - se).round(2)
        mid = np.exp(mean).round(2)
        high = np.exp(mean + se).round(2)
        odd_ratios[k] = f'{mid:.2f} [{low:.2f}-{high:.2f}]'

    return odd_ratios

In [None]:
def create_grouped(main, cols):
    """Build the per-group summary table (weighted counts, PCR+ rate, odds ratio).

    Parameters
    ----------
    main : pandas.DataFrame
        Must contain ``group``, ``sample_weight``, ``pcr_result`` (``-1`` =
        not tested) and ``pcr_positive``, plus whatever
        ``compute_odd_ratios`` needs.
    cols : list of str
        Columns to return, in order. Available columns: ``group``,
        ``patient_count_tested``, ``pcr+``, ``pcr+_count``,
        ``patient_%_tested``, ``patient_count_all``, ``patient_%_all``,
        ``odds_ratio``.

    Returns
    -------
    pandas.DataFrame
        One row per group that has at least one tested patient. Counts are
        sums of ``sample_weight``; ``pcr+`` is the weighted share of positive
        results among tested patients; ``odds_ratio`` is the formatted string
        produced by ``compute_odd_ratios``.
    """
    # Compute on tested patients.
    # NOTE(review): this filters on `pcr_result`, whereas compute_odd_ratios
    # defaults to `pcr_results` -- confirm that both columns exist upstream.
    main_tested = main[main.pcr_result != -1]
    tested_weight = main_tested.groupby(['group'])['sample_weight'].sum()

    # A group without any positive result is absent from the groupby output;
    # reindex so it reports 0 positives instead of NaN (NaN would break the
    # integer cast done by the caller).
    positive_weight = (
        main_tested.loc[main_tested['pcr_positive'] == 1]
        .groupby(['group'])['sample_weight']
        .sum()
        .reindex(tested_weight.index, fill_value=0)
    )

    grouped = pd.DataFrame({
        'patient_count_tested': tested_weight,
        'pcr+': positive_weight / tested_weight,
        'pcr+_count': positive_weight,
    })

    # Share of each group among tested patients.
    grouped['patient_%_tested'] = grouped['patient_count_tested'] / grouped['patient_count_tested'].sum()

    # Same figures over the whole cohort (aligned on the group index).
    grouped['patient_count_all'] = main.groupby(['group'])['sample_weight'].sum()
    grouped['patient_%_all'] = grouped['patient_count_all'] / grouped['patient_count_all'].sum()

    # Attach the formatted odds ratios; groups without an entry stay NaN.
    odd_ratios = compute_odd_ratios(main)
    grouped = grouped.reset_index()
    grouped['odds_ratio'] = grouped['group'].map(odd_ratios)

    return grouped[cols]

In [None]:
# --- Figure 4, table 1: per-group counts, PCR+ rate and odds ratio ----------
cols_ = ['group', 'patient_count_all', 'patient_%_all', 'patient_count_tested', 'patient_%_tested', 'pcr+_count', 'pcr+', 'odds_ratio']
grouped = create_grouped(main, cols_)

# Weighted counts can be fractional; report them as whole patients.
c = ['patient_count_all', 'patient_count_tested', 'pcr+_count']
grouped[c] = grouped[c].round(0).astype(int)

# Cohort totals, taken before the counts are turned into display strings.
all_cohort = grouped.patient_count_all.sum()
tested_cohort = grouped.patient_count_tested.sum()
pcr_cohort = grouped['pcr+_count'].sum()

# Turn every count into "count (share%)". Each column is rebuilt in one go:
# writing strings cell by cell into an integer column relies on an implicit
# dtype upcast that pandas has deprecated.
for _count_col, _share_col in zip(c, ['patient_%_all', 'patient_%_tested', 'pcr+']):
    grouped[_count_col] = [
        f"{_count} ({_share * 100:.1f}%)"
        for _count, _share in zip(grouped[_count_col], grouped[_share_col])
    ]

grouped = grouped[['group'] + c + ['odds_ratio']]

grouped.columns = pd.Index(['Group', f'Count in whole cohort (% of whole cohort, N={all_cohort})', f'Count in tested cohort (% of tested cohort, N={tested_cohort})', 'Count of PCR+ (% PCR+ among tested)', 'Odds ratio for PCR+'])

grouped.to_csv(f'output/fig4_counts{"_correct" if WEIGHTING else ""}.csv')

# --- Figure 4, table 2: symptom prevalence inside the "Others" leaf ---------
# NOTE(review): filters on `pcr_result`, while the NaN sentinel is filled on
# `pcr_results` during preprocessing -- confirm that both columns exist.
none = main[(main.pcr_result != -1) & (main.group == "Others")]

df = none[symptoms + ['chest', 'cutan', 'digest', 'breath_thorac']].astype('float').copy()

# Weight every flag by its row's sample weight (broadcast over the columns).
df = df * none['sample_weight'].values[:, None]

# Weighted prevalence of each symptom, as a percentage, most frequent first.
other_symptoms = df.sum(axis=0) / none['sample_weight'].sum()
other_symptoms = (other_symptoms.sort_values(ascending=False) * 100).round(1)

other_symptoms.to_csv(f'output/fig4_other_symptoms{"_correct" if WEIGHTING else ""}.csv')

In [None]:
# Show the formatted summary table as the output of this cell.
grouped