In [1]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [95]:
import scipy.stats as stats
import numpy as np
import pandas as pd
from sklearn import linear_model
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from scipy.stats import chi2_contingency

In [66]:
# Reading the dataset
df = pd.read_csv('../input/covid-19-case-surveillance/data.csv')

  df = pd.read_csv('../input/covid-19-case-surveillance/data.csv')


In [4]:
df.head()

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2020-12,MN,27.0,HENNEPIN,27053.0,18 to 49 years,Female,Unknown,Hispanic/Latino,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,No,
1,2022-02,MI,26.0,OAKLAND,26125.0,18 to 49 years,Female,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Unknown,
2,2022-01,CA,6.0,LOS ANGELES,6037.0,18 to 49 years,Male,Unknown,Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Unknown,No,Missing,Missing,
3,2022-01,NV,32.0,CLARK,32003.0,18 to 49 years,Male,White,Non-Hispanic/Latino,0.0,,Routine surveillance,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
4,2020-05,MA,25.0,MIDDLESEX,25017.0,65+ years,Female,White,Non-Hispanic/Latino,0.0,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19020962 entries, 0 to 19020961
Data columns (total 19 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   case_month                       object 
 1   res_state                        object 
 2   state_fips_code                  float64
 3   res_county                       object 
 4   county_fips_code                 float64
 5   age_group                        object 
 6   sex                              object 
 7   race                             object 
 8   ethnicity                        object 
 9   case_positive_specimen_interval  float64
 10  case_onset_interval              float64
 11  process                          object 
 12  exposure_yn                      object 
 13  current_status                   object 
 14  symptom_status                   object 
 15  hosp_yn                          object 
 16  icu_yn                           object 
 17  death_

In [6]:
print(f"Shape of data: {df.shape}")
print(f"Number of missing values in the data: {df.isnull().sum().sum()}")

Shape of data: (19020962, 19)
Number of missing values in the data: 52133780


# 1 - Test Choice: Logistics Regression Test 

**Justification**:
1. The Logistics Regression Test appropriate as we are dealing with categorical variables. First (death due to COVID-19) is dependent categorical (death or no death), Second (patient demographics such as age, gender, race, etc.) are multiple independent and categorical columns.

2. This test helps to determine whether there is a significant association between two categorical variables.

# 2 - The hypotheses

1. Null Hypothese: There is **no** association between the probability of death due to COVID-19 and patient demographics. 
1. Alternative Hypothesis: There is an association between the probability of death due to COVID-19 and patient demographics. 

In [20]:
# Making a copy of the dataset
data = df.copy()

In [21]:
# Dropping the unneeded columns and droping the Null values in the needed columns

columns_to_drop = ['case_month', 'res_state', 'res_county', 'process', 'exposure_yn', 'icu_yn', 'underlying_conditions_yn', 'state_fips_code', 'county_fips_code', 'case_positive_specimen_interval', 'case_onset_interval', 'current_status', 'symptom_status', 'hosp_yn']
data.drop(columns=columns_to_drop, inplace=True)

# Replacing 'Missing' and 'Unknown' with NaNs
data.replace(['Missing', 'Unknown','Other'], np.nan, inplace=True)

# Dropping rows with missing values in specific columns
columns_with_missing_values = ['death_yn', 'age_group', 'sex', 'race', 'ethnicity']
data.dropna(subset=columns_with_missing_values, inplace=True)


In [22]:
print(f"Shape of data: {data.shape}")

Shape of data: (3564207, 5)


In [24]:
data.head(10)

Unnamed: 0,age_group,sex,race,ethnicity,death_yn
4,65+ years,Female,White,Non-Hispanic/Latino,No
10,18 to 49 years,Female,Black,Non-Hispanic/Latino,No
12,18 to 49 years,Female,White,Hispanic/Latino,No
13,50 to 64 years,Female,White,Hispanic/Latino,No
16,0 - 17 years,Male,White,Non-Hispanic/Latino,No
26,50 to 64 years,Male,White,Non-Hispanic/Latino,No
34,18 to 49 years,Female,Black,Non-Hispanic/Latino,No
44,50 to 64 years,Female,White,Hispanic/Latino,No
46,0 - 17 years,Male,White,Non-Hispanic/Latino,No
52,65+ years,Female,White,Non-Hispanic/Latino,No


In [25]:
# Applying one hot code
data = pd.get_dummies(data, columns=['age_group', 'sex', 'race', 'ethnicity'], drop_first=True)

In [33]:
# Replacing all the values with 1 and 0
data['death_yn'] = data['death_yn'].map({'Yes': 1, 'No': 0})

In [41]:
# Making the data 1 and 0
data = data.applymap(lambda x: 1 if x else 0)

  data = data.applymap(lambda x: 1 if x else 0)


In [42]:
data.head()

Unnamed: 0,death_yn,age_group_18 to 49 years,age_group_50 to 64 years,age_group_65+ years,sex_Male,race_Asian,race_Black,race_Multiple/Other,race_Native Hawaiian/Other Pacific Islander,race_White,ethnicity_Non-Hispanic/Latino
4,0,0,0,1,0,0,0,0,0,1,1
10,0,1,0,0,0,0,1,0,0,0,1
12,0,1,0,0,0,0,0,0,0,1,0
13,0,0,1,0,0,0,0,0,0,1,0
16,0,0,0,0,1,0,0,0,0,1,1


In [43]:
# Splitting the data for the regression
X = data.drop(columns=['death_yn'])
y = data['death_yn']

In [45]:

logreg = LogisticRegression()
logreg.fit(X, y)

In [46]:
# Adding constant to the independent columns
X = sm.add_constant(X)

In [49]:
# applying the model 
model = sm.Logit(y, X)
result = model.fit()

         Current function value: 0.068812
         Iterations: 35




In [51]:
result.summary()

0,1,2,3
Dep. Variable:,death_yn,No. Observations:,3564207.0
Model:,Logit,Df Residuals:,3564196.0
Method:,MLE,Df Model:,10.0
Date:,"Thu, 23 May 2024",Pseudo R-squ.:,0.3059
Time:,11:46:03,Log-Likelihood:,-245260.0
converged:,False,LL-Null:,-353340.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-23.2487,276.642,-0.084,0.933,-565.457,518.959
age_group_18 to 49 years,17.3059,276.642,0.063,0.950,-524.902,559.514
age_group_50 to 64 years,20.0642,276.642,0.073,0.942,-522.144,562.272
age_group_65+ years,22.6268,276.642,0.082,0.935,-519.581,564.835
sex_Male,0.4694,0.008,59.039,0.000,0.454,0.485
race_Asian,0.6296,0.101,6.219,0.000,0.431,0.828
race_Black,-0.2092,0.099,-2.109,0.035,-0.404,-0.015
race_Multiple/Other,-1.5993,0.142,-11.302,0.000,-1.877,-1.322
race_Native Hawaiian/Other Pacific Islander,-12.1822,249.006,-0.049,0.961,-500.226,475.861


In [56]:
P_values = result.pvalues
print("P-values:\n", P_values)

P-values:
 const                                          9.330255e-01
age_group_18 to 49 years                       9.501193e-01
age_group_50 to 64 years                       9.421819e-01
age_group_65+ years                            9.348131e-01
sex_Male                                       0.000000e+00
race_Asian                                     4.988423e-10
race_Black                                     3.494668e-02
race_Multiple/Other                            1.288777e-29
race_Native Hawaiian/Other Pacific Islander    9.609803e-01
race_White                                     1.000775e-10
ethnicity_Non-Hispanic/Latino                  0.000000e+00
dtype: float64


In [58]:
alpha = 0.05
significant_vars = P_values[P_values < alpha]
print("Significant variables at alpha =", alpha, ":\n", significant_vars)

Significant variables at alpha = 0.05 :
 sex_Male                         0.000000e+00
race_Asian                       4.988423e-10
race_Black                       3.494668e-02
race_Multiple/Other              1.288777e-29
race_White                       1.000775e-10
ethnicity_Non-Hispanic/Latino    0.000000e+00
dtype: float64


# Result

The claim is **partially valid**. While some demographic factors (such as sex, certain races, and ethnicity) show a strong association with COVID-19 mortality, others (such as age and some race categories) do not show a significant association at the 0.05 significance level.

# **Claim 2**: 
There is a statistically significant association between patients' underlying medical conditions and the death rate among individuals diagnosed with COVID-19. 

# The Hypothese:

1. Null Hypothesis: there is *no* association between having other disease and desth with COVID-19

2. Alternative Hypothesis: there is an association between having other disease and desth with COVID-19



# The Hypothsis test: the Chi square test

**justification:**

1. this test is suitable when applying on two catgorical data such as the death (Yes ot No) status and the other diseases (Yes or No)


In [91]:
new_df = df.copy()

In [92]:
new_df['underlying_conditions_yn'].value_counts()

underlying_conditions_yn
Yes    687755
No      12613
Name: count, dtype: int64

In [93]:
new_df = new_df.drop(columns=[ 'age_group', 'sex', 'race', 'ethnicity','case_month', 'res_state', 'res_county', 'process', 'exposure_yn', 'icu_yn','state_fips_code', 'county_fips_code', 'case_positive_specimen_interval', 'case_onset_interval', 'current_status', 'symptom_status', 'hosp_yn'])

# Replacing 'Missing' and 'Unknown' with NaNs
df.replace(['Missing', 'Unknown','Other'], np.nan, inplace=True)

new_df = new_df.dropna(subset=['death_yn', 'underlying_conditions_yn'])

In [98]:
# Create a contingency table
contingency_table = pd.crosstab(df['death_yn'], df['underlying_conditions_yn'])

# Perform chi-square test of independence
result = chi2_contingency(contingency_table)

In [104]:
# P_Vlaue
print("p-value:", result.pvalue)

p-value: 9.865158227128456e-72


# ***Result***

**According to the p value(9.865158227128456e-72 < 0.05), the Null hypothesis is strongally rejected, there is no association between the patients with other diseaes and the death of COVID-19**