**Author:** Boris Kundu

**Problem Statement:** Analyze medical data with statistical tests to identify which variables are significantly associated with death and are therefore potential causes of it.

**Dataset:** patient_data.csv

In [1]:
# Import packages
import pandas as pd
from scipy import stats
import numpy as np

In [4]:
# Load the patient records from the CSV file (path is relative to the
# working directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='patient_data.csv')
df.head(n=5)

Unnamed: 0,death,Na+,DBP,PLT,ivabradine,MRA
0,0,136.01,126.01,196.01,1,0
1,0,147.01,108.01,245.01,0,1
2,0,133.01,109.01,219.01,0,1
3,0,150.01,114.01,294.01,1,0
4,0,151.01,95.01,293.01,0,1


In [6]:
# Check data model: list each column with its non-null count and dtype.
# The dtypes matter below: perform_tests uses them to split the features
# into categorical and numeric groups.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   death       10 non-null     int64  
 1   Na+         10 non-null     float64
 2   DBP         10 non-null     float64
 3   PLT         10 non-null     float64
 4   ivabradine  10 non-null     int64  
 5   MRA         10 non-null     int64  
dtypes: float64(3), int64(3)
memory usage: 608.0 bytes


In [10]:
# Perform tests
def perform_tests(data, target='death', alpha=0.05):
    """Test every feature in ``data`` for association with a binary outcome.

    Features are split by dtype: floating-point columns are treated as
    numeric, every other column (integers of any width, booleans, strings,
    categoricals) is treated as categorical.

    * Categorical features: chi-squared test of independence between the
      feature and ``target``, computed on their contingency table.
    * Numeric features: Shapiro-Wilk normality test within each outcome
      group. If both p-values exceed ``alpha`` an unpaired t-test with
      unequal variances (Welch) is run; otherwise a Mann-Whitney U test.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per observation. Must contain the ``target`` column, coded
        as 0 / 1 (rows with any other value are ignored by the numeric
        tests).
    target : str, default 'death'
        Name of the binary outcome column.
    alpha : float, default 0.05
        Significance level used to decide whether a group is normally
        distributed according to Shapiro-Wilk.

    Returns
    -------
    dict
        Keys ``'mann_whitney'``, ``'ttest'``, ``'chi_square'`` and
        ``'shapiro_wilk'``. Each value is a list of ``(feature, p_value)``
        tuples, except ``'shapiro_wilk'`` whose entries are
        ``(feature, (p_value_group_0, p_value_group_1))``. All p-values are
        plain floats rounded to 4 decimals.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    if target not in data.columns:
        raise KeyError(f"target column {target!r} not found in data")

    # Separate categorical and numeric features.
    # Checking for "is float" rather than "== np.int64" keeps int32, nullable
    # integer, boolean and string columns out of the numeric tests, where
    # they would either crash or produce meaningless results.
    cat_feat = []
    num_feat = []
    for feature in data.columns:
        if feature == target:
            continue
        if pd.api.types.is_float_dtype(data[feature]):
            num_feat.append(feature)
        else:
            cat_feat.append(feature)

    # Initializing test results
    chi_square = []
    shapiro_wilk = []
    ttest = []
    mann_whitney = []

    # Chi-squared test of independence between each categorical feature and
    # the outcome.
    for cat in cat_feat:
        con_table = pd.crosstab(data[target], data[cat])
        _, p_value, _, _ = stats.chi2_contingency(con_table)
        chi_square.append((cat, round(float(p_value), 4)))

    # Split data by outcome
    group_0 = data[data[target] == 0]
    group_1 = data[data[target] == 1]

    # Tests on numeric features
    for num in num_feat:
        sample_0 = group_0[num]
        sample_1 = group_1[num]

        # Shapiro-Wilk: the null hypothesis is that the sample is normal
        normal_p_0 = stats.shapiro(sample_0).pvalue
        normal_p_1 = stats.shapiro(sample_1).pvalue
        shapiro_wilk.append(
            (num, (round(float(normal_p_0), 4), round(float(normal_p_1), 4)))
        )

        if normal_p_0 > alpha and normal_p_1 > alpha:
            # Normality not rejected in either group: unpaired Welch t-test
            ttest_res = stats.ttest_ind(sample_0, sample_1, equal_var=False)
            ttest.append((num, round(float(ttest_res.pvalue), 4)))
        else:
            # At least one group is not normal: Mann-Whitney U test
            mannwhitneyu_res = stats.mannwhitneyu(sample_0, sample_1)
            mann_whitney.append((num, round(float(mannwhitneyu_res.pvalue), 4)))

    return {
        'mann_whitney': mann_whitney,
        'ttest': ttest,
        'chi_square': chi_square,
        'shapiro_wilk': shapiro_wilk
    }

In [11]:
# Run the statistical tests on the patient data and display the
# resulting p-values, grouped by test.
test_results = perform_tests(data=df)
test_results

{'mann_whitney': [],
 'ttest': [('Na+', 0.6844), ('DBP', 0.4346), ('PLT', 0.423)],
 'chi_square': [('ivabradine', 1.0), ('MRA', 1.0)],
 'shapiro_wilk': [('Na+', (0.192, 0.9276)),
  ('DBP', (0.8769, 0.2463)),
  ('PLT', (0.3898, 0.6919))]}