### Importing Libraries

In [7]:
# Import libraries for data manipulation
import pandas as pd
import numpy as np

# Import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.gofplots import ProbPlot

# Import libraries for building linear regression model using statsmodel
from statsmodels.formula.api import ols
import statsmodels.api as sm

import scipy.stats as stats
from scipy.stats import levene, ttest_ind, f_oneway, kruskal, shapiro, kstest
from scipy.stats import chi2_contingency

# Importing Linear Regression from sklearn
from sklearn.linear_model import LinearRegression

# Import library for preparing data
from sklearn.model_selection import train_test_split

# Import library for data preprocessing
from sklearn.preprocessing import MinMaxScaler

# Suppress every warning for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): this blanket "ignore" filter also hides deprecation and runtime
# warnings raised by the libraries imported above — consider narrowing it to
# specific categories.
import warnings
warnings.filterwarnings("ignore")

### Read the Dataset

In [2]:
# Load the body-performance CSV from the notebook's working directory and
# preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="./bodyPerformance.csv")
df.head(n=5)

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
0,27.0,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C
1,25.0,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A
2,31.0,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C
3,32.0,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B
4,28.0,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B


In [3]:
# Partition the columns by dtype: object-typed columns go to
# `categorical_data`, all remaining columns go to `numeric_data`.
categorical_data = df.select_dtypes(include=["object"])
numeric_data = df.select_dtypes(exclude=["object"])

# Report (rows, columns) for each subset, numeric first.
print(numeric_data.shape, categorical_data.shape, sep="\n")

(13393, 10)
(13393, 2)


### Statistical Tests: Categorical Variables

In [9]:
# Chi-square test of independence between the target column (`class`) and
# every other categorical column.
#
# Null hypothesis: the categorical column and `class` are independent.
# A small p-value (< 0.05) is evidence against independence.
target_col = "class"

# Test each categorical feature against the target; the target itself is
# skipped so it is never cross-tabulated with itself.
feature_cols = [col for col in categorical_data.columns if col != target_col]

for column in feature_cols:
    # Rows are the target levels, columns are the levels of the feature.
    contingency_table = pd.crosstab(df[target_col], df[column])
    chi2, p, _, _ = chi2_contingency(contingency_table)
    print()
    print(contingency_table)
    print()
    print(f"Chi-square test between {target_col} and {column}:")
    print(f"Chi2 value: {chi2}")
    # Scientific notation keeps very small p-values readable; a fixed
    # 20-decimal format prints them as 0.
    print("P-value: {:.3e}".format(p))
    print("--------------------------------------------------------")


gender     F     M
class             
A       1484  1864
B       1185  2162
C       1112  2237
D       1145  2204

Chi-square test between Hospital Mortality and class:
Chi2 value: 112.77302615919672
P-value: 0.00000000000000000000
--------------------------------------------------------


**Observation:**
- The chi-square test of independence between `gender` and `class` gives χ² ≈ 112.77 with a p-value that is effectively zero, far below 0.05. We therefore reject the null hypothesis of independence: there is a statistically significant association between gender and the performance class.
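- In general, a p-value < 0.05 indicates a significant association between the two variables; if the p-value is > 0.05, we fail to reject the null hypothesis and cannot conclude that an association exists.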