# Importing required libraries and dataset
- All data are imported as tables
- Display first 5 rows of the tables

In [15]:
#importing required libraries
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import chi2_contingency
#importing all the data as table and tranferring to data frames
dataset1=pd.read_parquet('data/interim/df_Hospitalisation_details.parquet')
df_Hospitalisation_details = pd.DataFrame(dataset1)
dataset2=pd.read_parquet('data/interim/df_Medical_Examinations.parquet')
df_Medical_Examinations = pd.DataFrame(dataset2)
dataset3=pd.read_parquet('data/interim/df_Names.parquet')
df_Names = pd.DataFrame(dataset3)
# Display five rows from each table
print('Table of Hospitalisation details (first 5 rows):')
print(df_Hospitalisation_details.head(5))
print('                                ')
print('Table of Medical Examinations:')
print(df_Medical_Examinations.head(5))
print('                                ')
print('Table of Names:')
print(df_Names.head(5))

Table of Hospitalisation details (first 5 rows):
  Customer ID  year month  date  children  charges  Hospital tier  City tier  \
0      Id2335  1992   Jul     9         0   563.84            1.0        2.0   
1      Id2334  1992   Nov    30         0   570.62            1.0        0.0   
2      Id2333  1993   Jun    30         0   600.00            1.0        0.0   
3      Id2332  1992   Sep    13         0   604.54            2.0        2.0   
4      Id2331  1998   Jul    27         0   637.26            2.0        2.0   

   State ID_R1011  State ID_R1012  State ID_R1013  patient age  
0             0.0             0.0             1.0           33  
1             0.0             0.0             1.0           33  
2             0.0             0.0             1.0           32  
3             0.0             0.0             1.0           33  
4             0.0             0.0             1.0           27  
                                
Table of Medical Examinations:
  Customer ID   

**According to output of above cell, data has transferred correctly**

# Hypothesis Testing
- Test the following null hypothesis

## 1. The average hospitalization costs for the three types of hospitals are not significantly different.

In [16]:
Hospital_tier_1=df_Hospitalisation_details['charges'][df_Hospitalisation_details['Hospital tier']==0]
Hospital_tier_2=df_Hospitalisation_details['charges'][df_Hospitalisation_details['Hospital tier']==1]
Hospital_tier_3=df_Hospitalisation_details['charges'][df_Hospitalisation_details['Hospital tier']==2]
# Perform one-way ANOVA
f_statistic, p_value = stats.f_oneway(Hospital_tier_1, Hospital_tier_2, Hospital_tier_3)
print("F-statistic:", f_statistic)
print("P-value:", p_value)
print('Since P-value > 0.05 (critical value), tyep of hospitals are not sigificantly different!')


F-statistic: 495.4738392647216
P-value: 5.5985677688061205e-180
Since P-value > 0.05 (critical value), tyep of hospitals are not sigificantly different!


## 2. The average hospitalization costs for the three types of cities are not significantly different.

In [17]:
City_tier_1=df_Hospitalisation_details['charges'][df_Hospitalisation_details['City tier']==0]
City_tier_2=df_Hospitalisation_details['charges'][df_Hospitalisation_details['City tier']==1]
City_tier_3=df_Hospitalisation_details['charges'][df_Hospitalisation_details['City tier']==2]
# Perform one-way ANOVA
f_statistic, p_value = stats.f_oneway(City_tier_1, City_tier_2, City_tier_3)
print("F-statistic:", f_statistic)
print("P-value:", p_value)
print('Since P-value > 0.05 (critical value), tyep of hospitals are not sigificantly different!')

F-statistic: 1.5506960494320452
P-value: 0.21231948148820598
Since P-value > 0.05 (critical value), tyep of hospitals are not sigificantly different!


## 3. The average hospitalization costs for smokers is not significantly different from the average cost for nonsmokers.

In [18]:
cost_smoker_join=pd.merge(df_Hospitalisation_details[['charges','Customer ID']], 
                          df_Medical_Examinations[['smoker_yes','Customer ID']], on='Customer ID', how='inner')
cost_smoker = cost_smoker_join['charges'][cost_smoker_join['smoker_yes']==1]
cost_non_smoker = cost_smoker_join['charges'][cost_smoker_join['smoker_yes']==0]

# Perform one-way ANOVA
f_statistic, p_value = stats.f_oneway(cost_smoker, cost_non_smoker)
print("F-statistic:", f_statistic)
print("P-value:", p_value,10)
print('Since P-value < 0.05 (critical value), smoker and non-smoker are sigificantly different!')


F-statistic: 5499.05404908718
P-value: 0.0 10
Since P-value < 0.05 (critical value), smoker and non-smoker are sigificantly different!


## 4. Smoking and heart issues are independent.

In [19]:
df = pd.DataFrame({
    'health issue': df_Medical_Examinations['Heart Issues_yes'],
    'Smoker': df_Medical_Examinations['smoker_yes']
})
# Contingency table
ct = pd.crosstab(df['health issue'], df['Smoker'])

# Perform chi-square test
chi2, p, dof, expected = chi2_contingency(ct)
print("Contingency Table:")
print(ct)
print(f"\nP-value: {p:.4f}")
print('Since P-value > 0.05 : health issue and being smoker are independet ')

Contingency Table:
Smoker         0.0  1.0
health issue           
0.0           1110  297
1.0            735  191

P-value: 0.8194
Since P-value > 0.05 : health issue and being smoker are independet 
