# U.S. Medical Insurance Costs

In [1]:
import csv
import pandas as pd

In [4]:
insurance_df = pd.read_csv('insurance.csv')
insurance_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Understanding the data in the insurance dataframe
The data has the following information: 
- Patient age
- Patient sex
- Patient BMI
- Patient number of children
- Patient Smoking Status
- Patient Location in the US
- Patient yearly medical insurance cost

Total 1338 rows, no missing data

In [11]:
insurance_df.info()
insurance_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


## Analysing the data

### Distribution men & women

In [27]:
count_man = insurance_df[insurance_df['sex']== 'male']['sex'].count()
count_woman = insurance_df[insurance_df['sex']== 'female']['sex'].count()
total_patients = count_man + count_woman
print(f"Count for female: {count_woman} ({100*count_woman/total_patients:.2f}% of patients)")
print(f"Count for male: {count_man} ({100*count_man/total_patients:.2f}% of patients)")
print("Distribution male/female is good balanced")

Count for female: 662 (49.48% of patients)
Count for male: 676 (50.52% of patients)
Distribution male/female is good balanced


### Average patient age

In [22]:
mean_age = insurance_df['age'].mean()
print(f"Average patient age: {mean_age:.2f} years")

Average patient age: 39.21 years


### Average age for patient with at least one child

In [95]:
mean_age = insurance_df[insurance_df['children'] > 0]['age'].mean()
print(f"Average age of a patient that already has one child at least is {mean_age:.1f} years.")

Average age of a patient that already has one child at least is 39.8 years.


### List of US Locations

In [67]:
us_locations = insurance_df['region'].unique()
print(f"There are {len(us_locations)} locations in the data:")

for i in range(len(us_locations)):
    count = insurance_df[insurance_df['region']==us_locations[i]]['region'].count()
    print(f"{us_locations[i-1].title()} with {count} patients ({100*count/total_patients:.1f}% of total)")
        

There are 4 locations in the data:
Northeast with 325 patients (24.3% of total)
Southwest with 364 patients (27.2% of total)
Southeast with 325 patients (24.3% of total)
Northwest with 324 patients (24.2% of total)


### Statistics of Medical Insurance Charges

In [90]:
mean_charge = insurance_df['charges'].mean()
print(f"Average yearly Medical Insurance Charges is {mean_charge:.2f} US dollars.")

min_charge = insurance_df['charges'].min()
print(f"Minimum yearly Medical Insurance Charges is {min_charge:.2f} US dollars.")

max_charge = insurance_df['charges'].max()
print(f"Maximum yearly Medical Insurance Charges is {max_charge:.2f} US dollars.")

Average yearly Medical Insurance Charges is 13270.42 US dollars.
Minimum yearly Medical Insurance Charges is 1121.87 US dollars.
Maximum yearly Medical Insurance Charges is 63770.43 US dollars.


### Comparing medical insurance charges of smokers and non-smokers

In [62]:
num_smoker = insurance_df[insurance_df['smoker']=='yes']['smoker'].count()
num_non_smoker = insurance_df[insurance_df['smoker']=='no']['smoker'].count()
print(f"The data set has {num_smoker} smoker patients and {num_non_smoker} non-smoker patients.")

mean_charge_smoker = insurance_df[insurance_df['smoker']=='yes']['charges'].mean()
mean_charge_non_smoker = insurance_df[insurance_df['smoker']=='no']['charges'].mean()

print(f"A smoker patient pays on average for the medical insurance {mean_charge_smoker:.2f} US dollars.")
print(f"A non-smoker patient pays on average for the medical insurance {mean_charge_non_smoker:.2f} US dollars.")

print(f"Being smoker increases the yerly medical insurance charges by around {mean_charge_smoker/mean_charge_non_smoker:.0f} times.")

The data set has 274 smoker patients and 1064 non-smoker patients.
A smoker patient pays on average for the medical insurance 32050.23 US dollars.
A non-smoker patient pays on average for the medical insurance 8434.27 US dollars.
Being smoker increases the yerly medical insurance charges by around 4 times.


### Medical Insurance Charges per Region

In [75]:
for i in range(len(us_locations)):
    mean_region = insurance_df[insurance_df['region']==us_locations[i]]['charges'].mean()
    print(f"The average yearly medical insurance in {us_locations[i-1].upper()} is {mean_region:.2f} US dollars.")

The average yearly medical insurance in NORTHEAST is 12346.94 US dollars.
The average yearly medical insurance in SOUTHWEST is 14735.41 US dollars.
The average yearly medical insurance in SOUTHEAST is 12417.58 US dollars.
The average yearly medical insurance in NORTHWEST is 13406.38 US dollars.


### Medical Insurance Charges per patients with and without children

In [89]:
no_children_mean_charge = insurance_df[insurance_df['children']==0]['charges'].mean()
with_children_mean_charge = insurance_df[insurance_df['children']>0]['charges'].mean()

print(f"Patient without children pays on average for the medical insurance {no_children_mean_charge:.2f} US dollars.")
print(f"Patient with at least 1 child pays on average for the medical insurance {with_children_mean_charge:.2f} US dollars.")

Patient without children pays on average for the medical insurance 12365.98 US dollars.
Patient with at least 1 child pays on average for the medical insurance 13949.94 US dollars.


### Patients with maximum and minimum insurance charge

In [92]:
print(insurance_df[insurance_df['charges'] == min_charge])
print(insurance_df[insurance_df['charges'] == max_charge])

     age   sex    bmi  children smoker     region    charges
940   18  male  23.21         0     no  southeast  1121.8739
     age     sex    bmi  children smoker     region      charges
543   54  female  47.41         0    yes  southeast  63770.42801


### Impact of the BMI in the insurance price

|BMI | Weight Status|
|:-:|:-:| 
|Below 18.5 | Underweight|
|18.5 — 24.9 | Healthy Weight|
|25.0 — 29.9 |Overweight|
|30.0 and above | Obesity|



In [104]:
mean_underweight = insurance_df[(insurance_df['bmi'] < 18.5)]['charges'].mean()
mean_healthy = insurance_df[(insurance_df['bmi'] >= 18.5) & (insurance_df['bmi'] <= 24.9)]['charges'].mean()
mean_overweight = insurance_df[(insurance_df['bmi'] >= 25.0) & (insurance_df['bmi'] <= 29.9)]['charges'].mean()
mean_obesity = insurance_df[(insurance_df['bmi'] >= 30)]['charges'].mean()

print(f"Patient with Underrweight Weight pays on average for the medical insurance {mean_underweight:.2f} US dollars.")
print(f"Patient with Healthy Weight pays on average for the medical insurance {mean_healthy:.2f} US dollars.")
print(f"Patient with Overweight pays on average for the medical insurance {mean_overweight:.2f} US dollars.")
print(f"Patient with Obesity pays on average for the medical insurance {mean_obesity:.2f} US dollars.")


Patient with Underrweight Weight pays on average for the medical insurance 8852.20 US dollars.
Patient with Healthy Weight pays on average for the medical insurance 10379.50 US dollars.
Patient with Overweight pays on average for the medical insurance 10993.99 US dollars.
Patient with Obesity pays on average for the medical insurance 15552.34 US dollars.
