# U.S. Medical Insurance Costs

In [1]:
# breyenguyen #portfolio project #US Medical Insurance Costs
# import csv library
import csv

In [2]:
# Read insurance.csv once: every row becomes a dict in insurance_data_list,
# and the age, bmi and charges columns are also copied into their own lists.
with open('insurance.csv', newline = '') as insurance_csv:
    insurance_data = csv.DictReader(insurance_csv)
    insurance_data_list = list(insurance_data)

# csv.DictReader yields every field as text, so these lists hold strings.
age = [record['age'] for record in insurance_data_list]
bmi = [record['bmi'] for record in insurance_data_list]
charges = [record['charges'] for record in insurance_data_list]

__In this project, I plan to analyze the correlation between the__ `age`, `sex`, `bmi`, `smoker`, `region` __and the__ `charges`. __My ideas for analysis include:__
1. The averages in 4 different regions: `northeast` `northwest` `southeast` `southwest`
2. The difference in medical insurance cost between `smoker` and `non-smoker`
3. The average insurance cost of different age groups: `youth_group`: 15–47 years old, `middle_age`: 48–63 years old, `elderly`: ≥ 64 years old 
4. The difference in medical insurance cost between people with different BMIs. BMI Scale: `underweight`: < 18.5, `ideal`: 18.5 - 24.9, `overweight`: 25 - 29.9, `obesity`: >= 30
5. The average medical insurance cost of two different sexes: `male` and `female`
6. The correlation between different elements: `age`, `sex`, `bmi`, `smoker`, `region`, `children`

To conduct these analyses, I wrote functions that group the records into dictionaries keyed by each attribute (smoking status, region, age group, BMI category, and sex). I also wrote functions that calculate percentages and averages; because they work with any of these dictionaries, they are quite "versatile."

### Functions for calculations

In [3]:
# Build a function to calculate the percentage of each group
def cal_percentage(insurance_data, data_by_dict, group):
    """Return the share of the whole dataset that falls in one group, in percent.

    Args:
        insurance_data: Sized collection holding every record in the dataset.
        data_by_dict: Mapping of group name -> list of records in that group.
        group: Key of the group to measure.

    Returns:
        The percentage rounded to one decimal place, or 0.0 when
        insurance_data is empty (previously a ZeroDivisionError).

    Raises:
        KeyError: If group is not a key of data_by_dict.
    """
    # Look the group up first so an unknown group still raises KeyError,
    # even when the dataset itself is empty.
    group_size = len(data_by_dict[group])
    total_size = len(insurance_data)
    if total_size == 0:
        return 0.0
    percentage = (group_size/total_size)*100
    return round(percentage, 1)

# Build a function to calculate the percentage of smokers in different groups
def cal_percentage_smoker(data_by_dict, group, key = 'smoker'):
    """Return the percentage of records in a group whose `key` field is 'yes'.

    Args:
        data_by_dict: Mapping of group name -> list of record dicts.
        group: Key of the group to inspect.
        key: Field compared against the string 'yes' (defaults to 'smoker').

    Returns:
        The percentage rounded to one decimal place, or 0.0 when the group
        has no records (previously a ZeroDivisionError).

    Raises:
        KeyError: If group is not in data_by_dict, or a record lacks `key`.
    """
    group_data = data_by_dict[group]
    if len(group_data) == 0:
        # An empty group has no smokers to report.
        return 0.0
    count = sum(1 for data in group_data if data[key] == 'yes')
    percentage = (count/len(group_data))*100
    return round(percentage, 1)

# Build a function to calculate the average medical insurance cost for each group
def cal_average(data_by_dict, group, key):
    """Return the mean of one numeric field across the records of a group.

    Args:
        data_by_dict: Mapping of group name -> list of record dicts.
        group: Key of the group to average over.
        key: Field to average; each value is converted with float().

    Returns:
        The mean rounded to two decimal places, or NaN when the group has no
        records (previously a ZeroDivisionError). NaN is used rather than 0
        so an empty group cannot be mistaken for a real average of zero.

    Raises:
        KeyError: If group is not a key of data_by_dict.
        TypeError: If a record has no `key` field (float(None)).
        ValueError: If a value cannot be parsed as a float.
    """
    group_data = data_by_dict[group]
    if len(group_data) == 0:
        return float('nan')
    total = 0
    for data in group_data:
        total += float(data.get(key))
    average = total/len(group_data)
    return round(average, 2)

# Build a function to calculate the NATION's average
def cal_nation_avg(insurance_data, key):
    """Return the mean of one numeric field across the entire dataset.

    Args:
        insurance_data: Sized collection of record dicts.
        key: Field to average; each value is converted with float().

    Returns:
        The mean rounded to two decimal places, or NaN when insurance_data
        is empty (previously a ZeroDivisionError).

    Raises:
        KeyError: If a record has no `key` field.
        ValueError: If a value cannot be parsed as a float.
    """
    if len(insurance_data) == 0:
        return float('nan')
    total = 0
    for data in insurance_data:
        total += float(data[key])
    average = total/len(insurance_data)
    return round(average, 2)

### Functions to create dictionaries

In [22]:
# Build a function to create a dict with smoker/non-smoker as key
def create_smoker_dict(insurance_data):
    """Split records into 'smoker' and 'non-smoker' lists.

    A record whose 'smoker' field is 'yes' goes to 'smoker', one whose field
    is 'no' goes to 'non-smoker'; any other value (or a missing field) is
    left out of both lists.
    """
    label_for_answer = {'yes': 'smoker', 'no': 'non-smoker'}
    grouped = {"smoker": [], "non-smoker": []}
    for record in insurance_data:
        label = label_for_answer.get(record.get('smoker'))
        if label is not None:
            grouped[label].append(record)
    return grouped

# Build a function to create a dictionary with region as key
def create_region_dict(insurance_data):
    """Group records by their 'region' field.

    Only the four known regions are collected; a record with any other
    region value (or no 'region' field) is skipped.
    """
    known_regions = ("northeast", "northwest", "southeast", "southwest")
    grouped = {name: [] for name in known_regions}
    for record in insurance_data:
        bucket = grouped.get(record.get('region'))
        if bucket is not None:
            bucket.append(record)
    return grouped

# Build a function to create a dictionary with age groups as keys
def create_age_dict(insurance_data):
    """Group records into age bands based on their 'age' field.

    Ages of 64 and above are 'elderly', 48 up to (not including) 64 are
    'middle aged', and everything else is 'youth group'.
    """
    grouped = {'youth group': [], 'middle aged': [], 'elderly': []}
    for record in insurance_data:
        years = float(record['age'])
        band = (
            'elderly' if years >= 64
            else 'middle aged' if years >= 48
            else 'youth group'
        )
        grouped[band].append(record)
    return grouped

# Build a function to create a dictionary with bmi as key
def create_bmi_dict(insurance_data):
    """Group records into BMI categories based on their 'bmi' field.

    A BMI of 30 or more is 'obesity', 25 or more is 'overweight', 18.5 or
    more is 'ideal', and anything lower is 'underweight'.
    """
    # Lower bounds checked from highest to lowest; the first match wins.
    cutoffs = ((30, 'obesity'), (25, 'overweight'), (18.5, 'ideal'))
    grouped = {'underweight': [], 'ideal': [], 'overweight': [], 'obesity': []}
    for record in insurance_data:
        value = float(record['bmi'])
        category = 'underweight'
        for lower_bound, label in cutoffs:
            if value >= lower_bound:
                category = label
                break
        grouped[category].append(record)
    return grouped

# Build a function to create a dictionary with sex as key
def create_sex_dict(insurance_data):
    """Group records by their 'sex' field into 'male' and 'female' lists.

    Records with any other 'sex' value are skipped; a record without a
    'sex' field raises KeyError.
    """
    grouped = {'male': [], 'female': []}
    for record in insurance_data:
        label = record['sex']
        if label in grouped:
            grouped[label].append(record)
    return grouped

### Analysis 1: Smoker vs. Non-smoker
In this analysis, I'm gonna look at the difference in medical insurance cost between smokers and non-smokers. This is to see if smoking is one of the factors that contributes most strongly to low and/or high medical insurance charges.

Also, I'd like to know if smoking affects people's weight in the U.S.

In [23]:
data_by_smoker = create_smoker_dict(insurance_data_list)

# Mean insurance charge: whole dataset first, then smokers and non-smokers
avg_cost_by_smoker = {
    'nation': cal_nation_avg(insurance_data_list, 'charges'),
    **{status: cal_average(data_by_smoker, status, 'charges') for status in data_by_smoker},
}
print("The difference in medical insurance cost between people with different smoking habits:\n{}".format(avg_cost_by_smoker))

# Mean BMI: whole dataset first, then smokers and non-smokers
avg_bmi_by_smoker = {
    'nation': cal_nation_avg(insurance_data_list, 'bmi'),
    **{status: cal_average(data_by_smoker, status, 'bmi') for status in data_by_smoker},
}
print("The BMI of people with different smoking habits:\n{}".format(avg_bmi_by_smoker))

The difference in medical insurance cost between people with different smoking habits:
{'nation': 13270.42, 'smoker': 32050.23, 'non-smoker': 8434.27}
The BMI of people with different smoking habits:
{'nation': 30.66, 'smoker': 30.71, 'non-smoker': 30.65}


### Analysis 2: The Four Regions
In this analysis, I'm gonna look at the number of patients from four regions in the U.S. to see if the dataset is diverse and representative of the population. Also, I check the correlation between region and different patient attributes.

In [21]:
data_by_region = create_region_dict(insurance_data_list)

# Share of all patients that live in each region
region_percentage = {
    region: cal_percentage(insurance_data_list, data_by_region, region)
    for region in data_by_region
}
print("The percentage of patients in each region:\n{}".format(region_percentage))

# Average medical insurance cost by region (whole dataset listed first)
avg_cost_by_region = {
    'nation': cal_nation_avg(insurance_data_list, 'charges'),
    **{region: cal_average(data_by_region, region, 'charges') for region in data_by_region},
}
print("The average insurance cost in different regions:\n{}".format(avg_cost_by_region))

# Average bmi by region.
# The original assignment here ended in a stray line-continuation backslash
# that only parsed because the following line was blank; it is removed.
avg_bmi_by_region = {
    'nation': cal_nation_avg(insurance_data_list, 'bmi'),
    **{region: cal_average(data_by_region, region, 'bmi') for region in data_by_region},
}
print("The average bmi in different regions:\n{}".format(avg_bmi_by_region))

# Average age by region
avg_age_by_region = {
    'nation': cal_nation_avg(insurance_data_list, 'age'),
    **{region: cal_average(data_by_region, region, 'age') for region in data_by_region},
}
print("The average age in different regions:\n{}".format(avg_age_by_region))

# Percentage of smokers by region. The national figure reuses data_by_smoker,
# which is built in the "Smoker vs. Non-smoker" analysis cell.
smoker_percentage_region = {
    'nation': cal_percentage(insurance_data_list, data_by_smoker, 'smoker'),
    **{region: cal_percentage_smoker(data_by_region, region) for region in data_by_region},
}
print("The percentage of smokers in each region:\n{}".format(smoker_percentage_region))

The percentage of patients in each region:
{'northeast': 24.2, 'northwest': 24.3, 'southeast': 27.2, 'southwest': 24.3}
The average insurance cost in different regions:
{'nation': 13270.42, 'northeast': 13406.38, 'northwest': 12417.58, 'southeast': 14735.41, 'southwest': 12346.94}
The average bmi in different regions:
{'nation': 30.66, 'northeast': 29.17, 'northwest': 29.2, 'southeast': 33.36, 'southwest': 30.6}
The average age in different regions:
{'nation': 39.21, 'northeast': 39.27, 'northwest': 39.2, 'southeast': 38.94, 'southwest': 39.46}
The percentage of smokers in each region:
{'nation': 20.5, 'northeast': 20.7, 'northwest': 17.8, 'southeast': 25.0, 'southwest': 17.8}


### Analysis 3: Age Groups
According to WHO, ages can be classified into the following groups: `youth_group`: 15–47 years old, `middle_age`: 48–63 years old, `elderly`: ≥ 64 years old.

In this analysis, I'm gonna look at the correlation between ages and medical insurance costs. Also, we'd like to see if the dataset is representative of the population in terms of age. 

In [24]:
data_by_age = create_age_dict(insurance_data_list)

# The loops below used to be written as `for age in data_by_age`, which
# rebound the module-level `age` list (filled while loading the CSV) to a
# group label. Comprehensions with their own variable name avoid that.

# Percentage of each age group in the dataset
age_group_percentage = {
    age_group: cal_percentage(insurance_data_list, data_by_age, age_group)
    for age_group in data_by_age
}
print("The percentage of patients from each age group:\n{}".format(age_group_percentage))

# Average medical insurance cost by age group (whole dataset listed first)
avg_cost_by_age = {
    'nation': cal_nation_avg(insurance_data_list, 'charges'),
    **{age_group: cal_average(data_by_age, age_group, 'charges') for age_group in data_by_age},
}
print("The average insurance cost for each age group:\n{}".format(avg_cost_by_age))

# Percentage of smokers by age group. The national figure reuses
# data_by_smoker from the "Smoker vs. Non-smoker" analysis cell.
smoker_percentage_age = {
    'nation': cal_percentage(insurance_data_list, data_by_smoker, 'smoker'),
    **{age_group: cal_percentage_smoker(data_by_age, age_group) for age_group in data_by_age},
}
print("The percentage of smokers in each age group:\n{}".format(smoker_percentage_age))

The percentage of patients from each age group:
{'youth group': 67.0, 'middle aged': 31.4, 'elderly': 1.6}
The average insurance cost for each age group:
{'nation': 13270.42, 'youth group': 11253.92, 'middle aged': 17048.22, 'elderly': 23275.53}
The percentage of smokers in each age group:
{'nation': 20.5, 'youth group': 22.0, 'middle aged': 16.7, 'elderly': 31.8}


### Analysis 4: BMIs
BMI Scale: `underweight`: < 18.5, `ideal`: 18.5 - 24.9, `overweight`: 25 - 29.9, `obesity`: >= 30

Here I'd like to look at the correlation between BMI and insurance cost, as well as between BMI and age.

In [28]:
data_by_bmi = create_bmi_dict(insurance_data_list)

# The loops below used to be written as `for bmi in data_by_bmi`, which
# rebound the module-level `bmi` list (filled while loading the CSV) to a
# category label. Comprehensions with their own variable name avoid that.

# Average medical insurance cost for people with different BMIs
avg_cost_by_bmi = {
    'nation': cal_nation_avg(insurance_data_list, 'charges'),
    **{bmi_group: cal_average(data_by_bmi, bmi_group, 'charges') for bmi_group in data_by_bmi},
}
print("The average insurance cost for people with different BMIs:\n{}".format(avg_cost_by_bmi))

# Average age of people in each BMI group
avg_age_by_bmi = {
    'nation': cal_nation_avg(insurance_data_list, 'age'),
    **{bmi_group: cal_average(data_by_bmi, bmi_group, 'age') for bmi_group in data_by_bmi},
}
print("The average age in different BMI groups:\n{}".format(avg_age_by_bmi))

# Percentage of smokers by BMI group. The national figure reuses
# data_by_smoker from the "Smoker vs. Non-smoker" analysis cell.
smoker_percentage_bmi = {
    'nation': cal_percentage(insurance_data_list, data_by_smoker, 'smoker'),
    **{bmi_group: cal_percentage_smoker(data_by_bmi, bmi_group) for bmi_group in data_by_bmi},
}
print("The percentage of smokers in each BMI group:\n{}".format(smoker_percentage_bmi))

The average insurance cost for people with different BMIs:
{'nation': 13270.42, 'underweight': 8852.2, 'ideal': 10409.34, 'overweight': 10987.51, 'obesity': 15552.34}
The average age in different BMI groups:
{'nation': 39.21, 'underweight': 32.35, 'ideal': 36.73, 'overweight': 38.92, 'obesity': 40.34}
The percentage of smokers in each BMI group:
{'nation': 20.5, 'underweight': 25.0, 'ideal': 22.2, 'overweight': 19.2, 'obesity': 20.5}


### Analysis 5: Male vs. Female

Here I'd like to see if the dataset is representative of the population in terms of sex. I also check the correlation between sex and other attributes.

In [30]:
data_by_sex = create_sex_dict(insurance_data_list)

# Percentage of males and females in the dataset
sex_group_percentage = {
    label: cal_percentage(insurance_data_list, data_by_sex, label)
    for label in data_by_sex
}
print("The percentage of male and female patients:\n{}".format(sex_group_percentage))

# Average medical insurance cost for males/females (whole dataset listed first)
avg_cost_by_sex = {
    'nation': cal_nation_avg(insurance_data_list, 'charges'),
    **{label: cal_average(data_by_sex, label, 'charges') for label in data_by_sex},
}
print("The average insurance cost for male and female patients:\n{}".format(avg_cost_by_sex))

# Average age by sex
avg_age_by_sex = {
    'nation': cal_nation_avg(insurance_data_list, 'age'),
    **{label: cal_average(data_by_sex, label, 'age') for label in data_by_sex},
}
print("The average age of male and female patients:\n{}".format(avg_age_by_sex))

# Average bmi by sex
avg_bmi_by_sex = {
    'nation': cal_nation_avg(insurance_data_list, 'bmi'),
    **{label: cal_average(data_by_sex, label, 'bmi') for label in data_by_sex},
}
print("The average bmi of male and female patients:\n{}".format(avg_bmi_by_sex))

# Percentage of smokers by sex
smoker_percentage_sex = {
    'nation': cal_percentage(insurance_data_list, data_by_smoker, 'smoker'),
    **{label: cal_percentage_smoker(data_by_sex, label) for label in data_by_sex},
}
print("The percentage of smokers among male and female patients:\n{}".format(smoker_percentage_sex))

The percentage of male and female patients:
{'male': 50.5, 'female': 49.5}
The average insurance cost for male and female patients:
{'nation': 13270.42, 'male': 13956.75, 'female': 12569.58}
The average age of male and female patients:
{'nation': 39.21, 'male': 38.92, 'female': 39.5}
The average bmi of male and female patients:
{'nation': 30.66, 'male': 30.94, 'female': 30.38}
The percentage of smokers among male and female patients:
{'nation': 20.5, 'male': 23.5, 'female': 17.4}
