# U.S. Medical Insurance Costs

### I want to know which region has the highest costs, and then determine which factors might be driving the abnormal costs.

In [1]:
# Import libraries.
import csv

In [2]:
# Open data file and convert to list of Python dictionaries.
# Collect every data row of the CSV as a dict keyed by the column names
# that csv.DictReader takes from the file's first row.
insurance_entries = []
# newline='' hands line-ending handling to the csv module, as its
# documentation requires for file objects passed to a reader.
with open('insurance.csv', newline='') as insurance_data:
    insurance_reader = csv.DictReader(insurance_data)
    insurance_entries.extend(insurance_reader)

In [3]:
# Separate the dictionary into lists of dictionaries by region.

# Display names of the four regions. Later cells zip computed statistics
# against this list, so the order here must match the order in which the
# per-region values are listed there.
regions = ['Southeast', 'Southwest', 'Northeast', 'Northwest']

# One separate, initially empty list per region.
southeast_ents, southwest_ents, northeast_ents, northwest_ents = [], [], [], []

def region_lists(ins_ents_list):
    """Append each entry to the module-level list for its region.

    Every entry is indexed with the key 'region' and compared against the
    lowercase names 'southeast', 'southwest', 'northeast' and 'northwest'.
    Entries whose region matches none of these are skipped. Returns None;
    the result is the mutation of the four module-level lists.
    """
    for record in ins_ents_list:
        area = record['region']
        if area == 'southeast':
            southeast_ents.append(record)
            continue
        if area == 'southwest':
            southwest_ents.append(record)
            continue
        if area == 'northeast':
            northeast_ents.append(record)
            continue
        if area == 'northwest':
            northwest_ents.append(record)

# Distribute every parsed CSV row into the four per-region lists.
region_lists(insurance_entries)

In [4]:
# Find average cost by region, and identify the region with the highest average costs.

def region_av_cost(region_ents_list):
    """Return the mean of the entries' 'charges' values, rounded to 4 places.

    Each entry is indexed with the key 'charges' and the value is converted
    with float(). An empty list raises ZeroDivisionError.
    """
    running_total = 0
    for charge in (float(row['charges']) for row in region_ents_list):
        running_total = running_total + charge
    mean_charge = running_total / len(region_ents_list)
    return round(mean_charge, 4)

# Mean charges per region, computed in the same order as `regions`.
se_av_cost, sw_av_cost, ne_av_cost, nw_av_cost = map(
    region_av_cost,
    (southeast_ents, southwest_ents, northeast_ents, northwest_ents),
)
av_costs = [se_av_cost, sw_av_cost, ne_av_cost, nw_av_cost]

# Map each average back to its region name so the maximum can be labelled.
costs_dict = {average: name for average, name in zip(av_costs, regions)}
max_reg_costs = max(av_costs)

print('The region with the highest average insurance cost is the ', costs_dict[max_reg_costs], ' region.', sep='')

The region with the highest average insurance cost is the Southeast region.


### Now that I know the Southeast is the region with the highest average costs, I can compare the average BMI, average age, average number of children, share of men, and share of smokers across all regions.

In [5]:
# Compare average bmi for all regions. Indicate which region has the highest average bmi, and by how much.

def bmi_avs(region_ents_list):
    """Return the mean of the entries' 'bmi' values, rounded to 4 places.

    Each entry is indexed with the key 'bmi' and the value is converted
    with float(). An empty list raises ZeroDivisionError.
    """
    bmi_sum = 0
    for row in region_ents_list:
        bmi_sum = bmi_sum + float(row['bmi'])
    mean_bmi = bmi_sum / len(region_ents_list)
    return round(mean_bmi, 4)

# Average BMI per region, computed in the same order as `regions`.
se_av_bmi, sw_av_bmi, ne_av_bmi, nw_av_bmi = [
    bmi_avs(bucket)
    for bucket in (southeast_ents, southwest_ents, northeast_ents, northwest_ents)
]
av_bmi = [se_av_bmi, sw_av_bmi, ne_av_bmi, nw_av_bmi]

# Map each average back to its region name so the maximum can be labelled.
bmi_dict = {average: name for average, name in zip(av_bmi, regions)}
max_reg_bmi = max(av_bmi)

# With the averages in ascending order, the runner-up sits second from the end.
av_bmi_sorted = list(av_bmi)
av_bmi_sorted.sort()
sec_largest_bmi = av_bmi_sorted[-2]

# Relative gap between the top two averages, expressed as a percentage.
perc_diff_bmi = round((max_reg_bmi / sec_largest_bmi - 1) * 100, 2)

print('The region with the highest average BMI is the ', bmi_dict[max_reg_bmi], ' region. The average BMI there is ', max_reg_bmi, ', which is ', perc_diff_bmi, '% higher than the second highest average regional BMI.', sep='')

The region with the highest average BMI is the Southeast region. The average BMI there is 33.356, which is 9.02% higher than the second highest average regional BMI.


In [6]:
# Compare average age for all regions. Indicate which region has the highest average age, and by how much.

def age_avs(region_ents_list):
    """Return the mean of the entries' 'age' values, rounded to 4 places.

    Each entry is indexed with the key 'age' and the value is converted
    with float(). An empty list raises ZeroDivisionError.
    """
    ages = [float(row['age']) for row in region_ents_list]
    age_sum = 0
    for age in ages:
        age_sum += age
    return round(age_sum / len(region_ents_list), 4)

# Average age per region, computed in the same order as `regions`.
se_av_age, sw_av_age, ne_av_age, nw_av_age = map(
    age_avs,
    (southeast_ents, southwest_ents, northeast_ents, northwest_ents),
)
av_age = [se_av_age, sw_av_age, ne_av_age, nw_av_age]

# Map each average back to its region name so the maximum can be labelled.
age_dict = {average: name for average, name in zip(av_age, regions)}
max_reg_age = max(av_age)

# With the averages in ascending order, the runner-up sits second from the end.
av_age_sorted = list(av_age)
av_age_sorted.sort()
sec_largest_age = av_age_sorted[-2]

# Relative gap between the top two averages, expressed as a percentage.
perc_diff_age = round((max_reg_age / sec_largest_age - 1) * 100, 2)

print('The region with the highest average age is the ', age_dict[max_reg_age], ' region. The average age there is ', max_reg_age, ', which is ', perc_diff_age, '% higher than the second highest average regional age.', sep='')

The region with the highest average age is the Southwest region. The average age there is 39.4554, which is 0.48% higher than the second highest average regional age.


In [7]:
# Compare average number of children for all regions. Indicate which region has the highest average number of children, and by how much.

def num_children_avs(region_ents_list):
    """Return the mean of the entries' 'children' values, rounded to 2 places.

    Each entry is indexed with the key 'children' and the value is
    converted with float(). An empty list raises ZeroDivisionError.
    """
    child_counts = map(lambda row: float(row['children']), region_ents_list)
    combined = 0
    for count in child_counts:
        combined += count
    average = combined / len(region_ents_list)
    return round(average, 2)

# Average number of children per region, computed in the same order as `regions`.
se_av_num_children, sw_av_num_children, ne_av_num_children, nw_av_num_children = [
    num_children_avs(bucket)
    for bucket in (southeast_ents, southwest_ents, northeast_ents, northwest_ents)
]
av_num_children = [se_av_num_children, sw_av_num_children, ne_av_num_children, nw_av_num_children]

# Map each average back to its region name so the maximum can be labelled.
num_children_dict = {average: name for average, name in zip(av_num_children, regions)}
max_reg_num_children = max(av_num_children)

# With the averages in ascending order, the runner-up sits second from the end.
av_num_children_sorted = list(av_num_children)
av_num_children_sorted.sort()
sec_largest_num_children = av_num_children_sorted[-2]

# Relative gap between the top two averages, expressed as a percentage.
# The inputs were already rounded to 2 places by num_children_avs.
perc_diff_num_children = round((max_reg_num_children / sec_largest_num_children - 1) * 100, 2)

print('The region with the highest average number of children is the ', num_children_dict[max_reg_num_children], ' region. The average number of children there is ', max_reg_num_children, ', which is ', perc_diff_num_children, '% higher than the second highest average regional number of children.', sep='')

The region with the highest average number of children is the Northwest region. The average number of children there is 1.15, which is 0.88% higher than the second highest average regional number of children.


In [8]:
# Compare the percentages of the population that are male for all regions. Indicate which region has the highest percentage of men, and by how much.

def men_avs(region_ents_list):
    """Return the percentage of entries whose 'sex' value equals 'male'.

    The result is on a 0-100 scale and rounded to 4 places. An empty list
    raises ZeroDivisionError.
    """
    male_count = sum(1 for row in region_ents_list if row['sex'] == 'male')
    male_share = male_count / len(region_ents_list)
    return round(male_share * 100, 4)

# Percentage of male entries per region, computed in the same order as `regions`.
se_perc_male, sw_perc_male, ne_perc_male, nw_perc_male = map(
    men_avs,
    (southeast_ents, southwest_ents, northeast_ents, northwest_ents),
)
av_perc_male = [se_perc_male, sw_perc_male, ne_perc_male, nw_perc_male]

# Map each percentage back to its region name so the maximum can be labelled.
perc_male_dict = {share: name for share, name in zip(av_perc_male, regions)}
max_reg_perc_male = max(av_perc_male)

# With the percentages in ascending order, the runner-up sits second from the end.
av_perc_male_sorted = list(av_perc_male)
av_perc_male_sorted.sort()
sec_largest_perc_male = av_perc_male_sorted[-2]

# NOTE: this gap is a plain subtraction of two percentages (percentage
# points), not a relative change, even though the message prints it with '%'.
perc_diff_perc_male = round(max_reg_perc_male - sec_largest_perc_male, 2)

print('The region with the highest male population is the ', perc_male_dict[max_reg_perc_male], ' region. The male population there is ', max_reg_perc_male, '%, which is ', perc_diff_perc_male, '% higher than the second highest regional male population.', sep='')


The region with the highest male population is the Southeast region. The male population there is 51.9231%, which is 1.61% higher than the second highest regional male population.


In [9]:
# Compare the percentages of the population that smokes for all regions. Indicate which region has the highest percentage of smokers, and by how much.

def smoker_avs(region_ents_list):
    """Return the percentage of entries whose 'smoker' value equals 'yes'.

    The result is on a 0-100 scale and rounded to 4 places. An empty list
    raises ZeroDivisionError.
    """
    smoker_flags = [row['smoker'] for row in region_ents_list]
    smoker_count = smoker_flags.count('yes')
    smoker_share = smoker_count / len(region_ents_list)
    return round(smoker_share * 100, 4)

# Percentage of smoker entries per region, computed in the same order as `regions`.
se_perc_smoker, sw_perc_smoker, ne_perc_smoker, nw_perc_smoker = [
    smoker_avs(bucket)
    for bucket in (southeast_ents, southwest_ents, northeast_ents, northwest_ents)
]
av_perc_smoker = [se_perc_smoker, sw_perc_smoker, ne_perc_smoker, nw_perc_smoker]

# Map each percentage back to its region name so the maximum can be labelled.
perc_smoker_dict = {share: name for share, name in zip(av_perc_smoker, regions)}
max_reg_perc_smoker = max(av_perc_smoker)

# With the percentages in ascending order, the runner-up sits second from the end.
av_perc_smoker_sorted = list(av_perc_smoker)
av_perc_smoker_sorted.sort()
sec_largest_perc_smoker = av_perc_smoker_sorted[-2]

# NOTE: this gap is a plain subtraction of two percentages (percentage
# points), not a relative change, even though the message prints it with '%'.
perc_diff_perc_smoker = round(max_reg_perc_smoker - sec_largest_perc_smoker, 2)

print('The region with the highest smoker population is the ', perc_smoker_dict[max_reg_perc_smoker], ' region. The smoker population there is ', max_reg_perc_smoker, '%, which is ', perc_diff_perc_smoker, '% higher than the second highest regional smoker population.', sep='')


The region with the highest smoker population is the Southeast region. The smoker population there is 25.0%, which is 4.32% higher than the second highest regional smoker population.


### The Southeast leads the pack in 3 of 5 categories - BMI, male population, and smoker status. All of these may contribute to high insurance costs in that region. Further statistical analysis and hypothesis testing would be needed to determine whether any of those factors are strong drivers of the high costs.