# U.S. Medical Insurance Costs

In [69]:
#preparation
import csv
from decimal import Decimal


with open("insurance.csv") as insurance_data:
    data = csv.DictReader(insurance_data)
    ages = []
    sexes = []
    bmis = []
    numbers_of_children = []
    smoker_statuses = []
    regions = []
    charges = []
    for line in data:
        ages.append(int(line["age"]))
        sexes.append(line["sex"])
        bmis.append(line["bmi"])
        numbers_of_children.append(line["children"])
        smoker_statuses.append(line["smoker"])
        regions.append(line["region"])
        charges.append(line["charges"])


#perform some general analyses
total_count = 0
for entry in ages:
    total_count += 1
print("The dataset has {} entries".format(total_count))

total_cost = 0
for charge in charges:
    total_cost += float(charge)
average_cost = total_cost/len(charges)
print("The average charge is {charge}.".format(charge=average_cost))

total_age = 0
for age in ages:
    total_age += float(age)
average_age = total_age/len(ages)
print("The average age is {age}.".format(age=average_age))

pop_25 = 0
pop_40 = 0
pop_55 = 0
pop_O55 = 0

for i in range(len(ages)):
    if ages[i] <= 25:
        pop_25 += 1
    elif ages[i] <= 40:
        pop_40 += 1
    elif ages[i] <= 55:
        pop_55 += 1
    elif ages[i] > 50:
        pop_O55 += 1

print("Demographics:\nage < 25: {}\n25 < age <= 40: {}\n40 < age <= 55: {}\nage > 55: {}".format(pop_25, pop_40,pop_55,pop_O55))        
        
total_children = 0
for number_of_children in numbers_of_children:
    total_children += float(number_of_children)
average_number_of_children = total_children/len(numbers_of_children)
print("The number of children is {average}.".format(average=average_number_of_children))

children_dict = {}

for number_of_children in numbers_of_children:
    if number_of_children not in children_dict:
        children_dict[number_of_children] = 1
    else:
        n = children_dict[number_of_children]
        children_dict.update({number_of_children:n+1})
print("Children are distributed as follows: " + str(children_dict))

The dataset has 1338 entries
The average charge is 13270.422265141257.
The average age is 39.20702541106129.
Demographics:
age < 25: 306
25 < age <= 40: 395
40 < age <= 55: 421
age > 55: 216
The number of children is 1.0949177877429.
Children are distributed as follows: {'0': 574, '1': 324, '3': 157, '2': 240, '5': 18, '4': 25}


In [71]:
#region analysis
regions_analysis = {}
for region in regions:
    if region not in regions_analysis:
        regions_analysis[region] = 0
    else:
        i = regions_analysis[region]
        regions_analysis.update({region:i+1})

print("Regions are distributed as follows: " + str(regions_analysis))

Regions are distributed as follows: {'southwest': 324, 'southeast': 363, 'northwest': 324, 'northeast': 323}


In [67]:
#smoker analysis
total_smoker_charge = 0
total_non_smoker_charge = 0
number_of_smokers = smoker_statuses.count("yes")
number_of_non_smokers = smoker_statuses.count("no")
print("There are {smokers} smokers and {non_smokers} non-smokers in the dataset.".format(smokers=number_of_smokers,non_smokers=number_of_non_smokers) )

tot_smokers_25 = 0
tot_smokers_40 = 0
tot_smokers_55 = 0
tot_smokers_O55 = 0

for i in range(len(ages)):
    if ages[i] <= 25 and smoker_statuses[i] == "yes":
        tot_smokers_25 += 1
    elif ages[i] <= 40 and smoker_statuses[i] == "yes":
        tot_smokers_40 += 1
    elif ages[i] <= 55 and smoker_statuses[i] == "yes":
        tot_smokers_55 += 1
    elif ages[i] > 50 and smoker_statuses[i] == "yes":
        tot_smokers_O55 += 1

smoker25_percent = tot_smokers_25 / pop_25 *100
smoker40_percent = tot_smokers_40 / pop_40 *100
smoker55_percent = tot_smokers_55 / pop_55 *100
smokerO55_percent = tot_smokers_O55 / pop_O55 *100

print("Smokers by age:\nage <= 25: {}%\n25 < age <= 40: {}%\n40 < age <= 55: {}%\nage > 55: {}%".format(smoker25_percent, smoker40_percent, smoker55_percent, smokerO55_percent))
    

for i in range(len(smoker_statuses)):
    if smoker_statuses[i] == "yes":
        total_smoker_charge += float(charges[i])
    else:
        total_non_smoker_charge += float(charges[i])

avg_smoker_charge = total_smoker_charge/number_of_smokers
avg_non_smoker_charge = total_non_smoker_charge/number_of_non_smokers

deviation = (avg_smoker_charge/avg_non_smoker_charge)*100 -100

print("Smokers on average pay {}% more than non-smokers".format(deviation))
    

There are 274 smokers and 1064 non-smokers in the dataset.
Smokers by age:
age <= 25: 21.241830065359476%
25 < age <= 40: 21.265822784810126%
40 < age <= 55: 20.19002375296912%
age > 55: 18.51851851851852%
Smokers on average pay 280.00014582983204% more than non-smokers
