# U.S. Medical Insurance Costs

### Project goals:
1. Determine the number of males and females in the dataset to identify if there is any gender bias in the data
2. Calculate average insurance cost by age groups
3. Calculate average insurance cost by regions
4. Calculate average insurance cost for individuals with and without children
5. Calculate average insurance cost by smoker status
6. Create a JSON file for the results

In [1]:
import csv

In [9]:
# Reading the insurance.csv file and storing the contents of each column in a list
age = []
sex = []
bmi = []
children = []
smoker = []
region = []
charges = []

with open("insurance.csv") as file:
    insurance_csv = csv.DictReader(file)
    for line in insurance_csv:
        age.append(line["age"])
        sex.append(line["sex"])
        bmi.append(line["bmi"])
        children.append(line["children"])
        smoker.append(line["smoker"])
        region.append(line["region"])
        charges.append(line["charges"])

1. Determine the number of males and females in the dataset to identify if there is any gender bias in the data

In [25]:
# Iterating through the sex list to find the number of males and females
no_of_males = 0
no_of_females = 0

for item in sex:
    if item == "female":
        no_of_females += 1
    elif item == "male":
        no_of_males += 1

print("The number of males in the dataset is {}".format(no_of_males))
print("The number of females in the dataset is {}".format(no_of_females))

The number of males in the dataset is 676
The number of females in the dataset is 662


It can be observed that the number of males and females in the dataset is fairly equal. Therefore, the dataset does not appear to be biased.

2. Calculate average insurance cost by age groups

In [46]:
# Finding the minimum and maximum ages in the dataset
print("The minimum age in the dataset is", min(age))
print("The maximum age in the dataset is", max(age))

The minimum age in the dataset is 18
The maximum age in the dataset is 64


In [71]:
# Creating a function to divide the data into the following age groups: 18-25, 26-35, 36-45, 46-55, 56-65, and calculate the average insurance cost for each group
def find_average_cost_by_age_group(age, charges):
    age_groups = {"18-25": 0, "26-35":0, "36-45":0, "46-55":0, "56-65":0}
    total_cost_by_age_group = {key: 0 for key in age_groups}
    average_cost_by_age_group = {key: 0 for key in age_groups}
    # Finding the total number of individuals and total insurance cost for each age group
    for i in range(len(age)):
        years = int(age[i])
        cost = float(charges[i])
        if years > 18 and years < 25:
            key = "18-25"
        elif years > 26 and years < 35:
            key = "26-35"
        elif years > 36 and years < 45:
            key = "36-45"
        elif years > 46 and years < 55:
            key = "46-55"
        else:
            key = "56-65"
        age_groups[key] += 1
        total_cost_by_age_group[key] += cost
    # Finding the average insurance cost for each age group
    for key in total_cost_by_age_group:
        average_cost_by_age_group[key] = round(total_cost_by_age_group[key] / age_groups[key], 3)
    return average_cost_by_age_group

average_cost_by_age_group = find_average_cost_by_age_group(age, charges)

In [73]:
print(average_cost_by_age_group)

{'18-25': 9646.907, '26-35': 10968.73, '36-45': 13462.345, '46-55': 16174.962, '56-65': 14421.443}


3. Calculate average insurance cost by regions

In [62]:
# Creating a function to find the average insurance cost by region
def find_average_cost_by_region(region, charges):
    no_by_region = {}
    total_cost_by_region = {}
    average_cost_by_region = {}
    # Finding the total number of individuals and total cost for each region
    for i in range(len(region)):
        area = region[i]
        if no_by_region.get(area) == None:
            no_by_region[area] = 1
            total_cost_by_region[area] = float(charges[i])
        else:
            no_by_region[area] += 1
            total_cost_by_region[area] += float(charges[i])
    # Finding the average cost for each region
    for area in no_by_region:
        average_cost_by_region[area] = round(total_cost_by_region[area] / no_by_region[area], 3)
    return average_cost_by_region

average_cost_by_region = find_average_cost_by_region(region, charges)

In [64]:
print(average_cost_by_region)

{'southwest': 12346.937, 'southeast': 14735.411, 'northwest': 12417.575, 'northeast': 13406.385}


4. Calculate average insurance cost for individuals with and without children

In [85]:
# Creating a function to find the average insurance cost for individuals with and without children
def find_average_cost_with_and_without_children(children, charges):
    with_or_without_children = {"at least one child": 0, "no children": 0}
    total_cost_by_children = {"at least one child": 0, "no children": 0}
    average_cost_with_and_without_children = {"at least one child": 0, "no children": 0}
    # Finding the total number of individuals and the total cost for each category 
    for i in range(len(children)):
        if int(children[i]) == 0:
            key = "no children"
        else:
            key = "at least one child"
        with_or_without_children[key] += 1
        total_cost_by_children[key] += float(charges[i])
    # Finding the average cost for each category
    for key in with_or_without_children:
        average_cost_with_and_without_children[key] = round(total_cost_by_children[key] / with_or_without_children[key], 3)
    return average_cost_with_and_without_children

average_cost_with_and_without_children = find_average_cost_with_and_without_children(children ,charges)

In [87]:
print(average_cost_with_and_without_children)

{'at least one child': 13949.941, 'no children': 12365.976}


5. Calculate average insurance cost by smoker status

In [107]:
# Creating a function to find the average insurance cost by smoker status
def find_average_cost_by_smoker_status(smoker, charges):
    smoker_status_count = {"smoker": 0, "non smoker": 0}
    total_cost_by_smoker_status = {"smoker": 0, "non smoker": 0}
    average_cost_by_smoker_status = {"smoker": 0, "non smoker": 0}
    # Finding the total number of individuals and total cost by smoker status
    for i in range(len(smoker)):
        if smoker[i] == "yes":
            key = "smoker"
        else:
            key = "non smoker"
        smoker_status_count[key] += 1
        total_cost_by_smoker_status[key] += float(charges[i])
    # Finding the average cost by smoker status
    for key in smoker_status_count:
        average_cost_by_smoker_status[key] = round(total_cost_by_smoker_status[key] / smoker_status_count[key], 3)
    return average_cost_by_smoker_status

average_cost_by_smoker_status = find_average_cost_by_smoker_status(smoker, charges)

In [109]:
print(average_cost_by_smoker_status)

{'smoker': 32050.232, 'non smoker': 8434.268}


6. Create a JSON file for the results

In [126]:
# Importing the json library and writing the analysis results to a json file
import json
analysis_results = {"Average insurance cost by age groups": average_cost_by_age_group,
                    "Average insurance cost by regions": average_cost_by_region, 
                    "Average insurance cost for individuals with and without children": average_cost_with_and_without_children, 
                    "Average insurance cost by smoker status": average_cost_by_smoker_status}
with open("us_medical_insurance_costs.json", "w") as file:
    json.dump(analysis_results, file)

In [132]:
with open("us_medical_insurance_costs.json") as file:
    data = json.load(file)
    print(data)

{'Average insurance cost by age groups': {'18-25': 9646.907, '26-35': 10968.73, '36-45': 13462.345, '46-55': 16174.962, '56-65': 14421.443}, 'Average insurance cost by regions': {'southwest': 12346.937, 'southeast': 14735.411, 'northwest': 12417.575, 'northeast': 13406.385}, 'Average insurance cost for individuals with and without children': {'at least one child': 13949.941, 'no children': 12365.976}, 'Average insurance cost by smoker status': {'smoker': 32050.232, 'non smoker': 8434.268}}
