# U.S. Medical Insurance Costs

### Import data and create initial lists
Uses Python's `csv` library to read the CSV data source. A separate list is created for each parameter so its values can be cleaned easily for evaluation.

In [8]:
import csv
import itertools

Initialize lists

In [9]:
# One independent, initially empty list per CSV column.
age, sex, bmi, children, smoker, region, charges = ([] for _ in range(7))

Load data to lists

In [10]:
# Empty the column lists first so that re-running this cell does not append
# the file's rows a second time.
for column in (age, sex, bmi, children, smoker, region, charges):
    column.clear()

# newline='' is what the csv module documentation asks for when opening a
# file for csv.reader / csv.DictReader.
with open('insurance.csv', newline='') as ins_data:
    ins_dict = csv.DictReader(ins_data)
    # Every value is appended as the raw string read from the file;
    # conversion to numbers happens in a later cell.
    for row in ins_dict:
        age.append(row['age'])
        sex.append(row['sex'])
        bmi.append(row['bmi'])
        children.append(row['children'])
        smoker.append(row['smoker'])
        region.append(row['region'])
        charges.append(row['charges'])

#### Adjust lists to flags to use for evaluation
1. age: adjusted to integer values
2. sex: 1 for male, 0 for female
3. bmi: adjusted to float
4. children: adjusted to int
5. smoker: 1 for smoker, 0 for nonsmoker
6. region left as is for grouping for now
7. charges: adjusted to float

In [11]:
# Convert the raw CSV strings into numeric values and 0/1 flags.
# int()/float() accept values that are already numeric, and the two flag
# comprehensions also accept an existing 1, so running this cell a second
# time leaves the data unchanged instead of resetting every flag to 0.
age = [int(value) for value in age]
sex = [1 if value in ('male', 1) else 0 for value in sex]
bmi = [float(value) for value in bmi]
children = [int(value) for value in children]
smoker = [1 if value in ('yes', 1) else 0 for value in smoker]
charges = [float(value) for value in charges]

# Distinct region names in order of first appearance.
unique_regions = list(dict.fromkeys(region))

Defined a function to get a glimpse of the data by region

In [18]:
# summarize the records of one region
def overall(region_in=''):
    """Print summary statistics for the records matching ``region_in``.

    A record matches when ``region_in in region[i]`` is true, so the default
    empty string selects every record. The function reads the module-level
    lists ``region``, ``age``, ``bmi``, ``children``, ``smoker``, ``sex`` and
    ``charges``, prints either the report or a "no customers" message, and
    returns nothing.
    """
    report = "Region: {region}\nTotal Records: {total}\nAverage Age: {age}\nAverage BMI: {bmi}\nAverage Children: {child}\nSmoker Percent: {smoker}\nPercent Male: {male}\nAverage Charge: {charge}"

    matched = 0
    age_sum = 0
    bmi_sum = 0.0
    child_sum = 0
    smokers = 0
    males = 0
    charge_sum = 0.0

    for idx, row_region in enumerate(region):
        if region_in not in row_region:
            continue
        matched += 1
        age_sum += age[idx]
        bmi_sum += bmi[idx]
        child_sum += children[idx]
        smokers += smoker[idx]
        males += sex[idx]
        charge_sum += charges[idx]

    if matched == 0:
        print("There are no customers in this region.")
        return

    def as_percent(count):
        # Share of the matched records, rounded to two decimals, with a % sign.
        return str(round((count / matched) * 100, 2)) + '%'

    print(report.format(
        region=region_in,
        total=matched,
        age=round(age_sum / matched, 2),
        bmi=round(bmi_sum / matched, 2),
        child=round(child_sum / matched, 2),
        smoker=as_percent(smokers),
        male=as_percent(males),
        charge=round(charge_sum / matched, 2),
    ))

In [19]:
# Print the summary of every region, each followed by a blank line.
for region_name in unique_regions:
    overall(region_name)
    print()

Region: southwest
Total Records: 325
Average Age: 39.46
Average BMI: 30.6
Average Children: 1.14
Smoker Percent: 17.85%
Percent Male: 50.15%
Average Charge: 12346.94

Region: southeast
Total Records: 364
Average Age: 38.94
Average BMI: 33.36
Average Children: 1.05
Smoker Percent: 25.0%
Percent Male: 51.92%
Average Charge: 14735.41

Region: northwest
Total Records: 325
Average Age: 39.2
Average BMI: 29.2
Average Children: 1.15
Smoker Percent: 17.85%
Percent Male: 49.54%
Average Charge: 12417.58

Region: northeast
Total Records: 324
Average Age: 39.27
Average BMI: 29.17
Average Children: 1.05
Smoker Percent: 20.68%
Percent Male: 50.31%
Average Charge: 13406.38



#### Conclusions for regions:
1. Age and number of customers are fairly consistent across regions
2. BMI is highest in the southeast, at about 10% higher than other regions
3. The southeast also has the highest share of smokers (25% of its customers)
4. BMI & smoker % explain why, on average, the cost for medical insurance is higher in the southeast

Create test cases to narrow down the possibilities and reverse-engineer the cost formula.

In [32]:
def reverse_cost(age, age_mult, bmi, bmi_mult, child, child_mult, smoker, smoker_mult, sex, sex_mult):
    """Return the charge predicted by a weighted sum of the five factors.

    Each factor (age, bmi, child, smoker, sex) is multiplied by its own
    multiplier and the five products are added together; there is no
    constant term.
    """
    age_part = age * age_mult
    bmi_part = bmi * bmi_mult
    child_part = child * child_mult
    smoker_part = smoker * smoker_mult
    sex_part = sex * sex_mult
    return age_part + bmi_part + child_part + smoker_part + sex_part

In [21]:
def test_error(charge, predict_charge):
    return abs(charge-predict_charge)

Run test cases over the candidate multipliers for each parameter to approximate the charge. <br>
Compute the aggregate error of each multiplier combination and keep the most accurate one. <br>
If the best multiplier sits at the edge of the tested range, expand the range and test again.

In [30]:
def test_cases(age, age_mult, bmi, bmi_mult, child, child_mult, smoker, smoker_mult, sex, sex_mult, charges):
    smallest_error = float('inf')
    test_charge = 0.0
    close_age_mult = 0.0
    close_bmi_mult = 0.0
    close_child_mult = 0.0
    close_smoker_mult = 0
    close_sex_mult = 0
    for t in range(100):
        for j in age_mult:
                for l in bmi_mult:
                        for n in child_mult:
                                for p in smoker_mult:
                                        for r in sex_mult:
                                            tests = 0
                                            trial_error = 0.0
                                            for i in range(len(age)//10):
                                                test_charge = reverse_cost(age[i], j, bmi[i], l, child[i], n, smoker[i], p, sex[i], r)
                                                trial_error += test_error(charges[i], test_charge)
                                                tests += 1
                                            if round(trial_error/tests,2) < smallest_error:
                                                smallest_error = round(trial_error/tests,2)
                                                close_age_mult = j
                                                close_bmi_mult = l
                                                close_child_mult = n
                                                close_smoker_mult = p
                                                close_sex_mult = r
    print("Smallest Error: {ser}\nAge Mult: {cam}\nBMI Mult: {cbm}\nChild Mult: {ccm}\nSmoker Mult: {csmoke}\nMale Mult:{csex}"
          .format(ser=smallest_error, cam=close_age_mult, cbm=close_bmi_mult, ccm=close_child_mult, csmoke=close_smoker_mult, csex=close_sex_mult))

Due to my computer's limitations, I am keeping the computations here simple. This will only give a general idea of which factors have a positive or negative impact.

In [48]:
# Candidate multipliers to try for each factor in the grid search.
poss_age_mult = list(range(200, 251, 5))          # 200 .. 250, step 5
poss_bmi_mult = list(range(-100, 100, 1))         # -100 .. 99, step 1
poss_child_mult = list(range(300, 501, 10))       # 300 .. 500, step 10
poss_smoker_mult = list(range(10000, 50001, 1000))  # 10000 .. 50000, step 1000
poss_sex_mult = list(range(-700, -499, 10))       # -700 .. -500, step 10

In [None]:
# Run the grid search over every combination of the candidate multipliers;
# the best-scoring combination is printed by test_cases.
test_cases(
    age, poss_age_mult,
    bmi, poss_bmi_mult,
    children, poss_child_mult,
    smoker, poss_smoker_mult,
    sex, poss_sex_mult,
    charges,
)

#### Conclusion:
1. Age has a positive correlation with cost (the older the person, the more expensive insurance is)
2. BMI: **TODO** — state the conclusion for BMI once the multiplier search above has been run
3. Having more children increases the cost of medical insurance
4. Smoking has a large upcharge
5. Medical insurance is more expensive for women than men