# U.S. Medical Insurance Costs

## Step 1: 
### Import insurance.csv into Python file and inspect the contents.

In [76]:
import csv
def csv_read (key):
    lst = []
    with open('insurance.csv') as insurance_csv:
        csv_reader = csv.DictReader(insurance_csv)
        for row in csv_reader:
            lst.append(row.get(key))
    return lst
        

## Step 2: 
### Save dataset via Python variables

In [81]:
ages = csv_read('age')
sex = csv_read('sex')
bmi = csv_read('bmi')
region = csv_read('region')
smoker = csv_read('smoker')
charges = csv_read('charges')
children = csv_read('children')
#print(ages)

## Step 3: 
### Look for biases in the dataset and explore variables

#### Qualitative Variables

In [78]:
sample_size = len(ages)

# Sex Distribution
male_count = 0
female_count = 0
for item in sex:
    if item == 'male':
        male_count += 1
    elif item == 'female':
        female_count += 1
    
print('The ratio between males and females in this dataset is of ' + str(round(male_count / female_count,1)) + '. This sample is comprised of ' + str(male_count) + ' males and ' + str(female_count) + ' females, which form ' + str(round((male_count / sample_size)*100,1)) + '% and ' + str(round((female_count/sample_size)*100,1)) + "% of the population respectively")

# Smoking Distribution
smoker_count = 0
non_smoker_count = 0
for item in smoker:
    if item == 'yes':
        smoker_count += 1
    elif item == 'no':
        non_smoker_count += 1

print('The ratio between smokers and non_smokers in this dataset is of ' + str(round(smoker_count / non_smoker_count,1)) + '. This sample is comprised of ' + str(smoker_count) + ' smokers and ' + str(non_smoker_count) + ' non_smokers, which form ' + str(round((smoker_count / sample_size)*100,1)) + '% and ' + str(round((non_smoker_count/sample_size)*100,1)) + "% of the population respectively")

sex_dict={'male':[], 'female':[]}
for item in range(len(sex)):
    if sex[item] == 'male':
        sex_dict['male'].append(smoker[item])
    elif sex[item] == 'female':
        sex_dict['female'].append(smoker[item])
#print(sex_dict)

male_smoke = sex_dict['male'].count('yes')
female_smoke = sex_dict['female'].count('yes')
print('The number of male smokers in the dataset is ' + str(round(male_smoke,1)) + ' making up ' + str(round((male_smoke / male_count * 100),1)) + '% of the male population. While the number of female smokers in the dataset is ' + (str(round(female_smoke,1))) + ' making up ' + str(round((female_smoke/female_count)* 100,1)) + '% of the female population.')

# Region Distribution

southwest_count = 0
southeast_count = 0
northwest_count = 0
northeast_count = 0
for item in region:
    if item == 'southwest':
        southwest_count += 1
    elif item == 'southeast':
        southeast_count += 1
    elif item == 'northwest':
        northwest_count +=1
    elif item == 'northeast':
        northeast_count += 1

print('Out of the total population, ' + str(southwest_count) + ' people are from the Southwest, representing ' + str(round((southwest_count/sample_size)*100,1)) + '%.')
print('Out of the total population, ' + str(southeast_count) + ' people are from the Southeast, representing ' + str(round((southeast_count/sample_size)*100,1)) + '%.')
print('Out of the total population, ' + str(northwest_count) + ' people are from the Northwest, representing ' + str(round((northwest_count/sample_size)*100,1)) + '%.')
print('Out of the total population, ' + str(northeast_count) + ' people are from the Northeast, representing ' + str(round((northeast_count/sample_size)*100,1)) + '%.')


The ratio between males and females in this dataset is of 1.0. This sample is comprised of 676 males and 662 females, which form 50.5% and 49.5% of the population respectively
The ratio between smokers and non_smokers in this dataset is of 0.3. This sample is comprised of 274 smokers and 1064 non_smokers, which form 20.5% and 79.5% of the population respectively
The number of male smokers in the dataset is 159 making up 23.5% of the male population. While the number of female smokers in the dataset is 115 making up 17.4% of the female population.
Out of the total population, 325 people are from the Southwest, representing 24.3%.
Out of the total population, 364 people are from the Southeast, representing 27.2%.
Out of the total population, 325 people are from the Northwest, representing 24.3%.
Out of the total population, 324 people are from the Northeast, representing 24.2%.


### Quantitative Variables

In [112]:
# Age Analysis

def average_age(list):
    sum = 0
    length = 0
    for item in list:
        sum += int(item)
        length += 1
    return sum/length

print(round(average_age(ages),1))

age_16_25_count = 0
age_26_35_count = 0
age_36_45_count = 0
age_46_55_count = 0
age_56_65_count = 0
for item in ages:
    if item >= '16' and item <='25':
        age_16_25_count += 1
    elif item >= '26' and item <= '35':
        age_26_35_count += 1
    elif item >= '36' and item <= '45':
        age_36_45_count += 1
    elif item >= '46' and item <= '55':
        age_46_55_count += 1
    elif item >= '56' and item <= '65':
        age_56_65_count += 1

print ('Within the sample, there are ' + str(age_16_25_count) + ' people between the ages of 16 and 25 ('+ str(round((age_16_25_count/sample_size)*100,1))+ '% of the population), ' + str(age_26_35_count)+ ' people between the ages of 26 and 35 ('+ str(round((age_26_35_count/sample_size)*100,1))+ '% of the population), ' + str(age_36_45_count) + ' people between the ages of 36 and 45 (' + str(round((age_36_45_count/sample_size)*100,1))+ '% of the population), ' + str(age_46_55_count) + ' people between the ages of 46 and 55 ('+ str(round((age_46_55_count/sample_size)*100,1))+ '% of the population) and ' + str(age_56_65_count) + ' people between the ages of 56 and 65 ('+ str(round((age_56_65_count/sample_size)*100,1))+ '% of the population)')

# BMI Analysis

underweight = 0
normal = 0
overweight = 0
obese = 0
for item in bmi:
    if item <'18.5':
        underweight += 1
    elif item >= '18.5' and item <= '24.9':
        normal += 1
    elif item >= '25' and item <= '29.9':
        overweight += 1
    elif item >= '30':
        obese += 1

print('There are ' + str(underweight) + ' people classified as "underweight", comprising ' + str(round((underweight/sample_size)* 100,1))+ '% of the total population')
print('There are ' + str(normal) + ' people classified as "normal", comprising ' + str(round((normal/sample_size)* 100,1))+ '% of the total population')
print('There are ' + str(overweight) + ' people classified as "overweight", comprising ' + str(round((overweight/sample_size)* 100,1))+ '% of the total population')
print('There are ' + str(obese) + ' people classified as "obese", comprising ' + str(round((obese/sample_size)* 100,1))+ '% of the total population')





39.2
Within the sample, there are 306 people between the ages of 16 and 25 (22.9% of the population), 268 people between the ages of 26 and 35 (20.0% of the population), 264 people between the ages of 36 and 45 (19.7% of the population), 284 people between the ages of 46 and 55 (21.2% of the population) and 216 people between the ages of 56 and 65 (16.1% of the population)
There are 20 people classified as "underweight", comprising 1.5% of the total population
There are 222 people classified as "normal", comprising 16.6% of the total population
There are 377 people classified as "overweight", comprising 28.2% of the total population
There are 707 people classified as "obese", comprising 52.8% of the total population
