# U.S. Medical Insurance Costs

For this project, we will analyze data from a medical insurance costs .csv file using python. The goal is to better know the patients in the file and gain insight into the potential use of the data.

In [4]:
#import library
import csv

The document insurance.csv contains the following information about patients:
- Age
- Sex
- Bmi
- Children
- Smoker
- Region
- Charges

To store this informations, we will create lists to hold each column of data.

In [5]:
#info we have about each patient
age = []
sex = []
bmi = []
children = []
smoker = []
region = []
charges = []

In [6]:
#import dataset and create list with function
def create_list_from_data(lst, csv_file, column_name):
    with open("insurance.csv", newline = "") as insurance:
        insurance_dataset = csv.DictReader(insurance)
        for row in insurance_dataset:
            lst.append(row[column_name])
        return lst

In [7]:
create_list_from_data(age, "insurance.csv", "age")
create_list_from_data(sex, "insurance.csv", "sex")
create_list_from_data(bmi, "insurance.csv", "bmi")
create_list_from_data(children, "insurance.csv", "children")
create_list_from_data(smoker, "insurance.csv", "smoker")
create_list_from_data(region, "insurance.csv", "region")
create_list_from_data(charges, "insurance.csv", "charges")

['16884.924',
 '1725.5523',
 '4449.462',
 '21984.47061',
 '3866.8552',
 '3756.6216',
 '8240.5896',
 '7281.5056',
 '6406.4107',
 '28923.13692',
 '2721.3208',
 '27808.7251',
 '1826.843',
 '11090.7178',
 '39611.7577',
 '1837.237',
 '10797.3362',
 '2395.17155',
 '10602.385',
 '36837.467',
 '13228.84695',
 '4149.736',
 '1137.011',
 '37701.8768',
 '6203.90175',
 '14001.1338',
 '14451.83515',
 '12268.63225',
 '2775.19215',
 '38711',
 '35585.576',
 '2198.18985',
 '4687.797',
 '13770.0979',
 '51194.55914',
 '1625.43375',
 '15612.19335',
 '2302.3',
 '39774.2763',
 '48173.361',
 '3046.062',
 '4949.7587',
 '6272.4772',
 '6313.759',
 '6079.6715',
 '20630.28351',
 '3393.35635',
 '3556.9223',
 '12629.8967',
 '38709.176',
 '2211.13075',
 '3579.8287',
 '23568.272',
 '37742.5757',
 '8059.6791',
 '47496.49445',
 '13607.36875',
 '34303.1672',
 '23244.7902',
 '5989.52365',
 '8606.2174',
 '4504.6624',
 '30166.61817',
 '4133.64165',
 '14711.7438',
 '1743.214',
 '14235.072',
 '6389.37785',
 '5920.1041',
 '176

To analyze the data, we will create a class called Patient. We will focus on analysis in two things:
- General information about our patients
    - Average age
    - Location
    - Nº of females vs. Nº of males
    - Nª of smokers vs. Nº of non-smokers
    - Num of children per patient
- Average costs depending on:
    - Smoking
    - Females vs Males
    - How many patients are above or below average


In [297]:
#defining class and methods

class Patient:
    
    def __init__(self, age, sex, bmi, children, smoker, region, charges):
        self.age = age
        self.sex = sex
        self.bmi = bmi
        self.children = children
        self.smoker = smoker
        self.region = region
        self.charges = charges
        
    def patient_info(self):
        patient_info = {"Nº of patiens": len(self.age), 
                        "Age": self.age, 
                        "Sex": self.sex, 
                        "Bmi": self.bmi, 
                        "Number of children": self.children, 
                        "Smoker": self.smoker, 
                        "Region": self.region, 
                        "Charges": self.charges}
        return patient_info
          
    
    def average_age(self):
        ages_float = list(map(float, self.age))
        return "Average patients age: " + str(round(sum(ages_float) / len(ages_float), 2))
            
    def location(self):
        location = dict()
        for area in region:
            if area not in location:
                location[area] = 1
            if area in location:
                location[area] += 1
        return location
    
    def female_vs_male(self):
        females = 0
        males = 0
        for person in sex:
            if person == "female":
                females += 1
            else:
                males += 1
        return "Nº of females: " + str(females) + " and Nº of males: " + str(males)
    
    def smokers(self):
        smokers = {"Smokers": 0, "Non-smokers": 0}
        for patient in smoker:
            if patient == "yes":
                smokers["Smokers"] += 1
            else:
                smokers["Non-smokers"] += 1
        return smokers
    
    def num_of_children(self):
        num_of_children = dict()
        for num in children:
            if num not in num_of_children:
                num_of_children[num] = 1
            if num in num_of_children:
                num_of_children[num] += 1
        return dict(sorted(num_of_children.items())) 
    
    def average_insurance_cost(self):
        charges_float = list(map(float, self.charges))
        return "Average insurance cost per patient: {}".format(round(sum(charges_float) / len(charges_float), 2))
    
    def average_cost_if_smoker(self):
        charge_if_smoker = []
        for i in range(0, len(charges)):
            for i in range(0, len(smoker)):
                if smoker[i] == "yes":
                    charge_if_smoker.append(float(charges[i]))
        average_cost_if_smoker = round(sum(charge_if_smoker) / len(charge_if_smoker), 2)
        return "Average cost if patients are smokers: " + str(average_cost_if_smoker) 
    
    def average_cost_sex(self):
        charge_if_female = []
        charge_if_male = []
        for i in range(0, len(sex)):
            for i in range(0, len(charges)):
                if sex[i] == "female":
                    charge_if_female.append(float(charges[i]))
                else:
                    charge_if_male.append(float(charges[i]))
        return "Average cost if patients are female: " + str(round(sum(charge_if_female) / len(charge_if_female), 2))\
               + " and average cost if patients are male: " + str(round(sum(charge_if_male) / len(charge_if_male), 2))
        
    def above_below_average(self):
        charges_float = list(map(float, self.charges))
        average = {"Above": 0, "Below": 0, "Equal": 0}
        for cost in charges_float:
            if cost > sum(charges_float) / len(charges_float):
                average["Above"] += 1
            elif cost < sum(charges_float) / len(charges_float):
                average["Below"] += 1
            else:
                average["Equal"] += 1
        return average

The next step is to create an instance of the class so we can use each method we have created for the analysis.

First, we created a dictionary to organize our dataset.

In [298]:
patient = Patient(age, sex, bmi, children, smoker, region, charges)
print(patient.patient_info())

{'Nº of patiens': 1338, 'Age': ['19', '18', '28', '33', '32', '31', '46', '37', '37', '60', '25', '62', '23', '56', '27', '19', '52', '23', '56', '30', '60', '30', '18', '34', '37', '59', '63', '55', '23', '31', '22', '18', '19', '63', '28', '19', '62', '26', '35', '60', '24', '31', '41', '37', '38', '55', '18', '28', '60', '36', '18', '21', '48', '36', '40', '58', '58', '18', '53', '34', '43', '25', '64', '28', '20', '19', '61', '40', '40', '28', '27', '31', '53', '58', '44', '57', '29', '21', '22', '41', '31', '45', '22', '48', '37', '45', '57', '56', '46', '55', '21', '53', '59', '35', '64', '28', '54', '55', '56', '38', '41', '30', '18', '61', '34', '20', '19', '26', '29', '63', '54', '55', '37', '21', '52', '60', '58', '29', '49', '37', '44', '18', '20', '44', '47', '26', '19', '52', '32', '38', '59', '61', '53', '19', '20', '22', '19', '22', '54', '22', '34', '26', '34', '29', '30', '29', '46', '51', '53', '19', '35', '48', '32', '42', '40', '44', '48', '18', '30', '50', '42', '1

Then, we have procced to analyse the information we have about the patients.

In [299]:
print(patient.average_age())
print(patient.location())
print(patient.female_vs_male())
print(patient.smokers())
print(patient.num_of_children())

Average patients age: 39.21
{'southwest': 326, 'southeast': 365, 'northwest': 326, 'northeast': 325}
Nº of females: 662 and Nº of males: 676
{'Smokers': 274, 'Non-smokers': 1064}
{'0': 575, '1': 325, '2': 241, '3': 158, '4': 26, '5': 19}


We see that we have more or less the same number of:
- People from every location
- Females vs. Males

Also, as we can see, most of our patients are non-smokers. Regarding number of children, the number of patients descend as the number of children increases. This tendency is present in most population pyramids.

Finally, we have calculated the general average insurance cost per patient, the average cost taking into account different indicators and how many patients are above, below or equal to the average cost.

In [300]:
print(patient.average_insurance_cost())
print(patient.average_cost_if_smoker())
print(patient.average_cost_sex())
print(patient.above_below_average())

Average insurance cost per patient: 13270.42
Average cost if patients are smokers: 32050.23
Average cost if patients are female: 12569.58 and average cost if patients are male: 13956.75
{'Above': 420, 'Below': 918, 'Equal': 0}


The results show us that if a person is a smokerer, the average cost in their insurance is higher than the average insurance cost for patients in general. Also, in further analysis, we should check why females have average insurance cost than males.