# Initial conditions:

In [31]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()



In [32]:
api.dataset_download_file(dataset="teertha/ushealthinsurancedataset", file_name="insurance.csv")
main_data = pd.read_csv("insurance.csv")


Dataset URL: https://www.kaggle.com/datasets/teertha/ushealthinsurancedataset


In [33]:
main_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


# Cleaning the data:

In [34]:
main_data = main_data.round(decimals=2)

In [35]:
main_data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [36]:
print(main_data.duplicated().sum())
main_data[main_data.duplicated()]

1


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
581,19,male,30.59,0,no,northwest,1639.56


In [37]:
main_data.drop_duplicates(inplace = True)
print(main_data.duplicated().sum())

0


# Pre-analysis data hypotheses:

- older people would pay a higher premium
- people with a higher bmi would pay a higher premium
- region with the highest average charges would be the most wealthy on average

# Cursory view of data trends:

In [44]:
main_data.describe()

Unnamed: 0,age,bmi,children,charges
count,1337.0,1337.0,1337.0,1337.0
mean,39.222139,30.663628,1.095737,13279.121503
std,14.044333,6.100233,1.205571,12110.359677
min,18.0,15.96,0.0,1121.87
25%,27.0,26.29,0.0,4746.34
50%,39.0,30.4,1.0,9386.16
75%,51.0,34.7,2.0,16657.72
max,64.0,53.13,5.0,63770.43


In [45]:
main_data.describe(include="object")

Unnamed: 0,sex,smoker,region
count,1337,1337,1337
unique,2,2,4
top,male,no,southeast
freq,675,1063,364


In [53]:
print(main_data["sex"].value_counts())
print(main_data["smoker"].value_counts())
print(main_data["region"].value_counts())

sex
male      675
female    662
Name: count, dtype: int64
smoker
no     1063
yes     274
Name: count, dtype: int64
region
southeast    364
southwest    325
northwest    324
northeast    324
Name: count, dtype: int64


most and least expensive insurance charges:

In [38]:
main_data.sort_values(by='charges', ascending=False).head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
543,54,female,47.41,0,yes,southeast,63770.43
1300,45,male,30.36,0,yes,southeast,62592.87
1230,52,male,34.48,3,yes,northwest,60021.4
577,31,female,38.1,1,yes,northeast,58571.07
819,33,female,35.53,0,yes,northwest,55135.4
1146,60,male,32.8,0,yes,southwest,52590.83
34,28,male,36.4,1,yes,southwest,51194.56
1241,64,male,36.96,2,yes,southeast,49577.66
1062,59,male,41.14,1,yes,southeast,48970.25
488,44,female,38.06,0,yes,southeast,48885.14


In [39]:
main_data.sort_values(by='charges', ascending=False).tail(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1317,18,male,53.13,0,no,southeast,1163.46
442,18,male,43.01,0,no,southeast,1149.4
781,18,male,41.14,0,no,southeast,1146.8
866,18,male,37.29,0,no,southeast,1141.45
194,18,male,34.43,0,no,southeast,1137.47
22,18,male,34.1,0,no,southeast,1137.01
663,18,male,33.66,0,no,southeast,1136.4
1244,18,male,33.33,0,no,southeast,1135.94
808,18,male,30.14,0,no,southeast,1131.51
940,18,male,23.21,0,no,southeast,1121.87


highest and lowest BMIs:

In [40]:
main_data.sort_values(by='bmi', ascending=False).head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1317,18,male,53.13,0,no,southeast,1163.46
1047,22,male,52.58,1,yes,southeast,44501.4
847,23,male,50.38,1,no,southeast,2438.06
116,58,male,49.06,0,no,southeast,11381.33
286,46,female,48.07,2,no,northeast,9432.93
1088,52,male,47.74,1,no,southeast,9748.91
860,37,female,47.6,2,yes,southwest,46113.51
401,47,male,47.52,1,no,southeast,8083.92
543,54,female,47.41,0,yes,southeast,63770.43
438,52,female,46.75,5,no,southeast,12592.53


In [43]:
main_data.sort_values(by='bmi', ascending=False).tail(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
410,19,male,17.48,0,no,northwest,1621.34
680,21,female,17.4,1,no,southwest,2585.27
28,23,male,17.39,1,no,northwest,2775.19
1029,37,female,17.29,2,no,northeast,6877.98
250,18,male,17.29,2,yes,northeast,12829.46
1286,28,female,17.29,0,no,northeast,3732.63
412,26,female,17.2,2,yes,northeast,14455.64
428,21,female,16.82,1,no,northeast,3167.46
1226,38,male,16.82,2,no,northeast,6640.54
172,18,male,15.96,0,no,northeast,1694.8


# Groupby analysis:

In [54]:
main_data.groupby("sex")["charges"].mean()

sex
female    12569.578716
male      13974.999022
Name: charges, dtype: float64

In [60]:
main_data.groupby('region')['charges'].agg(["mean", "median"])

Unnamed: 0_level_0,mean,median
region,Unnamed: 1_level_1,Unnamed: 2_level_1
northeast,13406.384599,10057.655
northwest,12450.840648,8976.98
southeast,14735.411484,9294.13
southwest,12346.937508,8798.59


In [63]:
main_data['age_group'] = pd.cut(main_data['age'], bins=[18, 30, 40, 50, 60, 70], labels=['18-29', '30-39', '40-49', '50-59', '60-69'])
main_data.groupby('age_group')['charges'].mean()

  main_data.groupby('age_group')['charges'].mean()


age_group
18-29     9844.718102
30-39    11639.308599
40-49    14782.042989
50-59    17062.292792
60-69    21063.163297
Name: charges, dtype: float64

In [75]:
main_data.groupby("smoker")["charges"].mean()

smoker
no      8440.660310
yes    32050.231898
Name: charges, dtype: float64

In [73]:
main_data["bmi_group"] = pd.cut(main_data["bmi"], bins=[15.96, 18.5, 24.9, 29.9, 34.9, 39.9, 53.13] , labels=['Underweight (<18.5)', 'Normal (18.5-24.9)', 'Overweight (25-29.9)', 'Obese Class I (30-34.9)', 'Obese Class II (35-39.9)', 'Obese Class III (>=40)']
)
main_data.groupby("bmi_group")["charges"].mean()

  main_data.groupby("bmi_group")["charges"].mean()


bmi_group
Underweight (<18.5)          9005.762000
Normal (18.5-24.9)          10404.900045
Overweight (25-29.9)        11006.809842
Obese Class I (30-34.9)     14249.379848
Obese Class II (35-39.9)    17245.410487
Obese Class III (>=40)      16667.608280
Name: charges, dtype: float64

In [74]:
main_data.groupby(["bmi_group", "smoker"])["charges"].mean()

  main_data.groupby(["bmi_group", "smoker"])["charges"].mean()


bmi_group                 smoker
Underweight (<18.5)       no         5737.740667
                          yes       18809.826000
Normal (18.5-24.9)        no         7616.209064
                          yes       19942.223200
Overweight (25-29.9)      no         8348.367825
                          yes       22379.034028
Obese Class I (30-34.9)   no         8514.353137
                          yes       39204.496081
Obese Class II (35-39.9)  no         9621.300057
                          yes       42756.856923
Obese Class III (>=40)    no         8267.556111
                          yes       45467.787143
Name: charges, dtype: float64