In [1]:
# Raw GitHub location of the medical charges CSV used throughout this notebook.
medical_charges_url = (
    'https://raw.githubusercontent.com/'
    'JovianML/opendatasets/master/data/medical-charges.csv'
)

In [2]:
from urllib.request import urlretrieve

# Download the CSV into the working directory as 'medical.csv'.
# The (filename, headers) pair returned by urlretrieve is shown as the cell output.
urlretrieve(url=medical_charges_url, filename='medical.csv')

('medical.csv', <http.client.HTTPMessage at 0x181522f8e10>)

In [3]:
!pip install pandas --quiet


[notice] A new release of pip is available: 23.0.1 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd

# Parse the downloaded CSV into the DataFrame that the remaining cells analyse.
medical_df = pd.read_csv(filepath_or_buffer='medical.csv')

In [5]:
# Preview the loaded frame (the recorded output elides the middle rows with '...').
medical_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [6]:
# Print column names, non-null counts, dtypes and memory usage of the frame.
medical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
medical_df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [8]:
!pip install matplotlib seaborn --quiet


[notice] A new release of pip is available: 23.0.1 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

In [None]:
# Summary statistics for the age column alone.
medical_df['age'].describe()

In [None]:
# Age distribution: a histogram stacked over a horizontal box plot on one shared x-axis.
ages = medical_df.age

# The histogram panel is four times as tall as the box-plot panel.
panel_heights = dict(height_ratios=[4, 1])
fig, (ax_hist, ax_box) = plt.subplots(
    nrows=2, ncols=1, sharex=True, figsize=(8, 6), gridspec_kw=panel_heights
)

# Upper panel: histogram
ax_hist.hist(ages, bins=47)
ax_hist.set_ylabel('Count')
ax_hist.set_title('Distribution of Age')

# Lower panel: horizontal box plot
ax_box.boxplot(ages, vert=False)
ax_box.set_xlabel('Age')

plt.tight_layout()
plt.show()

In [None]:

# BMI distribution: tall histogram on top, short horizontal box plot underneath,
# both drawn against the same x-axis.
bmi = medical_df.bmi

subplot_layout = {
    'nrows': 2,
    'ncols': 1,
    'figsize': (8, 6),
    'sharex': True,
    'gridspec_kw': {'height_ratios': [4, 1]},
}
fig, (ax_hist, ax_box) = plt.subplots(**subplot_layout)

# Top: histogram of BMI values
ax_hist.hist(bmi, bins=30)
ax_hist.set_ylabel('Count')
ax_hist.set_title('Distribution of BMI (Body Mass Index)')

# Bottom: box plot of the same values
ax_box.boxplot(bmi, vert=False)
ax_box.set_xlabel('BMI')

plt.tight_layout()
plt.show()

In [None]:

# Annual charges split by smoker status: overlaid histograms on top,
# side-by-side horizontal box plots underneath, sharing the x-axis.
charges_smoker = medical_df.loc[medical_df['smoker'] == 'yes', 'charges']
charges_nonsmoker = medical_df.loc[medical_df['smoker'] == 'no', 'charges']

fig, (ax_hist, ax_box) = plt.subplots(
    2, 1,
    figsize=(9, 6),
    sharex=True,
    gridspec_kw={'height_ratios': [4, 1]}
)

# Both groups are binned over the same range so their bars have identical
# widths and edges; otherwise the overlaid counts are not comparable.
charge_range = (medical_df['charges'].min(), medical_df['charges'].max())

# Histogram (overlaid)
ax_hist.hist(charges_smoker, bins=40, range=charge_range, alpha=0.7, label='Smoker')
ax_hist.hist(charges_nonsmoker, bins=40, range=charge_range, alpha=0.7, label='Non-Smoker')

ax_hist.set_title('Annual Medical Charges')
ax_hist.set_ylabel('Count')
ax_hist.legend()

# Boxplot
ax_box.boxplot([charges_nonsmoker, charges_smoker], vert=False)
# Name the boxes through the tick labels rather than boxplot's `labels`
# keyword, which newer Matplotlib releases deprecate; order matches the data list.
ax_box.set_yticklabels(['Non-Smoker', 'Smoker'])
ax_box.set_xlabel('Charges')

plt.tight_layout()
plt.show()

In [None]:
# Number of rows for each smoker value.
medical_df['smoker'].value_counts()

In [None]:
# Scatter of charges against age, one colour per smoker status.
smoker_yes = medical_df[medical_df['smoker'] == 'yes']
smoker_no = medical_df[medical_df['smoker'] == 'no']

plt.figure(figsize=(8, 6))

# Non-smokers are drawn first, then smokers on top.
for group, group_label in ((smoker_no, 'Non-Smoker'), (smoker_yes, 'Smoker')):
    plt.scatter(group['age'], group['charges'], alpha=0.8, s=20, label=group_label)

plt.title('Age vs. Charges')
plt.xlabel('Age')
plt.ylabel('Charges')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:

# Scatter of charges against BMI, coloured by smoker status.
is_smoker = medical_df['smoker'] == 'yes'
is_nonsmoker = medical_df['smoker'] == 'no'
smoker_yes = medical_df[is_smoker]
smoker_no = medical_df[is_nonsmoker]

plt.figure(figsize=(8, 6))

# Marker settings shared by both groups.
marker_style = {'alpha': 0.8, 's': 20}

plt.scatter(smoker_no['bmi'], smoker_no['charges'], label='Non-Smoker', **marker_style)
plt.scatter(smoker_yes['bmi'], smoker_yes['charges'], label='Smoker', **marker_style)

plt.title('BMI vs. Charges')
plt.xlabel('BMI')
plt.ylabel('Charges')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Correlation of charges with age and with BMI.
# A cell only displays its final expression, so both values are gathered
# into one Series instead of being computed on separate lines.
pd.Series({
    'age': medical_df.charges.corr(medical_df.age),
    'bmi': medical_df.charges.corr(medical_df.bmi),
}, name='correlation with charges')

In [None]:
# Encode smoker status as 0/1 so it can be correlated with charges.
smoker_values = dict(no=0, yes=1)
smoker_numeric = medical_df['smoker'].map(smoker_values)
medical_df['charges'].corr(smoker_numeric)

In [None]:
# Box plots of charges grouped by each categorical/discrete column.
# Each entry: (grouping column, figure size, title, x-axis label).
boxplot_specs = [
    ('children', (8, 5), 'Medical Charges by Number of Children', 'Number of Children'),
    ('sex', (6, 5), 'Medical Charges by Sex', 'Sex'),
    ('region', (8, 5), 'Medical Charges by Region', 'Region'),
    ('smoker', (6, 5), 'Medical Charges by Smoker Status', 'Smoker'),
]

for group_col, fig_size, title, x_label in boxplot_specs:
    # Create the axes explicitly and hand them to pandas so the plot is drawn
    # on this figure and the figsize requested here is the one that applies.
    fig, ax = plt.subplots(figsize=fig_size)
    medical_df.boxplot(column='charges', by=group_col, grid=False, ax=ax)
    ax.set_title(title)
    fig.suptitle('')  # clear the figure-level title so only `title` is shown
    ax.set_xlabel(x_label)
    ax.set_ylabel('Charges')
    fig.tight_layout()
    plt.show()

In [None]:
# Correlation between charges and age.
medical_df['charges'].corr(medical_df['age'])

In [None]:
# Correlation between charges and BMI.
medical_df['charges'].corr(medical_df['bmi'])

In [None]:
# Map the yes/no smoker flag to 1/0, then correlate it with charges.
smoker_values = {
    'no': 0,
    'yes': 1,
}
smoker_numeric = medical_df['smoker'].map(smoker_values)
medical_df['charges'].corr(smoker_numeric)

In [None]:
# Pairwise correlations, restricted to the numeric columns.
numeric_columns = medical_df.select_dtypes(include='number')
numeric_columns.corr()

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
correlation_matrix = medical_df.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True, cmap='Reds')
plt.title('Correlation Matrix');