In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [7]:
data = pd.read_csv("insurance.csv")
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# **A. Analisa Descriptive Statistic**

**1. Berapa rata-rata umur responden dari dataset ini?**

In [8]:
data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [11]:
data.shape

(1338, 7)

**2. Berapa rata rata nilai BMI dari yang merokok?**

In [12]:
#rata2 BMI dari perokok
data.groupby(['smoker']).agg({'bmi':['mean']}).round(1)

Unnamed: 0_level_0,bmi
Unnamed: 0_level_1,mean
smoker,Unnamed: 1_level_2
no,30.7
yes,30.7


In [13]:
#rata2 BMI dari perokok vs non perokok
data.groupby(['smoker','sex']).agg({'bmi':['mean']}).round(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,bmi
Unnamed: 0_level_1,Unnamed: 1_level_1,mean
smoker,sex,Unnamed: 2_level_2
no,female,30.5
no,male,30.8
yes,female,29.6
yes,male,31.5


**3. Apakah rata rata umur perempuan dan laki-laki yang merokok sama?**

In [14]:
#rata2 umur perokok berdasarkan gender
data.groupby(['sex', 'smoker']).agg({'age':['mean']}).round()

Unnamed: 0_level_0,Unnamed: 1_level_0,age
Unnamed: 0_level_1,Unnamed: 1_level_1,mean
sex,smoker,Unnamed: 2_level_2
female,no,40.0
female,yes,39.0
male,no,39.0
male,yes,38.0


**4. Berapa rata-rata tagihan dari perokok?**

In [15]:
#rata2 charges perokok
data.groupby(['smoker']).agg({'charges':['mean']}).round()

Unnamed: 0_level_0,charges
Unnamed: 0_level_1,mean
smoker,Unnamed: 1_level_2
no,8434.0
yes,32050.0


**5. Mana yang lebih tinggi, rata rata tagihan kesehatan perokok yang BMI nya diatas 25
atau non perokok yang BMI nya diatas 25**

In [16]:
#new column with BMI category
bmi_condition = [data['bmi'] < 18.5, data['bmi'] < 25, data['bmi'] < 30, data['bmi'] >= 30]
category = ["underweight", "healthy", "overweight", "obese"]

data['kategori'] = np.select(bmi_condition, category)
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,kategori
0,19,female,27.9,0,yes,southwest,16884.924,overweight
1,18,male,33.77,1,no,southeast,1725.5523,obese
2,28,male,33.0,3,no,southeast,4449.462,obese
3,33,male,22.705,0,no,northwest,21984.47061,healthy
4,32,male,28.88,0,no,northwest,3866.8552,overweight


In [17]:
# rata2 tagihan perokok yang overweight
data[data.kategori =='overweight'].groupby(['smoker','kategori']).agg({'charges':['mean']}).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,charges
Unnamed: 0_level_1,Unnamed: 1_level_1,mean
smoker,kategori,Unnamed: 2_level_2
no,overweight,8257.96
yes,overweight,22495.87


# **B. Analisa Variabel Diskrit**

**1. **

In [18]:
#rata2 tagihan laki laki vs perempuan
data.groupby(['sex']).agg({'charges':['mean']}).round()

Unnamed: 0_level_0,charges
Unnamed: 0_level_1,mean
sex,Unnamed: 1_level_2
female,12570.0
male,13957.0


In [19]:
#distribusi peluang banyak data dari tiap region
region_total = data['region'].count()
region = data.groupby(['region']).agg({'region':'count'})

region['percentage'] = 100 * region['region'] / region_total #persentase
region = region.round()


region

Unnamed: 0_level_0,region,percentage
region,Unnamed: 1_level_1,Unnamed: 2_level_1
northeast,324,24.0
northwest,325,24.0
southeast,364,27.0
southwest,325,24.0


In [20]:
# Proporsi perokok dan non-perokok
smoker = data.groupby(['smoker']).agg({'smoker':['count']}).round(1)
smoker['percentage'] = 100 * smoker['smoker'] / 1338
smoker = smoker.round(1)

smoker

Unnamed: 0_level_0,smoker,percentage
Unnamed: 0_level_1,count,Unnamed: 2_level_1
smoker,Unnamed: 1_level_2,Unnamed: 2_level_2
no,1064,79.5
yes,274,20.5


In [21]:
#proporsi perokok vs non perokok berdasarkan region
smoker_region = data.groupby(['region','smoker']).agg(smoker_count=('smoker', 'size')
        )

smoker_region['percentage'] = smoker_region.groupby('region')['smoker_count'].transform(lambda x: x/x.sum()*100)

smoker_region.round(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,smoker_count,percentage
region,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
northeast,no,257,79.3
northeast,yes,67,20.7
northwest,no,267,82.2
northwest,yes,58,17.8
southeast,no,273,75.0
southeast,yes,91,25.0
southwest,no,267,82.2
southwest,yes,58,17.8


In [23]:
# Proporsi perokok laki2 vs perempuan
smoker_gender = data[data.smoker=='yes'].groupby(['sex'])['smoker'].agg('count').round(1)
smoker_gender = smoker_gender.to_frame()
smoker_gender['percentage'] = 100 * smoker_gender['smoker'] / 274
smoker_gender = smoker_gender.round(1)

smoker_gender

Unnamed: 0_level_0,smoker,percentage
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,115,42.0
male,159,58.0


# **C. Analisa Variabel Kontinu**

**1. Peluang seseorang mendapatkan tagihan diatas $16.7k dan BMI orang tersebut diatas 25**

In [24]:
#filter jumlah BMI > 25 dan punya tagihan > 16.7k
condition_1 = data[(data['bmi']>25) & (data['charges']> 16700)]
condition_1

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,kategori
0,19,female,27.900,0,yes,southwest,16884.92400,overweight
9,60,female,25.840,0,no,northwest,28923.13692,overweight
11,62,female,26.290,0,yes,southeast,27808.72510,overweight
14,27,male,42.130,0,yes,southeast,39611.75770,obese
19,30,male,35.300,0,yes,southwest,36837.46700,obese
...,...,...,...,...,...,...,...,...
1313,19,female,34.700,2,yes,southwest,36397.57600,obese
1318,35,male,39.710,4,no,northeast,19496.71917,obese
1321,62,male,26.695,0,yes,northeast,28101.33305,overweight
1323,42,female,40.370,2,yes,southeast,43896.37630,obese


In [25]:
p_irisan_1 = 283
p_bmi_over_25 = len(data[data['bmi']>25])

#peluang tagihan seseorang > 16.7k diketahui BMI > 25
p_over16_given_over25 = p_irisan_1 / p_bmi_over_25 *100
print(f'Peluang mendapatkan tagihan > 16.7k diketahui BMI > 25: {p_over16_given_over25:.2f}%')

Peluang mendapatkan tagihan > 16.7k diketahui BMI > 25: 25.94%


**2. Peluang seseorang mendapatkan tagihan diatas $16.7k dan BMI orang tersebut dibawah 25**

In [26]:
#filter jumlah BMI < 25 dan punya tagihan > 16.7k
condition_2 = data[(data['bmi']<25) & (data['charges']>16700)]
condition_2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,kategori
3,33,male,22.705,0,no,northwest,21984.47061,healthy
58,53,female,22.88,1,yes,southeast,23244.7902,healthy
62,64,male,24.7,1,no,northwest,30166.61817,healthy
69,28,male,23.98,3,yes,southeast,17663.1442,healthy
85,45,male,22.895,2,yes,northwest,21098.55405,healthy


In [27]:
p_irisan_2 = len(data[(data['bmi']<25) & (data['charges']>16700)])
p_bmi_under_25 = len(data[data['bmi']<25])

#peluang tagihan seseorang > 16.7k diketahui BMI < 25
p_over16_under25 = p_irisan_2 / p_bmi_under_25 *100
print(f'Peluang mendapatkan tagihan > 16.7k diketahui BMI < 25: {p_over16_under25:.2f}%')

Peluang mendapatkan tagihan > 16.7k diketahui BMI < 25: 20.82%


**3. Peluang seseorang mendapatkan tagihan diatas $16.7k, Perokok, dan BMI diatas 25**

In [28]:
#filter perokok dengan BMI > 25 dan punya tagihan > 16.7k
condition_3 = data[(data['bmi']>25) & (data['charges']> 16700) & (data['smoker']=='yes')]
condition_3

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,kategori
0,19,female,27.900,0,yes,southwest,16884.92400,overweight
11,62,female,26.290,0,yes,southeast,27808.72510,overweight
14,27,male,42.130,0,yes,southeast,39611.75770,obese
19,30,male,35.300,0,yes,southwest,36837.46700,obese
23,34,female,31.920,1,yes,northeast,37701.87680,obese
...,...,...,...,...,...,...,...,...
1308,25,female,30.200,0,yes,southwest,33900.65300,obese
1313,19,female,34.700,2,yes,southwest,36397.57600,obese
1321,62,male,26.695,0,yes,northeast,28101.33305,overweight
1323,42,female,40.370,2,yes,southeast,43896.37630,obese


In [29]:
#filter perokok dengan BMI > 25
condition_4 = data[(data['bmi']>25) & (data['smoker']=='yes')]
condition_4

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,kategori
0,19,female,27.900,0,yes,southwest,16884.92400,overweight
11,62,female,26.290,0,yes,southeast,27808.72510,overweight
14,27,male,42.130,0,yes,southeast,39611.75770,obese
19,30,male,35.300,0,yes,southwest,36837.46700,obese
23,34,female,31.920,1,yes,northeast,37701.87680,obese
...,...,...,...,...,...,...,...,...
1308,25,female,30.200,0,yes,southwest,33900.65300,obese
1313,19,female,34.700,2,yes,southwest,36397.57600,obese
1321,62,male,26.695,0,yes,northeast,28101.33305,overweight
1323,42,female,40.370,2,yes,southeast,43896.37630,obese


In [30]:
p_smoker_over25_over16 = 215
p_smoker_over25 = 219

#peluang tagihan seseorang > 16.7k, diketahui BMI < 25 dan perokok
p_over16_given_over25_smoker = p_smoker_over25_over16 / p_smoker_over25 *100
print(f'Peluang mendapatkan tagihan > 16.7k, diketahui BMI > 25 dan juga perokok: {p_over16_given_over25_smoker:.2f}%')

Peluang mendapatkan tagihan > 16.7k, diketahui BMI > 25 dan juga perokok: 98.17%


**4. Peluang tagihan > 16.7k, non-perokok dan BMInya dibawah 25**

In [32]:
#filter non-perokok dengan BMI > 25 dan punya tagihan > 16.7k
condition_5 = data[(data['bmi']>25) & (data['charges']> 16700) & (data['smoker']=='no')]
condition_5

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,kategori
9,60,female,25.840,0,no,northwest,28923.13692,overweight
45,55,male,37.300,0,no,southwest,20630.28351,obese
102,18,female,30.115,0,no,northeast,21344.84670,obese
115,60,male,28.595,0,no,northeast,30259.99556,overweight
138,54,female,31.900,3,no,southeast,27322.73386,obese
...,...,...,...,...,...,...,...,...
1195,19,female,27.930,3,no,northwest,18838.70366,overweight
1206,59,female,34.800,2,no,southwest,36910.60803,obese
1211,39,male,34.100,2,no,southeast,23563.01618,obese
1258,55,male,37.715,3,no,northwest,30063.58055,obese


In [33]:
#filter non-perokok dengan BMI > 25
condition_6 = data[(data['bmi']>25) & (data['smoker']=='no')]
condition_6

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,kategori
1,18,male,33.77,1,no,southeast,1725.5523,obese
2,28,male,33.00,3,no,southeast,4449.4620,obese
4,32,male,28.88,0,no,northwest,3866.8552,overweight
5,31,female,25.74,0,no,southeast,3756.6216,overweight
6,46,female,33.44,1,no,southeast,8240.5896,obese
...,...,...,...,...,...,...,...,...
1332,52,female,44.70,3,no,southwest,11411.6850,obese
1333,50,male,30.97,3,no,northwest,10600.5483,obese
1334,18,female,31.92,0,no,northeast,2205.9808,obese
1335,18,female,36.85,0,no,southeast,1629.8335,obese


In [34]:
p_nonSmoker_over25_over16 = 68
p_nonSmoker_over25 = 872

#peluang tagihan seseorang > 16.7k, diketahui BMI < 25 dan non-perokok
p_over16_given_over25_nonSmoker = p_nonSmoker_over25_over16 / p_nonSmoker_over25 *100
print(f'Peluang mendapatkan tagihan > 16.7k, diketahui BMI > 25 dan juga non-perokok: {p_over16_given_over25_nonSmoker:.2f}%')

Peluang mendapatkan tagihan > 16.7k, diketahui BMI > 25 dan juga non-perokok: 7.80%


# **D. KORELASI VARIABEL**

In [31]:
# Semua korelasi antar variabel
all_correlation = data.corr(method="pearson")
all_correlation.round(2)

Unnamed: 0,age,bmi,children,charges
age,1.0,0.11,0.04,0.3
bmi,0.11,1.0,0.01,0.2
children,0.04,0.01,1.0,0.07
charges,0.3,0.2,0.07,1.0


# **E. UJI HIPOTESIS**

In [42]:
from scipy.stats import binom
import scipy.stats as stats
from statsmodels.stats.weightstats import ztest as ztest
from scipy.stats import ttest_ind
import math
from statsmodels.stats.proportion import proportions_ztest

**1. Hipotesis 1**

In [37]:
alpha = 0.05

# grup 1 = smoker
# grup 2 = non smoker
rataan1 = 32050
rataan2 = 8434

# Menentukan sample variance
s_x = 133207311
s_y = 35925420

# Ukuran sampel dua kelompok
sample1 = 274
sample2 = 1064

#derajat kebebasan
df1 = 274 -1
df2 = 1064 - 1

# Hitung uji statistik
# s_x menjadi pembilang karena lebih besar
f_test=round(s_x/s_y, 2)

print(f_test)
print("Dari program di atas hasil uji statistik adalah : ", f_test)

3.71
Dari program di atas hasil uji statistik adalah :  3.71


In [39]:
from scipy import stats

# Nilai Critical Value berdasarkan critical f-value dari f-table dengan nilai alpha 0.05
f_crit=round(stats.f.ppf(1-0.05, df1, df2), 2)
print(f_crit)
print("Dengan nilai uji statistik(f-test):", f_test," dan nilai f_crit:", f_crit)

1.17
Dengan nilai uji statistik(f-test): 3.71  dan nilai f_crit: 1.17


In [40]:
# Menentukan aturan keputusan
if f_test>f_crit:
  print(f"Tolak null hypothesis di taraf signifikansi 5% karena F > {f_crit} (F={f_test})")
  print('Varians tagihan perokok tidak sama dengan non-perokok')
else:
  print(f"Gagal tolak null hypothesis di taraf signifikansi 5% karena F < {f_crit} (F={f_test})")
  print('Variansi tagihan perokok sama dengan non-perokok')

Tolak null hypothesis di taraf signifikansi 5% karena F > 1.17 (F=3.71)
Varians tagihan perokok tidak sama dengan non-perokok


**2. Hipotesis 2**

In [43]:
# Menentukan jumlah kejadian ukuran sampel (n), proporsi pada null hypothesis, dan proporsi sample
n = 274
p = 0.50
p_hat = 159/274

# Hitung uji statistik
test_stat=(p_hat-p)/(math.sqrt((p*(1-p))/(n)))
print(test_stat)

#Interpretasi hasil
print("Dari program di atas hasil uji statistik adalah : ", test_stat)

2.658137210652779
Dari program di atas hasil uji statistik adalah :  2.658137210652779


In [44]:
# Nilai Critical Value berdasarkan critical Z-value dari Z-table dengan nilai alpha 0.05
#Import package
z_crit=stats.norm.ppf(1-0.05)  # right-tailed test
print(z_crit)

#Interpretasi hasil
print("Dengan nilai uji statistik", test_stat," dan nilai z_crit", stats.norm.ppf(1-0.05) )

1.6448536269514722
Dengan nilai uji statistik 2.658137210652779  dan nilai z_crit 1.6448536269514722


In [45]:
# Menentukan aturan keputusan
if test_stat>z_crit:
    print(f"Tolak null hypothesis di taraf signifikansi 5% karena Z > {z_crit:.2f} (Z = {test_stat:.2f})")
    print('Proporsi perokok laki-laki lebih besar dari proporsi perokok perempuan')
else:
    print("Gagal tolak null hypothesis di taraf signifikansi 5% karena Z < {z_crit:.2f} (Z = {test_stat:.2f})")
    print("Proporsi perokok laki-laki sama dengan proporsi perokok perempuan")

Tolak null hypothesis di taraf signifikansi 5% karena Z > 1.64 (Z = 2.66)
Proporsi perokok laki-laki lebih besar dari proporsi perokok perempuan


**3. Hipotesis 3**

In [46]:
smoker_array = np.array(data[data.smoker =='yes'].charges)
non_smoker_array = np.array(data[data.smoker =='no'].charges)

stat, p = ttest_ind(a = smoker_array, b = non_smoker_array, equal_var=False, alternative='greater') # equal_var= False karena varians kedua populasi berbeda

# Interpretasi Hasil
print(p)
print('Statistics = %.4f, p-value = %.4f' % (stat, p))

2.94473222335849e-103
Statistics = 32.7519, p-value = 0.0000


In [47]:
# Pengambilan Keputusan
if p > alpha:
    print(f'Gagal tolak null hypothesis di taraf signifikansi 5% karena P-value > {alpha} (P = {p:.2f})')
    print('Tagihan perokok sama dengan non-perokok')
else:
    print(f'Tolak null hypothesis di taraf signifikansi 5% karena P-value < {alpha} (P = {p:.2f})')
    print('Tagihan perokok lebih besar dari perokok')

Tolak null hypothesis di taraf signifikansi 5% karena P-value < 0.05 (P = 0.00)
Tagihan perokok lebih besar dari perokok
