In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import (t, norm, chi2)

# ___Comparing Proportions for Two Independent Samples___
---------------

In [2]:
data = pd.read_csv("../data/nhanes_2015_2016.csv", usecols = ["SMQ020", "RIAGENDR", "RIDAGEYR", "RIDRETH1"]).dropna(axis = 0)

In [3]:
np.sort(data.RIDAGEYR.unique())

array([18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80], dtype=int64)

In [4]:
data.shape

(5735, 4)

In [5]:
data.SMQ020.unique()

array([1, 2, 7, 9], dtype=int64)

__SMQ020: Smoked at least 100 cigarettes in life__

1. 	Yes  
2. 	No  
7. 	Refused  	 	
9. 	Don't know     

In [6]:
# RIDRETH1 = 1 -> Mexican americans
# RIDRETH1 = 2 -> other hispanics

# data that includes only 80+ hispanic adults.

data = data.loc[(data.RIDAGEYR >= 80) & ((data.RIDRETH1 == 1) | (data.RIDRETH1 == 2)), :]
data.head()

Unnamed: 0,SMQ020,RIAGENDR,RIDAGEYR,RIDRETH1
43,2,2,80,2
80,2,2,80,1
261,2,2,80,2
521,1,1,80,1
557,1,2,80,2


In [7]:
data.shape

(49, 4)

## ___Q. Does the proportions of 80+ aged hispanic males and females living in the US in 2015 - 2016 who smoked significantly differ significantly?___

## ___Using Confidence Intervals___
--------------------

In [8]:
# sample sizes of hsipanic males and females aged 80+, living in the US in 2015 - 2016

ssize_m, ssize_f = (data.RIAGENDR == 1).sum(), (data.RIAGENDR == 2).sum()
ssize_m, ssize_f

(17, 32)

In [9]:
# NOTE THE EXTREMELY SMALL SAMPLE SIZES!.

In [10]:
# proportion of males and females who smoked at least 100 cigarettes in their life in the samples of hsipanic males and females aged 80+, 
# living in the US in 2015 - 2016

prop_m = ((data.RIAGENDR == 1) & (data.SMQ020 == 1)).sum() / (data.RIAGENDR == 1).sum()
prop_f = ((data.RIAGENDR == 2) & (data.SMQ020 == 1)).sum() / (data.RIAGENDR == 2).sum()

prop_m, prop_f

(0.5294117647058824, 0.25)

In [11]:
# HOWEVER, THE SAMPLE SIZES HERE ARE EXTREMELY SMALL!

In [12]:
# best estimate is the difference in sample proportions;

best_est = prop_m - prop_f
best_est

0.27941176470588236

In [13]:
# confidence interval = best estimate +- margin of error
# margin of error = multiplier * standard error

tscore = t.ppf(0.975, df = min(ssize_m, ssize_f) - 1)
tscore

2.119905299221011

### ___Let's check our assumptions___

In [20]:
# 1) Are the populations that these samples come from normal? Don't know.
# 2) Do we have at least 10 values of each type in each samples? Nope! (our sample sizes are 32 and 17), one sample difinitely contains one or both 
# variants in quantities less than 0.

# for males;
((data.RIAGENDR == 1) & (data.SMQ020 == 1)).sum(), ((data.RIAGENDR == 1) & (data.SMQ020 != 1)).sum()

(9, 8)

In [18]:
# There we go,
# So in our males sample, 9 of them smoked significantly and 8 of them did not smoke significantly!
# So, both of these two variants fail to realize a sample size of 10.

In [21]:
# 3) Are the two samples independent? Yes.
# 4) Are the observations independent? Supposedly yes!.

In [24]:
# estimated standard errors

stderr_m = np.sqrt(prop_m * (1 - prop_m) / ssize_m)
stderr_f = np.sqrt(prop_f * (1 - prop_f) / ssize_f)

stderr_m, stderr_f

(0.12105782480647682, 0.07654655446197431)

In [25]:
stderr_comb = stderr_m + stderr_f
stderr_comb

0.19760437926845115

In [33]:
# margin of error, assuming that the sampling distribution is normal!

zstar = norm.ppf(0.975)
zstar

1.959963984540054

In [34]:
moerr = stderr_comb * zstar
moerr

0.38729746655355757

In [35]:
# 95% confidence interval;

lcb, ucb = best_est - moerr, best_est + moerr
lcb, ucb

(-0.10788570184767521, 0.6667092312594399)

In [32]:
# Our confidence interval does include 0!
# So, Null hypothesis's proposal of 0.0 difference in the proportions is an entertainable option!
# There is perhaps no significant difference between the males and females who smoked.

## ___Using Hypothesis Testing: $\chi^2$ test___
--------------------

In [39]:
# Significance level 5% (0.05)

# Null hypothesis: There is no significant difference in the proportions of 80+ aged hispanic males and females living in US, in 2015 - 2016
# who smoked significantly.

# Alternative hypothesis: There is a significant difference in the proportions of 80+ aged hispanic males and females living in US, in 2015 - 2016
# who smoked significantly. (two tailed!)

In [43]:
# Assumptions:

# 1) Chi squared test involves a 2x2 matrix, that consists of sample sizes for two variants of values present in the samples.
# e.g. [[n(smoking-males), n(non-smoking-males)], [n(smoking-females), n(non-smoking-females)]]
# All these 4 values need to be greater than 5!

((data.RIAGENDR == 1) & (data.SMQ020 == 1)).sum(), ((data.RIAGENDR == 1) & (data.SMQ020 != 1)).sum(), \
((data.RIAGENDR == 2) & (data.SMQ020 == 1)).sum(), ((data.RIAGENDR == 2) & (data.SMQ020 != 1)).sum()

(9, 8, 8, 24)

In [44]:
# The above assumption is justified.

In [45]:
# We'll assume that the observations are independent!

# ___$\chi^2 = \sum \frac{(O - E)^2}{E}$___

In [52]:
# O, E denote the observed and expected FREQUENCIES NOT PROPORTIONS!

In [57]:
# SOMETHING WRONG WITH THE CHI SQUARED TEST RESULTS :&(

In [58]:
# Let's say that we expected the females' proportion to be same as the males' proportion.

msmoke = ((data.RIAGENDR == 1) & (data.SMQ020 == 1)).sum()
fsmoke = ((data.RIAGENDR == 2) & (data.SMQ020 == 1)).sum()

chi2_stat = (((msmoke - msmoke) ** 2) / msmoke) + (((fsmoke - msmoke) ** 2) / msmoke)
chi2_stat

0.1111111111111111

In [59]:
2 * (1 - chi2.cdf(chi2_stat, df = 1))

1.4777653607270547