In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import (t, norm)

# ___Comparing Proportions for Two Independent Samples___
---------------

In [2]:
# Read only the four columns used in this analysis, then discard every row
# that has a missing value in any of them.
data = (
    pd.read_csv(
        "../data/nhanes_2015_2016.csv",
        usecols = ["SMQ020", "RIAGENDR", "RIDAGEYR", "RIDRETH1"],
    )
    .dropna(axis = 0)
)

In [3]:
# distinct values of RIDAGEYR (age in years) present in the data, in ascending order
np.sort(data["RIDAGEYR"].unique())

array([18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80], dtype=int64)

In [4]:
# (rows, columns) of the loaded data once rows with missing values have been dropped
data.shape

(5735, 4)

In [5]:
# distinct response codes recorded in the SMQ020 column
data["SMQ020"].unique()

array([1, 2, 7, 9], dtype=int64)

__SMQ020: Smoked at least 100 cigarettes in life__

1. 	Yes  
2. 	No  
7. 	Refused  	 	
9. 	Don't know     

In [6]:
# RIDRETH1 = 1 -> Mexican Americans
# RIDRETH1 = 2 -> other Hispanics

# Keep only the Hispanic adults (RIDRETH1 of 1 or 2) whose recorded age is 80 or more.
# NOTE: this rebinds `data`, so every later cell sees only the filtered rows.

data = data[data.RIDAGEYR.ge(80) & data.RIDRETH1.isin([1, 2])]
data.head()

Unnamed: 0,SMQ020,RIAGENDR,RIDAGEYR,RIDRETH1
43,2,2,80,2
80,2,2,80,1
261,2,2,80,2
521,1,1,80,1
557,1,2,80,2


In [45]:
# (rows, columns) of `data` at this point in the notebook, i.e. the size of the sample being analysed
data.shape

(49, 4)

In [35]:
# Q. Do the proportions of Hispanic males and females aged 80+, living in the US in 2015 - 2016, who smoked differ significantly?

## ___Using Confidence Intervals___
--------------------

In [49]:
# sample sizes of Hispanic males (RIAGENDR == 1) and females (RIAGENDR == 2) aged 80+,
# living in the US in 2015 - 2016

ssize_m = data.RIAGENDR.eq(1).sum()
ssize_f = data.RIAGENDR.eq(2).sum()
ssize_m, ssize_f

(17, 32)

In [8]:
# Sample proportions of Hispanic males and females aged 80+, living in the US in 2015 - 2016,
# who smoked at least 100 cigarettes in their life (SMQ020 == 1).
#
# NOTE(review): each denominator counts every respondent of that gender, so any
# SMQ020 = 7 (Refused) or 9 (Don't know) rows still present are treated as non-smokers
# rather than excluded - confirm this is intended.

is_male = data.RIAGENDR == 1
is_female = data.RIAGENDR == 2
is_smoker = data.SMQ020 == 1

prop_m = (is_male & is_smoker).sum() / is_male.sum()
prop_f = (is_female & is_smoker).sum() / is_female.sum()

prop_m, prop_f

(0.5294117647058824, 0.25)

In [9]:
# HOWEVER, THE SAMPLE SIZES HERE ARE EXTREMELY SMALL!

In [10]:
# The best estimate of the difference between the two population proportions is the
# difference between the sample proportions (males minus females).

best_est = prop_m - prop_f
best_est

0.27941176470588236

In [13]:
# confidence interval = best estimate +- margin of error
# margin of error     = multiplier * standard error
#
# The multiplier used here is the 97.5th percentile of Student's t distribution with
# 16 degrees of freedom, i.e. the critical value for a two-sided 95% interval.
# NOTE(review): 16 is hard-coded; presumably it is the smaller sample size minus one
# (17 - 1) - confirm, and keep it in sync if the sample changes.

tscore = t(df = 16).ppf(0.975)
tscore

2.119905299221011

In [None]:
# Let's check our assumptions,
# Are the populations that these samples come from normal?
# Do we have at least 10 values of each type in each sample?
# Are the two samples independent?
# Are the observations independent?