In [7]:
from __future__ import print_function, division

%matplotlib inline

import numpy as np

import random
import first
import nsfg

import thinkstats2
import thinkplot

# Load the pregnancy data into the initial dataframe `preg`, then build `live`
# from the rows whose outcome code is 1 so we can look at live births only.
preg = nsfg.ReadFemPreg()
live = preg[preg.outcome == 1]

# Pg 102 in the text shows the definition of a hypothesis test, reproduced
# here as class HypothesisTest.
class HypothesisTest(object):
    """Base class for simulation-based hypothesis tests.

    Subclasses must override TestStatistic and RunModel. MakeModel is an
    optional hook that is called once from __init__, before the observed
    test statistic is computed.
    """

    def __init__(self, data):
        """Store the data, build the model and compute the observed statistic.

        data: the observed data, in whatever form is appropriate for the
              subclass
        """
        self.data = data
        self.MakeModel()
        self.actual = self.TestStatistic(data)

    def PValue(self, iters=1000):
        """Compute the probability of the apparent effect under the null
        hypothesis.

        The simulated statistics are kept in self.test_stats.

        iters: number of simulated data sets to generate

        returns: fraction of simulated test statistics that are >= the
                 observed statistic (self.actual)
        """
        self.test_stats = [self.TestStatistic(self.RunModel())
                           for _ in range(iters)]

        count = sum(1 for x in self.test_stats if x >= self.actual)
        # True division: the file imports division from __future__.
        return count / iters

    def TestStatistic(self, data):
        """Compute the test statistic for `data`; must be overridden.

        raises: NotImplementedError if the subclass does not override it
        """
        raise NotImplementedError(
            '%s must override TestStatistic' % type(self).__name__)

    def MakeModel(self):
        """Build a model of the null hypothesis; does nothing by default."""
        pass

    def RunModel(self):
        """Generate one simulated data set; must be overridden.

        raises: NotImplementedError if the subclass does not override it
        """
        raise NotImplementedError(
            '%s must override RunModel' % type(self).__name__)
        
        
# Tests a difference in means by permutation: the two groups are pooled into
# one large group, shuffled, and split again at the original size (pg 104).
class DiffMeansPermute(thinkstats2.HypothesisTest):
    """Permutation test for a difference in means between two groups."""

    def TestStatistic(self, data):
        """Return the absolute difference between the two group means.

        data: pair of groups, each providing a mean() method
        """
        first_group, second_group = data
        return abs(first_group.mean() - second_group.mean())

    def MakeModel(self):
        """Record both group sizes and pool the groups into self.pool."""
        first_group, second_group = self.data
        self.n = len(first_group)
        self.m = len(second_group)
        self.pool = np.hstack((first_group, second_group))

    def RunModel(self):
        """Shuffle the pool in place and split it at the first group's size.

        returns: pair (first self.n items of the pool, remaining items)
        """
        np.random.shuffle(self.pool)
        cut = self.n
        return self.pool[:cut], self.pool[cut:]
    
# CorrelationPermute is how we test a correlation (pg 107).
class CorrelationPermute(thinkstats2.HypothesisTest):
    """Permutation test for the correlation between two sequences."""

    def TestStatistic(self, data):
        """Return the absolute value of thinkstats2.Corr for the pair.

        data: pair (xs, ys)
        """
        xs, ys = data
        return abs(thinkstats2.Corr(xs, ys))

    def RunModel(self):
        """Return the data with xs randomly permuted and ys left as is.

        returns: pair (permuted copy of xs, ys)
        """
        xs, ys = self.data
        return np.random.permutation(xs), ys
    
# Chi-squared allows us to test proportions (pg 109). Squaring the deviations
# gives more weight to large deviations.
class PregLengthTest(thinkstats2.HypothesisTest):
    """Chi-squared permutation test on pregnancy lengths of two groups."""

    def MakeModel(self):
        """Pool both groups and compute the pooled probabilities.

        Sets self.n (size of the first group), self.pool, self.values
        (the lengths 35..43 that are compared) and self.expected_probs
        (pooled Pmf probability of each of those lengths).
        """
        firsts, others = self.data
        self.n = len(firsts)
        self.pool = np.hstack((firsts, others))

        pmf = thinkstats2.Pmf(self.pool)
        self.values = range(35, 44)
        self.expected_probs = np.array(pmf.Probs(self.values))

    def RunModel(self):
        """Shuffle the pool in place and split it at the first group's size.

        returns: pair (first self.n items of the pool, remaining items)
        """
        np.random.shuffle(self.pool)
        data = self.pool[:self.n], self.pool[self.n:]
        return data

    def TestStatistic(self, data):
        """Return the sum of the chi-squared statistics of both groups.

        data: pair (firsts, others) of pregnancy lengths
        """
        firsts, others = data
        stat = self.ChiSquared(firsts) + self.ChiSquared(others)
        return stat

    def ChiSquared(self, lengths):
        """Return the chi-squared statistic of `lengths` against the pool.

        lengths: sequence of pregnancy lengths

        Bins whose expected count is zero are skipped. Such a bin means the
        pooled data never contained that length, so a group drawn from the
        pool cannot contain it either; dividing 0 by 0 there would turn the
        whole statistic into nan and make every p-value comparison False.
        """
        hist = thinkstats2.Hist(lengths)
        observed = np.array(hist.Freqs(self.values))
        expected = self.expected_probs * len(lengths)

        populated = expected > 0
        deviations = observed[populated] - expected[populated]
        stat = sum(deviations**2 / expected[populated])
        return stat

# Combines the tests into one function, following the class from chap09soln.
def RunTests(live, iters=1000):
    """Run the four hypothesis tests on `live` and print their p-values.

    live: DataFrame-like table of births; the birthord, prglngth,
          totalwgt_lb and agepreg columns are used
    iters: number of simulations per test

    Prints one tab-separated line: the number of rows, then the p-values
    for the difference in mean pregnancy length, the difference in mean
    birth weight, the age/weight correlation and the chi-squared test.
    """
    def p_value(test_class, data):
        # Build the test on `data` and simulate it `iters` times.
        return test_class(data).PValue(iters=iters)

    firsts = live[live.birthord == 1]
    others = live[live.birthord != 1]

    # Permutation - difference in means (pregnancy length, then weight)
    length_groups = (firsts.prglngth.values, others.prglngth.values)
    p1 = p_value(DiffMeansPermute, length_groups)

    weight_groups = (firsts.totalwgt_lb.dropna().values,
                     others.totalwgt_lb.dropna().values)
    p2 = p_value(DiffMeansPermute, weight_groups)

    # Correlation - only rows where both age and weight are present
    complete = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    p3 = p_value(CorrelationPermute,
                 (complete.agepreg.values, complete.totalwgt_lb.values))

    # Chi-squared on the pregnancy lengths
    p4 = p_value(PregLengthTest,
                 (firsts.prglngth.values, others.prglngth.values))

    row = (len(live), p1, p2, p3, p4)
    print('%d\t%0.2f\t%0.2f\t%0.2f\t%0.2f' % row)
    
# Run the tests on progressively smaller samples: the first round asks for as
# many rows as `live` has, and the requested size is halved after each of the
# 7 rounds. RunTests prints one line of p-values per round.
n = len(live)
for _ in range(7):
    # Draw n rows from live; how rows are drawn is up to thinkstats2.SampleRows.
    sample = thinkstats2.SampleRows(live, n)
    RunTests(sample)
    n //= 2  # floor division keeps the sample size an integer
    
#I've run the code several times and get slightly different outcomes on each run (the tests are randomized), but overall my results are pretty close to what the solution code has.
#n        Test1    Test2    Test3    Test4
#9148  0.17      0.00     0.00     0.00
#4574  0.28      0.01     0.00     0.00
#2287  0.27      0.12     0.00     0.00
#1143  0.21      0.40     0.01     0.00
#571    0.54      0.05     0.12     0.86
#285    0.35      0.70     0.98     0.97
#142    0.84      0.25     0.88     0.30

#While I see some patterns in the results, I cannot say with confidence that the tests show positive or negative effects as expected as the sample size grows or shrinks.

9148	0.17	0.00	0.00	0.00
4574	0.28	0.01	0.00	0.00
2287	0.27	0.12	0.00	0.00
1143	0.21	0.40	0.01	0.00
571	0.54	0.05	0.12	0.86
285	0.35	0.70	0.98	0.97
142	0.84	0.25	0.88	0.30
