<a href="https://colab.research.google.com/github/accarter/DS-Unit-1-Sprint-2-Statistics/blob/master/module1/LS_DS_121_Statistics_Probability_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img align="left" src="https://lever-client-logos.s3.amazonaws.com/864372b1-534c-480e-acd5-9711f850815c-1524247202159.png" width=200>
<br></br>
<br></br>

## *Data Science Unit 1 Sprint 2 Assignment 1*

# Apply the t-test to real data

Your assignment is to determine which issues have "statistically significant" differences between political parties in this [1980s congressional voting data](https://archive.ics.uci.edu/ml/datasets/Congressional+Voting+Records). The data consists of 435 instances (one for each congressperson), a class (democrat or republican), and 16 binary attributes (yes or no for voting for or against certain issues). Be aware - there are missing values!

Your goals:

1. Load and clean the data (or determine the best method to drop observations when running tests)
2. Using hypothesis testing, find an issue that democrats support more than republicans with p < 0.01
3. Using hypothesis testing, find an issue that republicans support more than democrats with p < 0.01
4. Using hypothesis testing, find an issue where the difference between republicans and democrats has p > 0.1 (i.e. there may not be much of a difference)

Note that this data will involve *2 sample* t-tests, because you're comparing averages across two groups (republicans and democrats) rather than a single group against a null hypothesis.

In [0]:
import pandas as pd
from scipy.stats import ttest_ind, ttest_ind_from_stats, ttest_rel

In [2]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data

--2020-05-21 21:14:12--  https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18171 (18K) [application/x-httpd-php]
Saving to: ‘house-votes-84.data.1’


2020-05-21 21:14:12 (286 KB/s) - ‘house-votes-84.data.1’ saved [18171/18171]



In [3]:
column_headers = ['party','handicapped-infants','water-project',
                          'budget','physician-fee-freeze', 'el-salvador-aid',
                          'religious-groups','anti-satellite-ban',
                          'aid-to-contras','mx-missile','immigration',
                          'synfuels', 'education', 'right-to-sue','crime','duty-free',
                          'south-africa']

house = pd.read_csv('house-votes-84.data', 
                 header=None, 
                 names=column_headers,
                 na_values="?")

house.head()

Unnamed: 0,party,handicapped-infants,water-project,budget,physician-fee-freeze,el-salvador-aid,religious-groups,anti-satellite-ban,aid-to-contras,mx-missile,immigration,synfuels,education,right-to-sue,crime,duty-free,south-africa
0,republican,n,y,n,y,y,y,n,n,n,y,,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,
2,democrat,,y,y,,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,,y,y,y,y


In [4]:
# recode votes as numeric - allows for comparisons to be carried out with numbers
house = house.replace({
    'y': 1, 
    'n': 0
    })

house.head()

Unnamed: 0,party,handicapped-infants,water-project,budget,physician-fee-freeze,el-salvador-aid,religious-groups,anti-satellite-ban,aid-to-contras,mx-missile,immigration,synfuels,education,right-to-sue,crime,duty-free,south-africa
0,republican,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,,1.0,1.0,1.0,0.0,1.0
1,republican,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,
2,democrat,,1.0,1.0,,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,democrat,0.0,1.0,1.0,0.0,,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,democrat,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,,1.0,1.0,1.0,1.0


In [5]:
def print_house_stats():
  """
  Displays:
      (1) issues that democrats support more than republicans with p < 0.01
      (2) issues that republicans support more than democrats with p < 0.01
      (3) issues where the difference between republicans and democrats has p > 0.1
  """
  rep, dem = split_house(house)           # split house into two separate datasets
                                          # based on party lines

  # dictionary containing bill with t-statistic and p-value
  bill_stats =\
  {bill: ttest_ind(rep[bill], dem[bill], nan_policy='omit')\
   for bill in column_headers[1:]}

  print_bills_with_sig_diff(bill_stats)    # display bills where the support across 
                                           # parties is significantly different

  print_bills_with_no_sig_diff(bill_stats)  # display bills where there is no significant 
                                            # difference in support from either party

def split_house(house):
  """
  Produce two subsets by splitting original dataset on partisanship 
  (i.e. Republican vs. Democrat)
  """
  return (house[house['party'] == 'republican'], 
          house[house['party'] == 'democrat'])

def print_bills_with_sig_diff(bill_stats):
  """
  Display bills that receive more support from one party than another.
  """
  for party, party_fn in [('republican', lambda x: x > 0), ('democrat', lambda x: x < 0)]:
    print(f'more {party} support with p < 0.01')
    for bill, pvalue in bills_favored_by(bill_stats, 0.01, party_fn): 
      print("{:25} ( p = {:<12.6} )".format(bill, pvalue))
    print()

def print_bills_with_no_sig_diff(bill_stats):
  """
  Displays bills where there is little difference in support from either party.
  """
  print('issues where the difference between republicans and democrats has p > 0.1')
  for bill, pvalue in bills_with_large_p(bill_stats, 0.1):
    print("{:25} ( p = {:<12.6} )".format(bill, pvalue))

def bills_favored_by(bill_stats, alpha, favored_by_party):
  """
  Generator that produces all bills that have gained more 
  support from the specified party
  """
  for bill, (tstat, pvalue) in bill_stats.items():
    if favored_by_party(tstat) and pvalue < alpha:
      yield (bill, pvalue)

def bills_with_large_p(bill_stats, alpha):
  """
  Generator that produces all bills where there is not much of a difference
  between democrat vs republican support.
  """
  for bill, (tstat, pvalue) in bill_stats.items():
    if pvalue > alpha: 
      yield (bill, pvalue)

print_house_stats()


more republican support with p < 0.01
physician-fee-freeze      ( p = 1.99426e-177 )
el-salvador-aid           ( p = 5.60052e-68  )
religious-groups          ( p = 2.39367e-20  )
education                 ( p = 1.88342e-64  )
right-to-sue              ( p = 1.22786e-34  )
crime                     ( p = 9.95234e-47  )

more democrat support with p < 0.01
handicapped-infants       ( p = 1.61344e-18  )
budget                    ( p = 2.07034e-77  )
anti-satellite-ban        ( p = 8.52103e-31  )
aid-to-contras            ( p = 2.82472e-54  )
mx-missile                ( p = 5.03079e-47  )
synfuels                  ( p = 1.57593e-15  )
duty-free                 ( p = 5.9977e-32   )
south-africa              ( p = 3.65267e-11  )

issues where the difference between republicans and democrats has p > 0.1
water-project             ( p = 0.929156     )


## Stretch Goals:

1. Refactor your code into functions so it's easy to rerun with arbitrary variables
2. Work on Performing a T-test without using Scipy in order to get "under the hood" and learn more thoroughly about this topic.
### Start with a 1-sample t-test
 - Establish the conditions for your test 
 - [Calculate the T Statistic](https://blog.minitab.com/hs-fs/hubfs/Imported_Blog_Media/701f9c0efa98a38fb397f3c3ec459b66.png?width=247&height=172&name=701f9c0efa98a38fb397f3c3ec459b66.png) (You'll need to omit NaN values from your sample).
 - Translate that t-statistic into a P-value. You can use a [table](https://www.google.com/search?q=t+statistic+table) or the [University of Iowa Applet](https://homepage.divms.uiowa.edu/~mbognar/applets/t.html)

 ### Then try a 2-sample t-test
 - Establish the conditions for your test 
 - [Calculate the T Statistic](https://lh3.googleusercontent.com/proxy/rJJ5ZOL9ZDvKOOeBihXoZDgfk7uv1YsRzSQ1Tc10RX-r2HrRpRLVqlE9CWX23csYQXcTniFwlBg3H-qR8MKJPBGnjwndqlhDX3JxoDE5Yg) (You'll need to omit NaN values from your sample).
 - Translate that t-statistic into a P-value. You can use a [table](https://www.google.com/search?q=t+statistic+table) or the [University of Iowa Applet](https://homepage.divms.uiowa.edu/~mbognar/applets/t.html)

 ### Then check your Answers using Scipy!

In [0]:
import numpy as np
from scipy.stats import ttest_1samp

In [0]:
def t_statistic(xbar, mu, std_err):
  return (xbar - mu) / std_err

def std_err(sample, pop=None):
  n = len(sample)
  if pop is not None:
    n = len(pop)
  return np.std(sample, ddof=1) / np.sqrt(n)

In [0]:
def t_statistic_1samp(samp, h0):
  clean_samp = samp.dropna()
  return t_statistic(np.mean(clean_samp),
                     h0,
                     std_err(clean_samp))

In [0]:
dem = house[house['party'] == 'democrat']
rep = house[house['party'] == 'republican']

1-sample t-test demo

In [10]:
t_statistic_1samp(dem['south-africa'], .9)

1.935123083617057

In [11]:
# translate t-statistic into a P-value using the University of Iowa Applet
dof = len(dem['south-africa'].dropna()) - 1
dof

# University of Iowa Applet P-value: 0.0545

184

In [12]:
# verify results with scipy
ttest_1samp(dem['south-africa'], .9, nan_policy='omit')

Ttest_1sampResult(statistic=1.935123083617057, pvalue=0.05450836316336539)

In [13]:
t_statistic_1samp(dem['south-africa'], .5)

23.965755112488182

In [0]:
# translate t-statistic into a P-value using the University of Iowa Applet
# University of Iowa Applet P-value: 0

In [15]:
# verify results with scipy
ttest_1samp(dem['south-africa'], .5, nan_policy='omit')

Ttest_1sampResult(statistic=23.965755112488182, pvalue=1.7513759267849718e-58)

2-sample t-test demo

In [0]:
def t_statistic_2samp(a, b):
  a = a.dropna()
  b = b.dropna()
  n1 = len(a)
  n2 = len(b)
  num = np.mean(a) - np.mean(b)
  sp2 = (((n1-1) * np.std(a) ** 2) + (n2-1) * np.std(b) ** 2) / (n1 + n2 - 2)
  denom = sp2 ** 0.5 * (1/n1 + 1/n2) ** 0.5
  
  return num / denom

In [17]:
t_statistic_2samp(rep['right-to-sue'], dem['right-to-sue'])

13.54182323287385

In [18]:
# translate t-statistic into a P-value using the University of Iowa Applet
dof = len(dem['right-to-sue'].dropna()) + len(rep['right-to-sue'].dropna()) - 2
dof

# University of Iowa Applet P-value: 0

408

In [19]:
# verify results with scipy
ttest_ind(rep['right-to-sue'], dem['right-to-sue'], nan_policy='omit')

Ttest_indResult(statistic=13.51064251060933, pvalue=1.2278581709672758e-34)