# Hypothesis Testing Project: Jeopardy

In this project we will work with a <a href="https://www.reddit.com/r/datasets/comments/1uyd0t/200000_jeopardy_questions_in_a_json_file">full dataset of Jeopardy questions</a>.
It contains 20,000 rows.

In [185]:
import pandas as pd
import numpy as np
import re
import random
from scipy.stats import chisquare

jeopardy = pd.read_csv("jeopardy.csv")
print(jeopardy.head(5))


   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  


In [186]:
print(jeopardy.columns)
jeopardy.columns = jeopardy.columns.str.replace(" ", "")

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [187]:
def normalize(word):
    word = word.lower()
    word = re.sub(r"[^\w\s]", "", word)
    return word
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize)
print(jeopardy["clean_question"].head(5))

jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize)
print(jeopardy["clean_answer"].head(5))


0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
Name: clean_question, dtype: object
0    copernicus
1    jim thorpe
2       arizona
3     mcdonalds
4    john adams
Name: clean_answer, dtype: object


In [188]:
def remove(word):
    word = re.sub(r"[^0-9]", "", word)
    try:
        word = int(word)
    except Exception:
        word = 0
    return word
jeopardy["clean_value"] = jeopardy["Value"].apply(remove)

jeopardy["AirDate"] = pd.to_datetime(jeopardy["AirDate"])

print(jeopardy["clean_value"].head(5))

0    200
1    200
2    200
3    200
4    200
Name: clean_value, dtype: int64


In [189]:
question_overlap = []
terms_used = set()
jeopardy = jeopardy.sort_values(by = ["AirDate"], ascending=True)
for i, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0
    for i in split_question:
        if i in terms_used:
            match_count += 1
        terms_used.add(i)
    if len(split_question) > 0:
        match_count = match_count/len(split_question)
    question_overlap.append(match_count)

jeopardy["question_overlap"] = question_overlap
print(jeopardy["question_overlap"].mean())              

0.6894006357823182


In [190]:
def values(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1    
    return value
jeopardy["high_value"] = jeopardy.apply(values, axis=1)

def value_count(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        if word in split_question:
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
        return high_count, low_count
comparison_terms = random.sample(terms_used, k=10)
observed_expected = []
for i in comparison_terms:
    observed_expected.append(value_count(i))
    
print(observed_expected)\

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]
chi_squared = []

for i in observed_expected:
    total = sum(i)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    observed = np.array([i[0], i[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))  

    

    

[(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]


  terms = (f_obs - f_exp)**2 / f_exp
