# Read in the Jeopardy Dataset

In [1]:
import pandas as pd
jeopardy = pd.read_csv("jeopardy.csv")

jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value','Question', 'Answer']
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


# Clean Up Dataset

In [2]:
import string

def string_norm(str):
    str = str.lower()
    exclude = set(string.punctuation)
    s = ''.join(ch for ch in str if ch not in exclude)
    return s
jeopardy['clean_question'] = jeopardy['Question'].apply(string_norm)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(string_norm)


In [3]:
def dollar_norm(str):
    exclude = set(string.punctuation)
    s = ''.join(ch for ch in str if ch not in exclude)
    try:
        x = int(s)
    except ValueError:
        x = 0
        
    return x

jeopardy['clean_value'] = jeopardy['Value'].apply(dollar_norm)
print(jeopardy['clean_value'].head(5))
    
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])    
jeopardy.dtypes   

0    200
1    200
2    200
3    200
4    200
Name: clean_value, dtype: int64


Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

# Determine if the answer is within the question

In [9]:
def answ(ser):
    split_answer = ser['clean_answer'].split(' ')
    split_question = ser['clean_question'].split(' ')
    match_count = 0
    while 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    for x in split_answer:
        if x in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(answ, axis = 1)
print(jeopardy["answer_in_question"].mean())
            

0.00610030501525


In [34]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    split_question = [ques for ques in split_question if len(ques) > 5]
    match_count = 0
    for y in split_question:
        if y in terms_used:
            match_count += 1
        else:
            terms_used.add(y)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)

jeopardy['question_overlap'] = question_overlap
print(jeopardy['question_overlap'].mean())
print('Length of terms list: ', len(terms_used))
            

0.313465673284
('Length of terms list: ', 24564)


In [19]:
def value(ser):
    if ser['clean_value'] > 800:
        val = 1
    else:
        val = 0
    return val

jeopardy['high_value'] = jeopardy.apply(value, axis = 1)

def hlcount(st):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if st in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

comparison_terms = list(terms_used)[:20]
observed_expected = []
for term in comparison_terms:
    observed_expected.append(hlcount(term))

observed_expected

[(0, 1),
 (0, 2),
 (0, 1),
 (2, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (2, 0),
 (0, 1),
 (0, 2),
 (5, 5),
 (0, 2),
 (1, 0),
 (1, 3),
 (0, 1),
 (1, 0),
 (6, 7),
 (1, 3),
 (0, 1)]

# Chisquared Test on Value Counts

In [31]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]
chi_squared = []
for x in observed_expected:
    total = sum(x)
    total_prop = float(total) / jeopardy.shape[0]
    high_value_exp = float(total_prop) * high_value_count
    low_value_exp = float(total_prop) * low_value_count
    
    observed = np.array([x[0], x[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))
    print(total_prop)
chi_squared

5.0002500125e-05
0.00010000500025
5.0002500125e-05
0.000150007500375
5.0002500125e-05
5.0002500125e-05
5.0002500125e-05
5.0002500125e-05
0.00010000500025
5.0002500125e-05
0.00010000500025
0.00050002500125
0.00010000500025
5.0002500125e-05
0.0002000100005
5.0002500125e-05
5.0002500125e-05
0.000650032501625
0.0002000100005
5.0002500125e-05


[Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.80392569225376798, pvalue=0.36992223780795708),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=2.1177104383031944, pvalue=0.14560406868264344),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=4.9755842343913503, pvalue=0.025707519787911092),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.80392569225376798, pvalue=0.36992223780795708),
 Power_divergenceResult(statistic=2.2243874083063973, pvalue=0.13584652879916

As can be seen, the frequencies of words that meet the required criteria (>5 letters, high | low value) are not sufficient enough to produce meaningful chisquare data. 