In [8]:
import pandas as pd 
jeopardy = pd.read_csv("jeopardy.csv")
jeopardy.head(5)


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [9]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [10]:
#Remove the spaces in each item in jeopardy.columns.
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

In [11]:
#normalize all of the text columns (the Question and Answer columns). 
import re
def normalize_text(str):
    str = str.lower()
    str = re.sub("[^A-Za-z0-9\s]", "", str)
    return str
    
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize_text)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize_text)
    
    

In [12]:
#The Value column should be numeric, need to remove the dollar sign from the beginning of each value and convert the column from text to numeric.
import re
import pandas
def normalize_value(str):
    str = re.sub("[^A-Za-z0-9\s]", "", str)
    try:
        str = int(str)
    except Exception:
        str = 0
    return str

jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_value)
jeopardy["Air Date"] = pandas.to_datetime(jeopardy["Air Date"])

In [13]:
#How often the answer is deducible from the question.
def count_matches(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(count_matches, axis=1)

jeopardy["answer_in_question"].mean()



0.06049325706933587

The answer only appears in the question about 6% of the time. This isn't a huge number, and we can't just hope that a question will enable us to figure out the answer. We'll have to study.

In [14]:
#investigate how often new questions are repeats of older ones.
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question =  [item for item in split_question if len(item) > 5]
    match_count = 0
    for item in split_question:
        if item in terms_used:
            match_count += 1
        else:
            terms_used.add(item)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)
jeopardy["question_overlap"] = question_overlap
jeopardy["question_overlap"].mean()


    


0.6925960057338647

There is about 70% overlap between terms in new questions and terms in old questions. It does mean that it's worth looking more into the recycling of questions.

In [15]:
# only want to study questions that pertain to high value questions instead of low value questions.
def determine_value(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    return value

jeopardy["high_value"] = jeopardy.apply(determine_value, axis=1)
                

In [16]:
def count_usage(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if term in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

comparison_terms = list(terms_used)[:5]
observed_expected = []
for term in comparison_terms:
    observed_expected.append(count_usage(term))

observed_expected

[(0, 1), (3, 0), (1, 3), (0, 1), (0, 2)]

In [17]:
high_value_count = len(jeopardy[jeopardy['high_value']==1])
low_value_count = len(jeopardy[jeopardy['high_value']==0])
chi_squared = []


In [18]:
from scipy.stats import chisquare
import numpy as np

for item in observed_expected:
    total = sum(item)
    total_prop = total/jeopardy.shape[0]
    
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([item[0], item[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared
    

[Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=7.463376351587025, pvalue=0.006296679668748999),
 Power_divergenceResult(statistic=0.02636443308440769, pvalue=0.871013484688921),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.803925692253768, pvalue=0.3699222378079571)]

None of the terms had a significant difference in usage between high value and low value rows. Additionally, the frequencies were all lower than 5, so the chi-squared test isn't as valid. It would be better to run this test with only terms that have higher frequencies.