In [2]:
import pandas as pd
jeopardy = pd.read_csv("JEOPARDY_CSV.csv")
print(jeopardy.head())

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  


In [3]:
print(jeopardy.columns)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [4]:
print("row number",jeopardy.shape[0])
print("column number",jeopardy.shape[1])

row number 216930
column number 7


In [5]:
jeopardy.columns = jeopardy.columns.str.replace(" ","")
print(jeopardy.columns)

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


In [6]:
import re

def norm_text(text):
    text = text.lower()
    text = re.sub("[^A-Za-z0-9\s]","",text)
    return text
def norm_value(text):
    text = re.sub("[^A-Za-z0-9\s]","",text)
    try:
        text = int(text)
    except Exception:
        text = 0
    return text
jeopardy["clean_question"] = jeopardy["Question"].apply(norm_text)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(norm_text)
jeopardy["clean_value"] = jeopardy["Value"].apply(norm_value)
jeopardy["AirDate"] = pd.to_datetime(jeopardy["AirDate"])

In [7]:
print(jeopardy.head())

   ShowNumber    AirDate      Round                         Category Value  \
0        4680 2004-12-31  Jeopardy!                          HISTORY  $200   
1        4680 2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES  $200   
2        4680 2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...  $200   
3        4680 2004-12-31  Jeopardy!                 THE COMPANY LINE  $200   
4        4680 2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES  $200   

                                            Question      Answer  \
0  For the last 8 years of his life, Galileo was ...  Copernicus   
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe   
2  The city of Yuma in this state has a record av...     Arizona   
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's   
4  Signer of the Dec. of Indep., framer of the Co...  John Adams   

                                      clean_question clean_answer  clean_value  
0  for the last 8 years of his life galil

In [8]:
print(jeopardy.iloc[1]["Question"])

No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves


In [9]:
def count_matches(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(count_matches, axis=1)
  

In [10]:
print(jeopardy["answer_in_question"].mean())

0.059357587183968614


answers appaers in %5.9 of questions

In [11]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        split_question = [q for q in split_question if len(q) > 5]
        match_count = 0
        for word in split_question:
            if word in terms_used:
                match_count += 1
        for word in split_question:
            terms_used.add(word)
        if len(split_question) > 0:
            match_count /= len(split_question)
        question_overlap.append(match_count)
jeopardy["question_overlap"] = question_overlap

jeopardy["question_overlap"].mean()

0.8735125558086732

There is about 87% overlap between terms in new questions and terms in old questions

In [12]:
def chis(row):
    if row["clean_value"] > 800:
        value = 1
    else:
        value = 0
    return value

jeopardy["high_value"] = jeopardy.apply(chis,axis=1)


In [13]:
def wordcount(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        if word in split_question:
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return low_count, high_count  

In [14]:
observed_expected = []
terms_used = list(terms_used)
comparison_terms = terms_used[0:5]
for i in comparison_terms:
    observed_expected.append(wordcount(i))
observed_expected

[(4, 1), (7, 1), (1, 0), (1, 0), (1, 0)]

In [19]:
from scipy.stats import chisquare
import numpy as np
high_value_count = len(jeopardy[jeopardy["high_value"] == 1])
low_value_count = len(jeopardy[jeopardy["high_value"] == 0])
chi_squared = []
for i in observed_expected:
    total = sum(i)
    total_prop = total/jeopardy.shape[0]
    expected_high = total_prop*high_value_count 
    expected_low = total_prop*low_value_count
    observed = np.array([i[0],i[1]])
    expected = np.array([expected_high,expected_low])
    chi_squared.append(chisquare(observed,expected))
chi_squared

[Power_divergenceResult(statistic=6.5807438519948587, pvalue=0.010308780400962616),
 Power_divergenceResult(statistic=13.806625159523749, pvalue=0.00020262047598010479),
 Power_divergenceResult(statistic=2.5317964247338085, pvalue=0.11157312838169751),
 Power_divergenceResult(statistic=2.5317964247338085, pvalue=0.11157312838169751),
 Power_divergenceResult(statistic=2.5317964247338085, pvalue=0.11157312838169751)]