# Jeopardy Hacking

Here we have a dataset of historical questions from the gameshow Jeopardy

The goal is to determine any trends in the questions asked on Jeopardy to determine if we can gain an edge if we were to compete on the show

# Import data

In [1]:
import pandas as pd
import numpy as np
import random

jeopardy = pd.read_csv('jeopardy.csv')
print(jeopardy.head(3))


   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  


# Clean data

### Clean columns

In [2]:
print(jeopardy.columns)


Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [3]:
jeopardy.columns = jeopardy.columns.str.strip().str.replace(" ","_").str.lower()


In [4]:
print(jeopardy.columns)


Index(['show_number', 'air_date', 'round', 'category', 'value', 'question',
       'answer'],
      dtype='object')


### Clean 'question' and 'answer' columns

In [5]:
print(jeopardy.question[:5])
print(jeopardy.answer[:5])


0    For the last 8 years of his life, Galileo was ...
1    No. 2: 1912 Olympian; football star at Carlisl...
2    The city of Yuma in this state has a record av...
3    In 1963, live on "The Art Linkletter Show", th...
4    Signer of the Dec. of Indep., framer of the Co...
Name: question, dtype: object
0    Copernicus
1    Jim Thorpe
2       Arizona
3    McDonald's
4    John Adams
Name: answer, dtype: object


In [6]:
"""

Remove any non-word characters to create a dictionary of words used

"""

jeopardy['question'] = jeopardy.question.str.replace('\W', ' ', regex=True).str.replace('  ', ' ').str.lower()
jeopardy['answer'] = jeopardy.answer.str.replace('\W', ' ', regex=True).str.replace('  ', ' ').str.lower()


In [7]:
print(jeopardy.question[:5])
print(jeopardy.answer[:5])


0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show  this ...
4    signer of the dec of indep  framer of the cons...
Name: question, dtype: object
0    copernicus
1    jim thorpe
2       arizona
3    mcdonald s
4    john adams
Name: answer, dtype: object


### Clean question 'value' column

In [8]:
jeopardy['value'] = jeopardy.value.replace('None', np.nan).str.replace('$', '').str.replace(',', '').astype(float)
print(jeopardy.value.value_counts(dropna=False))


400.0     3892
800.0     2980
200.0     2784
1000.0    1980
600.0     1890
          ... 
7400.0       1
1492.0       1
6200.0       1
6800.0       1
3389.0       1
Name: value, Length: 72, dtype: int64


  jeopardy['value'] = jeopardy.value.replace('None', np.nan).str.replace('$', '').str.replace(',', '').astype(float)


### Clean 'air_date' column

In [9]:
jeopardy['air_date'] = pd.to_datetime(jeopardy.air_date)
print(jeopardy.air_date[:5])


0   2004-12-31
1   2004-12-31
2   2004-12-31
3   2004-12-31
4   2004-12-31
Name: air_date, dtype: datetime64[ns]


# Determine if the answers are ever in the questions

In [10]:
"""

This function determines how much of the answer is in the question as a ratio

"""

def count_matches(row):
    split_answer = row['answer'].split()
    split_question = row['question'].split()
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for word in split_answer:
        if word in split_question:
            match_count += 1
    return match_count / len(split_answer)


In [11]:
jeopardy['answer_in_question'] = jeopardy.apply(count_matches, axis=1)
print(jeopardy['answer_in_question'].value_counts())


0.000000    17247
0.500000     1449
0.333333      622
0.250000      204
1.000000      128
0.666667      118
0.200000       72
0.166667       37
0.400000       34
0.750000       20
0.142857       20
0.125000       12
0.600000       10
0.285714        7
0.428571        4
0.800000        3
0.571429        2
0.111111        2
0.181818        2
0.307692        1
0.300000        1
0.444444        1
0.375000        1
0.222222        1
0.350000        1
Name: answer_in_question, dtype: int64


In [12]:
print(jeopardy['answer_in_question'].mean())


0.06294645581984942


#### 6.3% of the time the answer is in the question. This is not enough to gain an edge in the game, but does indicate that certain topics may be recycled

# Are Jeopardy questions recycled

In [13]:
question_overlap = []
terms_used = set()
h = 0
for index, row in jeopardy.iterrows():
    split_question = row['question'].split()
    match_count = 0
    for word in split_question:
        if len(word) > 5:                 # using words with more than 5 letters
            if word in terms_used:
                match_count += 1
            terms_used.add(word)
    if len(split_question) > 0:
        question_overlap.append(match_count / len(split_question))
jeopardy['question_overlap'] = question_overlap
    
print(np.mean(question_overlap))


0.21574382782134924


#### 21.5% of the terms used in the questions have overlap, this clearly indicates that some topics are more asked about than others and there may be certain topics worth studying more than others

# Correlating high value points with recycled terms

In [14]:
"""

This function differentiates high value questions
(questions worth more than $800)
and low value questions

"""

def high_value(row):
    if row['value'] > 800:
        return 1
    else:
        return 0


In [15]:
# Verifying function high_value(row)

jeopardy['high_value_question'] = jeopardy.apply(high_value, axis=1)
print(jeopardy['high_value_question'].value_counts())
print(len(jeopardy['value'][jeopardy['value'] > 800]))


0    14265
1     5734
Name: high_value_question, dtype: int64
5734


In [16]:
"""

This function determines how many times a word is in a question

The count is returned for the high value questions and the low value questions

"""


def word_count(word):
    low_count = 0
    high_count = 0
    for index, row in jeopardy.iterrows():
        split_question = row.question.split()
        if word in split_question:
            if row.high_value_question == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count
    

In [17]:
"""

Create a sample to compare back to the dataset and determine chi-squared

"""

terms_list = list(terms_used)
comparison_terms = random.sample(terms_list, 10)
observed_expected = []
for term in comparison_terms:
    observed_expected.append(word_count(term))


In [18]:
print(observed_expected)


[(0, 2), (1, 1), (6, 10), (1, 0), (0, 1), (1, 1), (3, 1), (1, 2), (1, 0), (0, 1)]


In [19]:
from scipy.stats import chisquare

high_value_count = len(jeopardy[jeopardy.high_value_question == 1])
low_value_count = len(jeopardy[jeopardy.high_value_question == 0])
chi_squared = []
for items in observed_expected:
    total = sum(items)
    total_prop = total/len(jeopardy)
    exp_high = total_prop * high_value_count
    exp_low = total_prop * low_value_count

    observed = np.array([items[0], items[1]])
    expected = np.array([exp_high, exp_low])
    chi_squared.append(chisquare(observed, expected))


In [22]:
for i in chi_squared:
    if i.pvalue < .04:    # setting p-value to 0.4
        print(i.pvalue)

# Conclusion:
In this small sample, there doesn't appear to be a correlation between recycled questions and question value. 