In [32]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import re
import random

In [2]:
# Load the Jeopardy dataset from a CSV file in the current working directory.
df = pd.read_csv('jeopardy.csv')

In [3]:
# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(19999, 7)

In [5]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 7 columns):
Show Number    19999 non-null int64
 Air Date      19999 non-null object
 Round         19999 non-null object
 Category      19999 non-null object
 Value         19999 non-null object
 Question      19999 non-null object
 Answer        19999 non-null object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


In [6]:
# Remove every space from the column names (some headers carry a leading
# space), so columns can be addressed as e.g. 'AirDate' and 'ShowNumber'.
df.columns = df.columns.str.replace(' ', '')

In [7]:
# Confirm the cleaned column names.
df.columns

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [8]:
def normalize_string (string):
    """Return a lowercase copy of ``string`` prepared for word matching.

    Apostrophes are deleted outright (so "don't" becomes "dont"); every other
    non-word character, including whitespace, is replaced by a single space.
    """
    without_apostrophes = re.sub(r'[\']', '', string.lower())
    return re.sub(r'\W', ' ', without_apostrophes)

In [9]:
# Strip HTML tags (e.g. <a href=...>) from the question text.
# regex=True is passed explicitly: newer pandas versions treat the pattern as a
# literal string by default, which would silently leave every tag in place.
df['Question'] = df['Question'].str.replace(r'<.*?>', '', regex=True)

In [10]:
# Normalized question text (lowercase, punctuation blanked out) used for all
# later word-level comparisons.
df['CleanQuestion'] = df['Question'].apply(normalize_string)

In [11]:
# Spot-check the normalized question text.
df['CleanQuestion']

0        for the last 8 years of his life  galileo was ...
1        no  2  1912 olympian  football star at carlisl...
2        the city of yuma in this state has a record av...
3        in 1963  live on  the art linkletter show   th...
4        signer of the dec  of indep   framer of the co...
                               ...                        
19994    of 8  12 or 18  the number of u s  states that...
19995                             the new power generation
19996    in 1589 he was appointed professor of mathemat...
19997    before the grand jury she said   im really sor...
19998    llamas are the heftiest south american members...
Name: CleanQuestion, Length: 19999, dtype: object

In [12]:
# Apply the same normalization to answers so they compare word-for-word with
# the cleaned questions.
df['CleanAnswer'] = df['Answer'].apply(normalize_string)

In [13]:
def normalize_dollar_values (string):
    """Convert a dollar-value string such as "$2,000" into an integer.

    Everything except word characters and whitespace is removed before
    parsing, so currency symbols and thousands separators are dropped.

    Returns:
        The parsed integer, or 0 when the remaining text is not a valid
        integer (for example an empty or non-numeric value).
    """
    # Raw string: '\w' and '\s' are invalid escape sequences in a plain literal.
    digits = re.sub(r'[^\w\s]', '', string)
    try:
        return int(digits)
    except ValueError:
        # Only a failed int() parse is expected here; anything else should surface.
        return 0

In [14]:
# Integer dollar value per question; values that cannot be parsed become 0.
df['CleanValue'] = df['Value'].apply(normalize_dollar_values)

In [15]:
# Parse air dates into datetimes so rows can later be ordered chronologically.
df['AirDate'] = pd.to_datetime(df['AirDate'])

In [16]:
def match_question_answer (row_as_series):
    """Return the fraction of answer words that also occur in the question.

    The word "the" is dropped from the answer before counting. When no answer
    words remain, 0 is returned. Expects ``row_as_series`` to provide
    'CleanAnswer' and 'CleanQuestion' entries.
    """
    answer_words = [
        word for word in row_as_series['CleanAnswer'].split() if word != 'the'
    ]
    question_words = row_as_series['CleanQuestion'].split()
    if not answer_words:
        return 0
    hits = sum(1 for word in answer_words if word in question_words)
    return hits / len(answer_words)

In [17]:
# Score every row; axis=1 hands each row to the function as a Series.
df['AnswerInQuestion'] = df.apply(match_question_answer, axis=1)

In [18]:
# Mean share of answer words (excluding "the") that also occur in the question.
df['AnswerInQuestion'].mean()

0.05775619269918984

As we could have predicted, answers almost never appear in Jeopardy questions: on average, fewer than 6% of an answer's words (ignoring "the") also show up in its question. We'll need to study to do well.

In [19]:
# Order questions chronologically so "previously used" terms really are earlier.
# sort_values keeps each row's original index label; the index is not reset.
df = df.sort_values(by='AirDate')
terms_used = set()
question_overlap = []

In [20]:
# For each question (in the frame's current row order), compute the share of
# its long words (six or more characters) that were already seen earlier.
# The accumulators are reset here so that re-running this cell gives the same
# result instead of matching every word against an already-full term set.
question_overlap = []
terms_used = set()
for question in df['CleanQuestion']:
    long_words = [word for word in question.split() if len(word) >= 6]
    match_count = 0
    for word in long_words:
        if word in terms_used:
            match_count += 1
        # Added immediately, so a word repeated within one question counts as
        # a match on its second occurrence.
        terms_used.add(word)
    if len(long_words) > 0:
        match_count /= len(long_words)
    question_overlap.append(match_count)

In [21]:
# question_overlap is in df's current row order, but df was sorted and still
# carries its original index labels. A bare pd.Series would get a fresh 0..n-1
# index and be aligned by label, attaching each score to the wrong row, so the
# Series is built on df's own index.
df['QuestionOverlap'] = pd.Series(question_overlap, index=df.index)

In [22]:
# Average share of a question's long words that had appeared in an earlier question.
print(df['QuestionOverlap'].mean())

0.7154379799156523


On average, about 72% of the terms in a question had already appeared in an earlier question, when looking only at words of six or more letters. This may include some relatively common and unspecific words, but is worth deeper examination.

In [23]:
def categorize_value (row):
    """Return 1 when the row's 'CleanValue' is greater than 800, otherwise 0."""
    return 1 if row['CleanValue'] > 800 else 0

In [24]:
# Flag each question as high value (1) or low value (0).
df['HighValue'] = df.apply(categorize_value, axis=1)

In [25]:
def word_value (word, data=None):
    """Count the high- and low-value questions that contain ``word``.

    A question contains the word when it equals one of the whitespace-separated
    tokens of its 'CleanQuestion' text.

    Args:
        word: The term to look for.
        data: Table with 'CleanQuestion' and 'HighValue' columns. Defaults to
            the notebook-level ``df`` when omitted.

    Returns:
        A ``(high_count, low_count)`` tuple, in that order.
    """
    if data is None:
        data = df
    low_count = 0
    high_count = 0
    # Only two columns are needed, so iterate over them directly rather than
    # materialising every row as a Series.
    for question, high_flag in zip(data['CleanQuestion'], data['HighValue']):
        if word in question.split():
            if high_flag == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [26]:
# random.sample needs a sequence: recent Python versions reject sets, and a
# set of strings has no stable order between runs, so the seed alone would not
# make the sample reproducible. Sorting first fixes both problems.
random.seed(1987)
comparison_terms = random.sample(sorted(terms_used), 10)

In [27]:
# Holds one (high_count, low_count) tuple per comparison term.
observed_expected = []

In [28]:
# Build the list in one go instead of appending, so re-running this cell
# cannot accumulate duplicate entries. Each entry is (high_count, low_count).
observed_expected = [word_value(term) for term in comparison_terms]

In [29]:
# Overall number of low-value (flag 0) and high-value (flag 1) questions,
# looked up by flag label in the value counts.
low_value_count, high_value_count = (
    df['HighValue'].value_counts()[flag] for flag in (0, 1)
)

In [30]:
# Collects one chi-squared test result per comparison term.
chi_squared = []

In [33]:
# Start from an empty list so re-running this cell never appends duplicates.
chi_squared = []
# Each entry of observed_expected is a (high_count, low_count) pair.
for high_observed, low_observed in observed_expected:
    total = high_observed + low_observed
    # Share of all questions that contain this term; applying it to the overall
    # high/low totals gives the counts expected if the term were unrelated to value.
    total_prop = total / len(df)
    expected_high = total_prop * high_value_count
    expected_low = total_prop * low_value_count
    observed = np.array([high_observed, low_observed])
    expected = np.array([expected_high, expected_low])
    # NOTE: the chi-squared approximation is unreliable when observed or
    # expected counts are small (common rule of thumb: below 5), which is
    # likely for rarely used terms. Treat such p-values with caution.
    chi_squared.append(stats.chisquare(observed, expected))

In [34]:
# Inspect the chi-squared statistic and p-value for each sampled term.
chi_squared

[Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.42281054506129573, pvalue=0.515537958129453),
 Power_divergenceResult(statistic=0.4448774816612795, pvalue=0.5047776487545996),
 Power_divergenceResult(statistic=0.889754963322559, pvalue=0.3455437191483469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469)]