In [51]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import scipy.stats
%matplotlib inline

In [2]:
jeopardy = pd.read_csv('jeopardy.csv')
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [4]:
real_cols = []
for col in jeopardy.columns:
    m = re.search(r'\S.+',col)
    if m:
        real_cols.append(m.group(0))
        print(m.group(0))
    else:
        real_cols.append(col)
jeopardy.columns = real_cols

Show Number
Air Date
Round
Category
Value
Question
Answer


In [5]:
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [6]:
def normalize_text(string):
    string = string.lower()
    str_list = re.split(r'\W+',string)
    normal = ' '.join(str_list)
    return normal


In [7]:
print(normalize_text('No. 2: 1912 Olympian; football sta'))

no 2 1912 olympian football sta


In [8]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_text)

In [9]:
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_text)

In [10]:
def norm_cash(val):
    if val != "None":
        num = re.search(r'\w+',val)
        num = num.group(0)
        num = int(num)
        return num
    else:
        return 0

In [11]:
jeopardy['clean_value'] = jeopardy['Value'].apply(norm_cash)

In [12]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

In [13]:
def check_q(check):
    split_answer = check['clean_answer'].split(' ')
    split_question = check['clean_question'].split(' ')
    match_count = 0
    while split_answer.count('the') > 0:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    for word in split_answer:
        if split_question.count(word) > 0:
            match_count += 1
    return match_count / len(split_answer)

In [14]:
jeopardy['answer_in_question'] = jeopardy.apply(check_q, axis=1)

In [15]:
jeopardy['answer_in_question'].mean()

0.068723868400102209

In [16]:
question_overlap = []
terms_used = set()
jeopardy.shape

(19999, 11)

In [17]:
for index, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    for word in split_question:
        if len(word) < 6:
            split_question.remove(word)
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
        terms_used.add(word)
    if len(split_question) > 0:
        match_count = match_count/len(split_question)
    question_overlap.append(match_count)

jeopardy['question_overlap'] = question_overlap

In [18]:
jeopardy['question_overlap'].mean()

0.83168545495871471

Very few questions are recycled, so studying old questions will likely not help very much. It would be better to sort questions by subject matter or categories and then study those categories.

In [19]:
def check_value(row):
    val = row['clean_value']
    if val > 800:
        value = 1
    else:
        value = 0
    return value

In [20]:
jeopardy['high_value'] = jeopardy.apply(check_value, axis=1)

In [31]:
splits=[]
jeopardy['split_question'] = ''
for index, row in jeopardy.iterrows():
    row['split_question'] = row['clean_question'].split(' ')
    splits.append(row['split_question'])

jeopardy['split_question'] = splits

In [32]:
print(jeopardy['split_question'])

0        [for, the, last, 8, years, of, his, life, gali...
1        [no, 2, 1912, olympian, football, star, at, ca...
2        [the, city, of, yuma, in, this, state, has, a,...
3        [in, 1963, live, on, the, art, linkletter, sho...
4        [signer, of, the, dec, of, indep, framer, of, ...
5        [in, the, title, of, an, aesop, fable, this, i...
6        [built, in, 312, b, c, to, link, rome, the, so...
7        [no, 8, 30, steals, for, the, birmingham, baro...
8        [in, the, winter, of, 1971, 72, a, record, 1, ...
9        [this, housewares, store, was, named, for, the...
10                                 [, and, away, we, go, ]
11       [cows, regurgitate, this, from, the, first, st...
12       [in, 1000, rajaraja, i, of, the, cholas, battl...
13       [no, 1, lettered, in, hoops, football, lacross...
14       [on, june, 28, 1994, the, nat, l, weather, ser...
15       [this, company, s, accutron, watch, introduced...
16       [outlaw, murdered, by, a, traitor, and, a, cow.

In [44]:
def word_count(word):
    low_count = 0
    high_count = 0
    for index, row in jeopardy.iterrows():
        if word in row['split_question']:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return low_count, high_count

In [48]:
jeopardy.shape[0]

19999

In [49]:
low_high = []
terms_used = list(terms_used)
comparison_terms = terms_used[1:6]

for term in comparison_terms:
    low_high.append(word_count(term))

In [50]:
print(comparison_terms)
print(low_high)

['promontory', 'collyer', 'wingspan', 'sydney', 'replace']
[(4, 0), (0, 1), (1, 1), (8, 5), (9, 0)]


In [41]:
low_value_count = jeopardy['high_value'].value_counts()[0]
high_value_count = jeopardy['high_value'].value_counts()[1]

In [60]:
chi_squared = []
for count_list in low_high:
    total = sum(count_list)
    total_prop = total / jeopardy.shape[0]
    expected_term_high = total_prop * high_value_count
    expected_term_low = total_prop * low_value_count
    chi_square, p = scipy.stats.chisquare(count_list[1],expected_term_high)
    chi_squared.append((chi_square,p))
    

In [61]:
print(chi_squared)

[(0.99444972248612429, nan), (2.270937450734162, nan), (0.50838737129937761, nan), (0.96720202137342481, nan), (2.2375118755937797, nan)]


Find a better way to eliminate non-informative words than just removing words that are less than 6 characters long. Some ideas:
Manually create a list of words to remove, like the, than, etc.
Find a list of stopwords to remove.
Remove words that occur in more than a certain percentage (like 5%) of questions.
Perform the chi-squared test across more terms to see what terms have larger differences. This is hard to do currently because the code is slow, but here are some ideas:
Use the apply method to make the code that calculates frequencies more efficient.
Only select terms that have high frequencies across the dataset, and ignore the others.
Look more into the Category column and see if any interesting analysis can be done with it. Some ideas:
See which categories appear the most often.
Find the probability of each category appearing in each round.
Use the whole Jeopardy dataset (available here) instead of the subset we used in this mission.
Use phrases instead of single words when seeing if there's overlap between questions. Single words don't capture the whole context of the question well.