In [24]:
import pandas
import csv

jeopardy = pandas.read_csv("jeopardy.csv")


In [25]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

In [26]:
import re

def normalize_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]','',text)     
    return text

def normalize_values(text):
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    try:
        text = int(text)
    except Exception:
        text = 0
    return text

In [27]:
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize_text)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize_text)
jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_values)

In [28]:
print(jeopardy["clean_value"].head())

0    200
1    200
2    200
3    200
4    200
Name: clean_value, dtype: int64


In [29]:
jeopardy["Air Date"] = pandas.to_datetime(jeopardy["Air Date"])
jeopardy.dtypes
print(jeopardy['clean_answer'].head(10))

0        copernicus
1        jim thorpe
2           arizona
3         mcdonalds
4        john adams
5           the ant
6    the appian way
7    michael jordan
8        washington
9     crate  barrel
Name: clean_answer, dtype: object


In [30]:
def answq(row):
    split_answer = row['clean_answer'].split(' ')
    split_question = row['clean_question'].split(' ')
    
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for item in split_answer:
        if item in split_question:
            match_count = match_count + 1
    return match_count / len(split_answer)
answer_in_question = jeopardy.apply(answq, axis=1)
#print(answer_in_question)
print(answer_in_question.mean())  
print(jeopardy['clean_question'][8])
print(jeopardy['clean_answer'][8])


0.0604932570693
in the winter of 197172 a record 1122 inches of snow fell at rainier paradise ranger station in this state
washington


just 6% of the times a word in a question is found in the answer, given an indication that if a contestant wants to get good results it is better to study all sort of questions.

In [71]:
 question_overlap = []
terms_used = set()
for index,row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    split_question = [val for val in split_question if len(val) > 5]
    match_count = 0
    for w in split_question:
        if w in terms_used:
            match_count = match_count + 1
    for w in split_question:    
        terms_used.add(w)
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)
jeopardy["question_overlap"] = question_overlap
jeopardy["question_overlap"].mean()
#print(jeopardy["question_overlap"])
print(terms_used)




Even though the mean is high, the results do not imply that the questions formulated were repeated quite often.  First the sample size is small and only represents a fraction of all the questions formulated along many years of the show being on air.  Second the terms used represent all unique words used to formulate questions and some of those words could have been used repetitively without being related to the same answer at all.   However it is not a bad idea to review as many questions as possible from previous shows as there is a chance that some will be formulated again.

In [32]:
lowval = jeopardy[jeopardy['clean_value'] < 800]
highval = jeopardy[jeopardy['clean_value'] > 800]
lowq = lowval["clean_question"]
highq = lowval['clean_question']
print(lowq.head())

0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
Name: clean_question, dtype: object


In [33]:
def valu(row):
    if row["clean_value"] > 800:
        value = 1
    else:
        value = 0
    return value
jeopardy["high_value"] = jeopardy.apply(valu, axis = 1)
print(jeopardy['high_value'].head())
#print(jeopardy["clean_value"].head())


0    0
1    0
2    0
3    0
4    0
Name: high_value, dtype: int64


In [34]:
def wrd(word):
    low_count = 0
    high_count = 0
    for index, row in jeopardy.iterrows():
        splitq = row['clean_question'].split(' ')
        if word in splitq:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count,low_count
observed_expected = []
terms_used = list(terms_used)
comparison_terms = terms_used[0:5]
print(comparison_terms)
for w in comparison_terms:
    observed_expected.append(wrd(w))
print(observed_expected)



['benelux', 'secede', 'dutchmans', 'rapperturnedactor', 'choice']
[(0, 2), (0, 2), (1, 0), (0, 1), (8, 11)]


In [35]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for val in observed_expected:
    tval = sum(val)
    pval = tval / jeopardy.shape[0]
    hivalexp = pval * high_value_count
    lowvalexp = pval * low_value_count
    observed = np.array([val[0],val[1]])
    expected = np.array([hivalexp,lowvalexp])
    chi_squared.append(chisquare(observed,expected))
chi_squared




[Power_divergenceResult(statistic=0.80392569225376798, pvalue=0.36992223780795708),
 Power_divergenceResult(statistic=0.80392569225376798, pvalue=0.36992223780795708),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=1.6766420990461135, pvalue=0.19537123671498813)]

In [36]:
print(len(terms_used))

24470


p-values and chi_square values were low, indicating the there is no much difference between expected and observed results.  As it is a small sample, this result does not mean that there is no a relationship between high value questions and words used in those questions.

In [37]:
mostcat = jeopardy['Category']
category = {}
for row in mostcat:
    if row not in category:
        category[row] = 1
    else:
        category[row] += 1
        maximum = max(category, key=category.get)  
category = sorted(category.items(), key=lambda x: -x[1])
print(len(category))
print(category)
maxcat = maximum,category[maximum]
print(maxcat)

3581
[('TELEVISION', 51), ('U.S. GEOGRAPHY', 50), ('LITERATURE', 45), ('AMERICAN HISTORY', 40), ('BEFORE & AFTER', 40), ('HISTORY', 40), ('AUTHORS', 39), ('WORD ORIGINS', 38), ('WORLD CAPITALS', 37), ('SPORTS', 36), ('BODIES OF WATER', 36), ('SCIENCE & NATURE', 35), ('MAGAZINES', 35), ('RHYME TIME', 35), ('SCIENCE', 35), ('WORLD GEOGRAPHY', 33), ('HISTORIC NAMES', 32), ('ANNUAL EVENTS', 32), ('WORLD HISTORY', 32), ('BIRDS', 31), ('IN THE DICTIONARY', 31), ('FICTIONAL CHARACTERS', 31), ('MEDICINE', 30), ('ISLANDS', 30), ('POTPOURRI', 30), ('U.S. PRESIDENTS', 30), ('OPERA', 30), ('TRAVEL & TOURISM', 30), ('BALLET', 29), ('ART', 28), ('ORGANIZATIONS', 27), ('NONFICTION', 27), ('STATE CAPITALS', 27), ('U.S. CITIES', 26), ('PEOPLE', 26), ('BUSINESS & INDUSTRY', 26), ('THE MOVIES', 26), ('BRAND NAMES', 26), ('THE AMERICAN REVOLUTION', 25), ('ANATOMY', 25), ('MAMMALS', 25), ('HOLIDAYS & OBSERVANCES', 25), ('MUSEUMS', 25), ('MUSICAL THEATRE', 25), ('SAINTS', 25), ('ART & ARTISTS', 25), ('FOOD 

TypeError: list indices must be integers, not str

In [38]:
catround = pandas.crosstab(jeopardy['Round'],jeopardy['Category'])
print(catround)

Category          "A" IN SCIENCE  "A" PLUS  "A" SCIENCE CATEGORY  \
Round                                                              
Double Jeopardy!               5         5                     4   
Final Jeopardy!                0         0                     0   
Jeopardy!                      0         0                     0   
Tiebreaker                     0         0                     0   

Category          "A"NCIENT GREEKS  "AA"  "AD"JECTIVES  \
Round                                                    
Double Jeopardy!                 5     5             5   
Final Jeopardy!                  0     0             0   
Jeopardy!                        0     0             0   
Tiebreaker                       0     0             0   

Category          "AE"-NCIENT CROSSWORD CLUES  "AI"  "ANT" INFESTATION  \
Round                                                                    
Double Jeopardy!                            5     5                  5   
Final Jeopardy!     

In [39]:
catround = (jeopardy[['Round','Category']])
print(catround.head())
tieb = catround[catround['Round'] == "Final Jeopardy!"]
print(tieb)
 
        

       Round                         Category
0  Jeopardy!                          HISTORY
1  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES
2  Jeopardy!      EVERYBODY TALKS ABOUT IT...
3  Jeopardy!                 THE COMPANY LINE
4  Jeopardy!              EPITAPHS & TRIBUTES
                 Round                           Category
55     Final Jeopardy!                   THE SOLAR SYSTEM
116    Final Jeopardy!                     HISTORIC WOMEN
174    Final Jeopardy!                     SPORTS LEGENDS
235    Final Jeopardy!                  THE MAP OF EUROPE
296    Final Jeopardy!                       FAMOUS SHIPS
357    Final Jeopardy!                   EUROPEAN HISTORY
418    Final Jeopardy!                     BRITISH NOVELS
474    Final Jeopardy!                 FLAGS OF THE WORLD
535    Final Jeopardy!                THE BRITISH THEATRE
594    Final Jeopardy!                   THE CONSTITUTION
655    Final Jeopardy!                 GEOGRAPHIC PHRASES
716    Final Jeopardy!      

In [40]:
rdic = {}
rnd = jeopardy['Round']
for row in rnd:
    if row not in rdic:
        rdic[row] = 1
    else:
        rdic[row] += 1
rdic = sorted(rdic.items(), key=lambda x: -x[1])
print(rdic)
rdlst = []
for row in rnd:
    if row not in rdlst:
        rdlst.append(row)
print(rdlst)
print(rdic[0][1])        
        

[('Jeopardy!', 9901), ('Double Jeopardy!', 9762), ('Final Jeopardy!', 335), ('Tiebreaker', 1)]
['Jeopardy!', 'Double Jeopardy!', 'Final Jeopardy!', 'Tiebreaker']
9901


In [41]:
lj = []
for index,row in catround.iterrows():
    for r in rdlst:
        if row['Round'] == r:
            lj.append((r,row["Category"]))
print(lj[0:1])
t = {}
for l in lj:
    if l not in t:
        t[l] = 1
    else:
        t[l] += 1
t = sorted(t.items(), key=lambda x: -x[1])
t = t[0:20]
print(t)

[('Jeopardy!', 'HISTORY')]
[(('Double Jeopardy!', 'LITERATURE'), 35), (('Jeopardy!', 'TELEVISION'), 35), (('Double Jeopardy!', 'SCIENCE & NATURE'), 30), (('Double Jeopardy!', 'ISLANDS'), 30), (('Double Jeopardy!', 'IN THE DICTIONARY'), 30), (('Double Jeopardy!', 'BEFORE & AFTER'), 30), (('Double Jeopardy!', 'U.S. GEOGRAPHY'), 28), (('Jeopardy!', 'SPORTS'), 26), (('Jeopardy!', 'U.S. CITIES'), 25), (('Double Jeopardy!', 'HISTORIC NAMES'), 25), (('Double Jeopardy!', 'SCIENCE'), 25), (('Double Jeopardy!', 'OPERA'), 25), (('Double Jeopardy!', 'AMERICAN HISTORY'), 25), (('Jeopardy!', 'FOOD FACTS'), 25), (('Double Jeopardy!', 'WORD ORIGINS'), 25), (('Double Jeopardy!', 'WORLD GEOGRAPHY'), 25), (('Double Jeopardy!', 'WORLD CAPITALS'), 25), (('Jeopardy!', 'RHYME TIME'), 25), (('Jeopardy!', 'BIRDS'), 23), (('Double Jeopardy!', 'ART'), 23)]


In [42]:
dprob = {}
for i in t:
    for w in rdic:
        if w[0] in i[0]:
            dprob[i[0]] = i[1]/w[1]
dprob = sorted(dprob.items(), key=lambda x: -x[1])
print(dprob)            
        

[(('Double Jeopardy!', 'LITERATURE'), 0.0035853308748207335), (('Jeopardy!', 'TELEVISION'), 0.003534996465003535), (('Double Jeopardy!', 'IN THE DICTIONARY'), 0.003073140749846343), (('Double Jeopardy!', 'ISLANDS'), 0.003073140749846343), (('Double Jeopardy!', 'SCIENCE & NATURE'), 0.003073140749846343), (('Double Jeopardy!', 'BEFORE & AFTER'), 0.003073140749846343), (('Double Jeopardy!', 'U.S. GEOGRAPHY'), 0.002868264699856587), (('Jeopardy!', 'SPORTS'), 0.002625997374002626), (('Double Jeopardy!', 'WORLD CAPITALS'), 0.0025609506248719526), (('Double Jeopardy!', 'HISTORIC NAMES'), 0.0025609506248719526), (('Double Jeopardy!', 'OPERA'), 0.0025609506248719526), (('Double Jeopardy!', 'WORLD GEOGRAPHY'), 0.0025609506248719526), (('Double Jeopardy!', 'SCIENCE'), 0.0025609506248719526), (('Double Jeopardy!', 'AMERICAN HISTORY'), 0.0025609506248719526), (('Double Jeopardy!', 'WORD ORIGINS'), 0.0025609506248719526), (('Jeopardy!', 'FOOD FACTS'), 0.002524997475002525), (('Jeopardy!', 'RHYME TIM

In [43]:
#which categories appear the most often.
jeopard_rd ={}
def rdresults(roundname):
    rdresults = {}
    for index,row in catround.iterrows():
        if row['Category'] not in rdresults:
            rdresults[row['Category']] = 0
        if row['Round'] == roundname:
            rdresults[row['Category']] += 1
    rdresults = sorted(rdresults.items(), key=lambda x: -x[1])
    return rdresults
jeopard1_rd = rdresults("Jeopardy!")
doublejeop_rd = rdresults("Double Jeopardy!")
finaljeop_rd = rdresults("Final Jeopardy!")
tiebraker_rd = rdresults("Tiebraker")

print(jeopard1_rd[0:10])
print(doublejeop_rd[0:10])
print(finaljeop_rd[0:10])
print(tiebraker_rd[0:10])
#for index,row in catround.iterrows():
#    if row['Category'] not in jeopard_rd:
#        jeopard_rd[row['Category']] = 0
#   if row['Round'] == 'Jeopardy!':
#        jeopard_rd[row['Category']] += 1
#jeopard_rd = sorted(jeopard_rd.items(), key=lambda x: -x[1])

     

[('TELEVISION', 35), ('SPORTS', 26), ('U.S. CITIES', 25), ('RHYME TIME', 25), ('FOOD FACTS', 25), ('BIRDS', 23), ('U.S. GEOGRAPHY', 22), ('NATURE', 20), ('ORGANIZATIONS', 20), ('STATE CAPITALS', 20)]
[('LITERATURE', 35), ('SCIENCE & NATURE', 30), ('ISLANDS', 30), ('BEFORE & AFTER', 30), ('IN THE DICTIONARY', 30), ('U.S. GEOGRAPHY', 28), ('WORLD CAPITALS', 25), ('HISTORIC NAMES', 25), ('WORD ORIGINS', 25), ('AMERICAN HISTORY', 25)]
[('WORD ORIGINS', 8), ('U.S. PRESIDENTS', 5), ('FAMOUS NAMES', 4), ('AUTHORS', 4), ('AMERICAN LITERATURE', 3), ('THE 50 STATES', 3), ('U.S. STATES', 3), ('FAMOUS WOMEN', 3), ('SPACE EXPLORATION', 3), ('SCIENTISTS', 3)]
[('4-LETTER CAPITALS', 0), ('HISTORIC QUOTES', 0), ('EPONYMS', 0), ('STUDYING ABROAD', 0), ('GETTING POSSESSIVE', 0), ('CAR TUNES', 0), ('COMPOSERS ON FILM', 0), ('AT HOME WITH A GOOD BOOK', 0), ('BIBLICAL PAIRS', 0), ('DO US A FLAVOR', 0)]


In [52]:
# Find the probability of each category appearing in each round. Selecting only top 20 of ea round
jeopard1_rd = jeopard1_rd[0:20]
doublejeop_rd = doublejeop_rd[0:20]
tiebraker_rd = tiebraker_rd[0:10]
finaljeop_rd = finaljeop_rd[0:10]
#print(finaljeop_rd)

#print(jeopard1_rd) 
#print('round_ttls',rdic)
#jeopard1_rd[0] = jeopard1_rd[0] - (1,)
#print(jeopard1_rd)


def prob(lst):
    lstrd = ['jeopard1_rd','doublejeop_rd','finaljeop_rd', 'tiebraker_rd']
    prob = {}
    for k,l in enumerate(lstrd):
        if l == lst:
            ind = k
        for i in lst:
            if i[0] not in prob:
                prob[i[0]] = round((i[1]/rdic[k][1])*100,2)
        prob = sorted(prob.items(), key = lambda x: -x[1])        
        return prob
prob_jeopardrd = prob(jeopard1_rd)
print('Probabilities(%) per category for Jeopardy question',prob_jeopardrd)
prob_doubjeopardrd = prob(doublejeop_rd)
print('Probabilities(%) per category for Double Jeopardy question',prob_doubjeopardrd)
prob_tiebreaker = prob(tiebraker_rd)
print('Probabilities(%) per category for Tiebreak question',prob_tiebreaker)
prob_finaljeop = prob(finaljeop_rd) 
print("Probabilities(%) per category for Final Jeopardy question",prob_finaljeop)

Probabilities(%) per category for Jeopardy question [('TELEVISION', 0.35), ('SPORTS', 0.26), ('RHYME TIME', 0.25), ('FOOD FACTS', 0.25), ('U.S. CITIES', 0.25), ('BIRDS', 0.23), ('U.S. GEOGRAPHY', 0.22), ('BODIES OF WATER', 0.2), ('MUSEUMS', 0.2), ('NATURE', 0.2), ('COMMON BONDS', 0.2), ('STUPID ANSWERS', 0.2), ('MAMMALS', 0.2), ('STATE CAPITALS', 0.2), ('BUSINESS & INDUSTRY', 0.2), ('BRAND NAMES', 0.2), ('HISTORY', 0.2), ('TRAVEL & TOURISM', 0.2), ('POTPOURRI', 0.2), ('ORGANIZATIONS', 0.2)]
Probabilities(%) per category for Double Jeopardy question [('LITERATURE', 0.35), ('BEFORE & AFTER', 0.3), ('ISLANDS', 0.3), ('IN THE DICTIONARY', 0.3), ('SCIENCE & NATURE', 0.3), ('U.S. GEOGRAPHY', 0.28), ('HISTORIC NAMES', 0.25), ('OPERA', 0.25), ('SCIENCE', 0.25), ('AMERICAN HISTORY', 0.25), ('WORLD GEOGRAPHY', 0.25), ('WORD ORIGINS', 0.25), ('WORLD CAPITALS', 0.25), ('ART', 0.23), ('SCIENTISTS', 0.2), ('MEDICINE', 0.2), ('MAGAZINES', 0.2), ('AUTHORS', 0.2), ('THE AMERICAN REVOLUTION', 0.2), ('WO

In [94]:
#Find a better way to eliminate non-informative words than just removing words that are less than 6 characters long
wordsanswer = jeopardy['clean_answer']
wordsques = jeopardy['clean_question']
anslst = []
for a in wordsanswer:
    split_answer = a.split(' ')
    for w in split_answer:
        anslst.append(w)
#print(anslst)        
#finding a list of all words per column
def wrdct(df):
    lst = []
    for a in df:
        split_w = a.split(' ')
        for w in split_w:
            lst.append(w)
    return lst   
answerlst = wrdct(wordsanswer)
questlst = wrdct(wordsques)


[('the', 2531), ('a', 1252), ('of', 504), ('', 314), ('john', 198), ('an', 156), ('and', 132), ('or', 126), ('new', 105), ('to', 96), ('george', 94), ('king', 80), ('william', 80), ('in', 80), ('james', 79), ('great', 75), ('paul', 73), ('i', 67), ('red', 66), ('white', 60)]
[('the', 16462), ('this', 11676), ('of', 10631), ('in', 9485), ('a', 9481), ('', 5307), ('to', 4791), ('for', 3353), ('is', 3273), ('was', 2715), ('on', 2361), ('its', 1945), ('from', 1851), ('with', 1658), ('his', 1638), ('as', 1566), ('by', 1465), ('it', 1440), ('these', 1389), ('he', 1377)]


In [None]:
#Getting frequencies per word per column
def countw(lst):
    wdic ={}
    for w in lst:
        if w not in wdic:
            wdic[w] = 1
        else:
            wdic[w] += 1
    wdic = sorted(wdic.items(), key = lambda x: -x[1])        
    return wdic       
answerword_count = countw(answerlst)
questword_count = countw(questlst)
print(answerword_count[0:20])
print(questword_count[0:20])

In [109]:
#getting the percentage per word
answer_ttl = len(answerlst)
quest_ttl = len(questlst)
print(answer_ttl,quest_ttl)
def percw(dictw):
    dict_perc = {}
    if dictw == answerword_count:
        ttl = answer_ttl
    else:
        ttl = quest_ttl
    for k,v in dictw:
        dict_perc[k] = round((v / ttl)*100,2)
    dict_perc = sorted(dict_perc.items(), key = lambda x: -x[1])
    return dict_perc
answerword_perc = percw(answerword_count)
print('Answer word percentage',answerword_perc[0:30])
questword_perc = percw(questword_count)
print('Question word percentage',questword_perc[0:30])

38183 292209
Answer word percentage [('the', 6.63), ('a', 3.28), ('of', 1.32), ('', 0.82), ('john', 0.52), ('an', 0.41), ('and', 0.35), ('or', 0.33), ('new', 0.27), ('george', 0.25), ('to', 0.25), ('james', 0.21), ('king', 0.21), ('william', 0.21), ('in', 0.21), ('great', 0.2), ('paul', 0.19), ('i', 0.18), ('red', 0.17), ('white', 0.16), ('st', 0.16), ('thomas', 0.16), ('man', 0.15), ('robert', 0.15), ('war', 0.15), ('washington', 0.14), ('on', 0.14), ('henry', 0.13), ('david', 0.13), ('little', 0.13)]
Question word percentage [('the', 5.63), ('this', 4.0), ('of', 3.64), ('in', 3.25), ('a', 3.24), ('', 1.82), ('to', 1.64), ('for', 1.15), ('is', 1.12), ('was', 0.93), ('on', 0.81), ('its', 0.67), ('from', 0.63), ('with', 0.57), ('his', 0.56), ('as', 0.54), ('by', 0.5), ('it', 0.49), ('these', 0.48), ('he', 0.47), ('that', 0.46), ('an', 0.41), ('at', 0.4), ('one', 0.39), ('name', 0.35), ('you', 0.35), ('or', 0.33), ('first', 0.32), ('are', 0.25), ('who', 0.23)]
