In [154]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from random import choice
from scipy.stats import chisquare

%matplotlib inline

In [13]:
jeopardy = pd.read_csv('jeopardy.csv')

jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [14]:
print(jeopardy.columns)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [15]:
newColumns = list()
for column in jeopardy.columns:
    column = column.lower().strip().replace(' ','_')
    newColumns.append(column)

jeopardy.columns = newColumns

In [16]:
import re

def text_cleaning(text):
    text = text.lower()
    text = re.sub('[^a-z0-9\s]', '', text)
    text = re.sub('\s+', ' ', text)
    return text

def value_cleaning(text):
    if text == 'None':
        return 0
    else:
        text = text.replace('$', '').replace(',', '')
        return int(text)

In [32]:
jeopardy['clean_question'] = jeopardy.question.apply(text_cleaning)
jeopardy['clean_answer'] = jeopardy.answer.apply(text_cleaning)
jeopardy['clean_value'] = jeopardy.value.apply(value_cleaning)

Checking if the ansewer is on the question

In [41]:
def words_match(row):
    split_answer = set(row['clean_answer'].lower().replace('the', '').split())
    split_question = set(row['clean_question'].lower().replace('the', '').split())
    intersection = list(split_answer.intersection(split_question))
    if len(split_answer) == 0:
        return 0 
    else:
        return(len(intersection)/len(split_answer))
    
jeopardy.apply(words_match, axis = 1).mean()

0.05832327691420646

In [67]:
def remove_short_words(text):
    text = text.split()
    new_text = list()
    for word in text:
        if len(word) >= 6:
            new_text.append(word)
    return new_text

In [91]:
list_question = jeopardy.clean_question.apply(remove_short_words)

In [92]:
list_question[list_question.apply(len) > 0].duplicated().mean()

0.008887072884212676

Only 0.8% of the questions are repeated 

Checking for repeated terms

In [131]:
question_overlap = list()
terms_used = dict()

for text in list_question:
    match_count = 0
    for word in text:
        if word in terms_used.keys():
            match_count += 1
            terms_used[word] += 1
        else:
            terms_used[word] = 1
    if len(text) > 0:
        match_count /= len(text)
    question_overlap.append(match_count)
            

In [132]:
sum(question_overlap)/(len(question_overlap))

0.6925960057338565

69% of the questions have text overlaping

In [133]:
pd.Series(terms_used).sort_values(ascending = False).head(20)

called              521
country             476
played              297
became              287
before              267
president           258
capital             257
american            257
famous              246
targetblankherea    244
french              243
island              216
people              184
national            183
largest             179
little              178
around              169
british             166
author              164
meaning             162
dtype: int64

We can see that have a lot of questions about american, french and brith, authors, countries, famous people, sport playes.

# Terms with high value

In [122]:
high_value = lambda value: 1 if value > 800 else 0

jeopardy['high_value'] = jeopardy.clean_value.agg(high_value)

In [127]:
def count_usage(word):
    low_count = 0
    high_count = 0
    for index, row in jeopardy.iterrows():
        split_question = row['clean_question'].split()
        if word in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [143]:
comparison_terms = list()
for i in range(10):
    comparison_terms.append(choice(list(terms_used.keys())))

In [147]:
observed_expected = list()
for term in comparison_terms:
    observed_expected.append(count_usage(term))

In [176]:
high_value_count = jeopardy.high_value.sum()
low_value_count = (jeopardy.high_value == False).sum()

In [None]:
chi_squared = list()

for values in observed_expected:
    total = sum(values)
    total_prop = total/jeopardy.shape[0]
    high_value_expected = total_prop*high_value_count
    low_value_expected = total_prop*low_value_count
    
    observed = np.array([values[0], values[1]])
    expected = np.array([high_value_expected, low_value_expected])
    chi_squared.append(chisquare(observed, expected))

14265