In [4]:
import pandas as pd
jeopardy = pd.read_csv('jeopardy.csv')
print(jeopardy.head())
print(jeopardy.columns)
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']    

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  
Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype

In [5]:
import re

def normalize_text(text):
    text = text.lower()
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    return text

def normalize_values(text):
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    try:
        text = int(text)
    except Exception:
        text = 0
    return text

In [6]:
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize_text)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize_text)
jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_values)

In [7]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

In [14]:
def find_answer_pattern(series):
    split_answer = series['clean_answer'].split(" ")
    split_question = series['clean_question'].split(" ")
    match_count = 0
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    for item in split_answer:
        if item in split_question:
            match_count += 1 
    return match_count/len(split_answer)

answer_in_question = jeopardy.apply(find_answer_pattern, axis=1)
mean_answer = answer_in_question.mean()
print(answer_in_question)

0        0.000000
1        0.000000
2        0.000000
3        0.000000
4        0.000000
5        0.000000
6        0.000000
7        0.000000
8        0.000000
9        0.333333
10       0.000000
11       0.000000
12       0.000000
13       0.000000
14       0.500000
15       0.000000
16       0.000000
17       0.000000
18       0.000000
19       0.000000
20       0.000000
21       0.000000
22       0.000000
23       0.000000
24       0.500000
25       0.000000
26       0.000000
27       0.000000
28       0.000000
29       0.000000
           ...   
19969    0.000000
19970    0.000000
19971    0.000000
19972    0.000000
19973    0.000000
19974    0.333333
19975    0.000000
19976    0.000000
19977    0.000000
19978    0.000000
19979    0.000000
19980    0.500000
19981    0.500000
19982    0.000000
19983    0.000000
19984    0.000000
19985    0.000000
19986    0.000000
19987    0.000000
19988    0.000000
19989    0.000000
19990    0.000000
19991    0.000000
19992    0.000000
19993    0

In [16]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0
    for q in split_question:
        if q in terms_used:
            match_count +=1
    for q in split_question:
        terms_used.add(q)
    if len(split_question) > 0:
        match_count = match_count/len(split_question)
    question_overlap.append(match_count)
jeopardy['question_overlap'] = question_overlap
print(jeopardy['question_overlap'].mean())


0.690873731567


In [20]:
def calc_value(row):
    if row['clean_value'] > 800:
        value = 1
    else:
        value =0
    return value

jeopardy['high_value'] = jeopardy.apply(calc_value, axis=1)
print(high_value)

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       1
23       0
24       1
25       1
26       1
27       1
28       1
29       0
        ..
19969    1
19970    1
19971    1
19972    1
19973    1
19974    1
19975    1
19976    1
19977    1
19978    1
19979    1
19980    1
19981    1
19982    1
19983    1
19984    1
19985    1
19986    1
19987    0
19988    0
19989    0
19990    0
19991    0
19992    0
19993    0
19994    0
19995    0
19996    0
19997    0
19998    0
dtype: int64


In [24]:
def calc_freq(word):
    low_count =0
    high_count =0
    for i,row in jeopardy.iterrows():
        split_question = row['clean_question'].split(" ")
        for w in split_question:
            if w == word:
                if row['high_value'] == 1:
                    high_count +=1
                else:
                    low_count +=1
    return high_count, low_count

observed_expected = []
comparison_terms = list(terms_used)[:5]
for t in comparison_terms:
    observed_expected.append(calc_freq(t))

print(observed_expected)
    

[(1, 2), (1, 0), (0, 1), (0, 1), (0, 1)]


In [25]:
from scipy.stats import chisquare
import numpy as np
high_value_count = jeopardy[jeopardy['high_value'] == 1]
low_value_count = jeopardy[jeopardy['high_value']==0]
chi_squared = []
for l in observed_expected:
    total = l[0]+ l[1]
    total_prop = total/jeopardy.shape[0]
    exp_h_c = total_prop*high_value_count
    exp_l_c = total_prop*low_value_count
    observed = np.array([l[0],l[1]])
    expected = np.array([exp_h_c, exp_l_c])
    chi_squared.append(chisquare(observed, expected))

print(chi_squared)

TypeError: Could not operate 0.00015000750037501875 with block values can't multiply sequence by non-int of type 'float'