In [1]:
import re
import string
import numpy as np
import scipy.stats as stats
import pandas as pd

In [2]:
df = pd.read_csv('jeopardy.csv')
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
df.shape

(19999, 7)

In [4]:
df.dtypes

Show Number     int64
 Air Date      object
 Round         object
 Category      object
 Value         object
 Question      object
 Answer        object
dtype: object

In [5]:
df.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [6]:
columns = []
regex = re.compile('^\ ')
for i in df.columns:
  columns.append(regex.sub('', i))
df.columns = columns
df.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [7]:
def str_lower_strip_punc(s):
  exclude = set(string.punctuation)
  s = ''.join(ch for ch in s if ch not in exclude)
  s = s.lower()
  return(s)

In [8]:
df['clean_question']= df['Question'].apply(lambda x: str_lower_strip_punc(x))
df['clean_question'].head()

0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
Name: clean_question, dtype: object

In [9]:
df['clean_answer'] = df['Answer'].apply(lambda x: str_lower_strip_punc(x))
df['clean_answer'].head()

0    copernicus
1    jim thorpe
2       arizona
3     mcdonalds
4    john adams
Name: clean_answer, dtype: object

In [10]:
def dollar_normalise(s):
  try:
    exclude = set(string.punctuation)
    s = ''.join(ch for ch in s if ch not in exclude)
    regex = re.compile('^\$')
    s = regex.sub('', s)
    return(int(s))
  except Exception:
    return 0

In [11]:
df['clean_value'] = df['Value'].apply(lambda x: dollar_normalise(x))
df['clean_value'].head()

0    200
1    200
2    200
3    200
4    200
Name: clean_value, dtype: int64

In [12]:
df['Air Date'] = pd.to_datetime(df['Air Date'])
df['Air Date'].head()

0   2004-12-31
1   2004-12-31
2   2004-12-31
3   2004-12-31
4   2004-12-31
Name: Air Date, dtype: datetime64[ns]

In [13]:
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200


In [14]:
df.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

In [15]:
def answer_in_question(row):
  split_question = row['clean_question'].split(' ')
  split_answer = row['clean_answer'].split(' ')
  the_count = split_answer.count('the')
  for _ in range(the_count):
    split_answer.remove('the')
  if len(split_answer) == 0:
    return 0
  match_count = 0
  for a in split_answer:
    for q in split_question:
      if a == q:
        match_count += 1
  return(match_count / len(split_answer))

In [16]:
df['answer_in_question'] = df.apply(lambda row: answer_in_question(row), axis=1)
df['answer_in_question'].head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: answer_in_question, dtype: float64

In [17]:
df['answer_in_question'].mean()

0.07238819400177446

In [18]:
df.sort_values('Air Date', ascending=True, inplace=True)
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,answer_in_question
19325,10,1984-09-21,Final Jeopardy!,U.S. PRESIDENTS,,"Adventurous 26th president, he was 1st to ride...",Theodore Roosevelt,adventurous 26th president he was 1st to ride ...,theodore roosevelt,0,0.0
19301,10,1984-09-21,Double Jeopardy!,LABOR UNIONS,$200,Notorious labor leader missing since '75,Jimmy Hoffa,notorious labor leader missing since 75,jimmy hoffa,200,0.0
19302,10,1984-09-21,Double Jeopardy!,1789,$200,"Washington proclaimed Nov. 26, 1789 this first...",Thanksgiving,washington proclaimed nov 26 1789 this first n...,thanksgiving,200,0.0
19303,10,1984-09-21,Double Jeopardy!,TOURIST TRAPS,$200,Both Ferde Grofe' & the Colorado River dug thi...,the Grand Canyon,both ferde grofe the colorado river dug this ...,the grand canyon,200,0.0
19304,10,1984-09-21,Double Jeopardy!,LITERATURE,$200,"Depending on the book, he could be a ""Jones"", ...",Tom,depending on the book he could be a jones a sa...,tom,200,0.0


In [19]:
question_overlap = []
terms_used = []
for i, row in df.iterrows():
  split_question = row['clean_question'].split(' ')
  split_question = [q for q in split_question if len(q) > 5]
  match_count = 0
  for q in split_question:
    if q in terms_used:
      match_count += 1
    else: terms_used.append(q)
  if len(split_question) > 0:
    question_overlap.append(match_count / len(split_question))
  else:
    question_overlap.append(0)
df['question_overlap'] = question_overlap
df['question_overlap'].head()

19325    0.0
19301    0.0
19302    0.0
19303    0.5
19304    0.0
Name: question_overlap, dtype: float64

In [20]:
df['question_overlap'].mean()

0.6889055316620302

In [21]:
len(terms_used)

24532

In [22]:
def value_800(row):
  value = 0
  if row['clean_value'] > 800:
    value = 1
  else:
    value = 0
  return(value)

In [23]:
df['high_value'] = df.apply(lambda row: value_800(row), axis=1)
df['high_value'].head()

19325    0
19301    0
19302    0
19303    0
19304    0
Name: high_value, dtype: int64

In [24]:
def question_repeat(str):
  low_count = 0
  high_count = 0
  for i, row in df.iterrows():
    if str in row['clean_question'].split(' '):
      if row['high_value']:
        high_count += 1
      else:
        low_count += 1
  return(high_count, low_count)

In [25]:
observed_expected = []
comparison_terms = terms_used[:10]
for term in comparison_terms:
  observed_expected.append(question_repeat(term))
observed_expected

[(0, 3),
 (68, 181),
 (0, 5),
 (2, 4),
 (1, 9),
 (34, 61),
 (5, 15),
 (22, 44),
 (2, 5),
 (55, 124)]

In [26]:
terms_used[:10]

['adventurous',
 'president',
 'automobile',
 'airplane',
 'notorious',
 'leader',
 'missing',
 'washington',
 'proclaimed',
 'national']

In [27]:
high_value_count = len(df[df['high_value'] == 1])
high_value_count

5734

In [28]:
low_value_count = len(df[df['high_value'] == 0])
low_value_count

14265

In [29]:
chi_squared = []
for e in observed_expected:
  total = sum(e)
  total_prob = total / df.shape[0]
  expected_high_count = total_prob * high_value_count
  expected_low_count = total_prob * low_value_count
  expected = np.array([expected_high_count, expected_low_count])
  observed = np.array([e[0], e[1]])
  print(observed, expected)
  chi_squared.append(stats.chisquare(e, expected))

[0 3] [ 0.86014301  2.13985699]
[ 68 181] [  71.39186959  177.60813041]
[0 5] [ 1.43357168  3.56642832]
[2 4] [ 1.72028601  4.27971399]
[1 9] [ 2.86714336  7.13285664]
[34 61] [ 27.23786189  67.76213811]
[ 5 15] [  5.73428671  14.26571329]
[22 44] [ 18.92314616  47.07685384]
[2 5] [ 2.00700035  4.99299965]
[ 55 124] [  51.32186609  127.67813391]


In [30]:
chi_squared

[Power_divergenceResult(statistic=1.2058885383806519, pvalue=0.27214791766902047),
 Power_divergenceResult(statistic=0.22592591114717697, pvalue=0.63456129826261032),
 Power_divergenceResult(statistic=2.00981423063442, pvalue=0.1562844540498966),
 Power_divergenceResult(statistic=0.063762334468807247, pvalue=0.80064530268787815),
 Power_divergenceResult(statistic=1.7046782653473278, pvalue=0.19167729675916911),
 Power_divergenceResult(statistic=2.3535940833298481, pvalue=0.1249945297234204),
 Power_divergenceResult(statistic=0.13182216542203845, pvalue=0.71655018652851865),
 Power_divergenceResult(statistic=0.7013856791568791, pvalue=0.40231847734946213),
 Power_divergenceResult(statistic=3.4231707828461518e-05, pvalue=0.99533177406483708),
 Power_divergenceResult(statistic=0.36956355622281933, pvalue=0.54324226353126892)]