In [1]:
import pandas as pd
jeopardy = pd.read_csv("jeopardy.csv")
print(jeopardy.head())

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  


In [2]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [3]:
jeopardy.columns = ['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer']

In [4]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [5]:
# Normalize the question and answer columns
import re
def normalize_text(text):
    text = text.lower()
    #use regex to remove all punctuation
    text_no_punc = re.sub("[^A-Za-z0-9]"," ",text)
    return text_no_punc

In [6]:
jeopardy["clean_question"] = jeopardy[" Question"].apply(normalize_text)
jeopardy["clean_answer"] = jeopardy[" Answer"].apply(normalize_text)
print(jeopardy["clean_answer"].head())
print(jeopardy["clean_question"].head())
jeopardy.head()

0    copernicus
1    jim thorpe
2       arizona
3    mcdonald s
4    john adams
Name: clean_answer, dtype: object
0    for the last 8 years of his life  galileo was ...
1    no  2  1912 olympian  football star at carlisl...
2    the city of yuma in this state has a record av...
3    in 1963  live on  the art linkletter show   th...
4    signer of the dec  of indep   framer of the co...
Name: clean_question, dtype: object


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was ...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisl...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show th...,mcdonald s
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the co...,john adams


In [7]:
print(jeopardy[" Value"].head())

0    $200
1    $200
2    $200
3    $200
4    $200
Name:  Value, dtype: object


In [8]:
#Remove $ from value column and convert all the vaues to int
import re
def normalize_value(text):
    clean_text = re.sub("[^A-Za-z0-9]"," ",text)
    try:
        int_text = int(clean_text)
    except Exception:
        int_text = 0
    return int_text


In [9]:

jeopardy["clean_value"]= jeopardy[" Value"].apply(normalize_value)
print(jeopardy["clean_value"].head())


0    200
1    200
2    200
3    200
4    200
Name: clean_value, dtype: int64


In [10]:
# print(jeopardy[" Air Date"].head())
#Convert the Air Date column to a datetime column using pandas.to_datetime
import pandas as pd
import numpy as np
a = pd.to_datetime(jeopardy[" Air Date"])
print(a[0])
print(a[0]+ np.timedelta64(12, 'h'))

2004-12-31 00:00:00
2004-12-31 12:00:00


In [11]:
#jeopardy.ix[2]
print(jeopardy["clean_answer"])

0                                               copernicus
1                                               jim thorpe
2                                                  arizona
3                                               mcdonald s
4                                               john adams
5                                                  the ant
6                                           the appian way
7                                           michael jordan
8                                               washington
9                                           crate   barrel
10                                          jackie gleason
11                                                 the cud
12                                   ceylon  or sri lanka 
13                                               jim brown
14                                            the uv index
15                                                  bulova
16                                             jesse jam

In [12]:
#function which looks for the same words repeated in questions
#and answers
def count_matches(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer)==0:
        return 0
    match_count = 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return(match_count/len(split_answer))



In [13]:
jeopardy["answer_in_question"] = jeopardy.apply(count_matches,axis = 1)
print(jeopardy["answer_in_question"].mean())
#print(jeopardy["answer_in_question"])


0.0961144775685


Terms in answers appear only 9.6% times in the question.So, it cant be deduced that by merely looking at the question,answer can be guessed.

In [14]:
jeopardy.sort(" Air Date",ascending = True)
question_overlap = []
terms_used = set()
for idx,row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [x for x in row["clean_question"] if len(x) > 5]
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
        for word in split_question:
            terms_used.add(word)
        if len(split_question) > 0:
            match_count/ len(split_question)
            
    print(type(row))
    if idx > 5:
        break
        
print(terms_used)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
set()


  if __name__ == '__main__':


In [15]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        split_question = [q for q in split_question if len(q) > 5]
        match_count = 0
        for word in split_question:
            if word in terms_used:
                match_count += 1
        for word in split_question:
            terms_used.add(word)
        if len(split_question) > 0:
            match_count /= len(split_question)
        question_overlap.append(match_count)
jeopardy["question_overlap"] = question_overlap

jeopardy["question_overlap"].mean()
                

                

0.72735994608824506

There is a 70% overlap between the terms in the new questions and old questions. It means it is worth looking into recycling of questions.

In [16]:
print(jeopardy[" Value"].head())

0    $200
1    $200
2    $200
3    $200
4    $200
Name:  Value, dtype: object


In [17]:
print(jeopardy["clean_value"].head(20))

0     200
1     200
2     200
3     200
4     200
5     200
6     400
7     400
8     400
9     400
10    400
11    400
12    600
13    600
14    600
15    600
16    600
17    600
18    800
19    800
Name: clean_value, dtype: int64


In [24]:
#The function categorizes the values as high_value if value is higher
#than $800 and low_value if it's below $800
def determine_value(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    else:
        value = 0
    return value
jeopardy["high_value"] = jeopardy.apply(determine_value,axis = 1)
print(jeopardy.columns)
print(jeopardy.head())
print(jeopardy["high_value"].head(40))

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer', 'clean_question', 'clean_answer', 'clean_value',
       'answer_in_question', 'question_overlap', 'high_value'],
      dtype='object')
   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  \
0  For the last 8 years of his life, Galileo was ...  Copernicus   
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe   
2  The city of Yuma in this state has a record av...     Arizona   
3  In 1

In [28]:
def count_usage(word):
    low_count = 0
    high_count = 0
    for i,row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        if word in split_question:
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return(high_count,low_count)

observed_expected = []
terms_used = list(terms_used)
comparison_terms = terms_used[:5]
#print(comparison_terms)
for x in comparison_terms:
    observed_expected.append(count_usage(x))
    
print(observed_expected)
        
    

[(0, 1), (0, 1), (0, 1), (1, 0), (0, 1)]


In [26]:
#find the number of low value questions the word occurs in
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

#Number of high value questions the word occured in
high_value_count = jeopardy[jeopardy["high_value"]== 1].shape[0]
print(jeopardy.shape[0])




19999


In [32]:
from scipy.stats import chisquare
chi_squared = []
for word_occurences in observed_expected:
    total_occurences = sum(word_occurences)
#percentage of questions the word occurs in     
    total_proportion = total_occurences/jeopardy.shape[0]
# expected term count for low value rows
    low_value_expected = total_proportion * low_value_count
# expected term count for high value rows 
    high_value_expected = total_proportion * high_value_count
    observed = np.array([word_occurences[0],word_occurences[1]])
    expected = np.array([high_value_expected,low_value_expected])
    chi_squared.append(chisquare(observed,expected))
    
chi_squared
    
    


[Power_divergenceResult(statistic=0.33087109868902648, pvalue=0.56514660326737798),
 Power_divergenceResult(statistic=0.33087109868902648, pvalue=0.56514660326737798),
 Power_divergenceResult(statistic=0.33087109868902648, pvalue=0.56514660326737798),
 Power_divergenceResult(statistic=3.022325020112631, pvalue=0.08212564786568953),
 Power_divergenceResult(statistic=0.33087109868902648, pvalue=0.56514660326737798)]

Since p-value > .05 in this case,the results are not statistically significant.
The alternative hyposthesis is not accepted which means that the observed and expected values are from the same population