# Loading 'Winning Jeopardy' Dataset into jupyter

In [41]:
import pandas as pd
jeopardy = pd.read_csv("jeopardy.csv")
print(jeopardy.columns)
print("n\n\n")
print(jeopardy.head(5))

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')
n


   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...

## Fixing jeopardy  column names

In [42]:
for col in jeopardy.columns:
    if col[0] == " ":
        jeopardy = jeopardy.rename(columns = {col:col[1:]})
        
print(jeopardy.columns)

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


## Normalizing Questions and Answers (All to lowercase, Remove punctuation)

In [43]:
import re

def normalize(text):
    text = text.lower()
    text = re.sub("[^A-Za-z0-9\s]","",text)
    return text
    
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize)
print(jeopardy["clean_question"].head(5))

jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize)
print(jeopardy["clean_answer"].head(5))
    

0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
Name: clean_question, dtype: object
0    copernicus
1    jim thorpe
2       arizona
3     mcdonalds
4    john adams
Name: clean_answer, dtype: object


## Similarly Normalizing values

In [44]:

def norm_val(text):        
    text = re.sub("[^A-Za-z0-9\s]","",text)
    try:
        text = int(text)
    except Exception:
        text = 0
    return text
        
jeopardy["clean_value"] = jeopardy["Value"].apply(norm_val)
print(jeopardy["clean_value"].head(5))
    

0    200
1    200
2    200
3    200
4    200
Name: clean_value, dtype: int64


## Converting to python Datetime

In [45]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])
print(jeopardy["Air Date"].head(5))


0   2004-12-31
1   2004-12-31
2   2004-12-31
3   2004-12-31
4   2004-12-31
Name: Air Date, dtype: datetime64[ns]


## Analyzing Questions and Answers

In [46]:

def count_matches(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(count_matches, axis=1)
print(jeopardy["answer_in_question"].head(5))

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: answer_in_question, dtype: float64


In [47]:
jeopardy["answer_in_question"].mean()

0.060493257069335872

## Analyzing terms used in Questions

In [48]:
question_overlap = []
terms_used = set()
for index,row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    for i in split_question:
        if len(i) < 6:
            split_question.remove(i)
    match_count = 0
    for j in split_question:
        if j in terms_used:
            match_count += 1
    for j in split_question:
        terms_used.add(j)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)
    
jeopardy["question_overlap"] = question_overlap

jeopardy["question_overlap"].mean()
            




0.80278906589148624

## Classifying High Value Questions

In [49]:
def valuation(row):
    if row["clean_value"] > 800:
        return 1
    else:
        return 0
jeopardy["high_value"] = jeopardy.apply(valuation,axis=1)
print(jeopardy["high_value"].value_counts())

0    14265
1     5734
Name: high_value, dtype: int64


In [56]:
observed_expected = []
def counter(word):
    low_count = 0
    high_count = 0
    for i,row in jeopardy.iterrows():
        words = row["clean_question"].split(" ")
        if word in words:
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count,low_count

comparison_terms = list(terms_used)[1:6]
for val in comparison_terms:
    observed_expected.append(counter(val))
    
print(observed_expected)
print(comparison_terms[0:])
        
        

[(0, 1), (0, 1), (1, 1), (0, 1), (0, 1)]
['profiles', 'choirboy', 'diabetics', 'seltzer', 'kaskaskia']


## Chi-Squared Test

In [63]:
from scipy.stats import chisquare
import numpy as np

high_value_count = 0
low_value_count = 0
for i,val in jeopardy.iterrows():
    if val["high_value"] == 1:
        high_value_count += 1
    if val["high_value"] == 0:
        low_value_count += 1

chi_squared = []
total = 0
for i in observed_expected:
    total += i[0] + i[1]
    total_prop = total / jeopardy.shape[0]
    exp_high = total_prop * high_value_count
    exp_low = total_prop * low_value_count
    
    observed = np.array([i[0],i[1]])
    expected = np.array([exp_high,exp_low])
    chi_squared.append(chisquare(observed, expected))
    
print(chi_squared)



[Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686), Power_divergenceResult(statistic=0.70098142306344202, pvalue=0.40245411826570177), Power_divergenceResult(statistic=1.2224387408306399, pvalue=0.26888257861514109), Power_divergenceResult(statistic=3.280392569225377, pvalue=0.070112156757071512), Power_divergenceResult(statistic=4.2336604743544815, pvalue=0.039629884067186996)]
