This analysis will explore the jeopardy dataset.

# Load files

In [1]:
import pandas as pd 

jeopardy = pd.read_csv("data/jeopardy.csv")

jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


# Clean columns

In [2]:
# Cleans column spacing 

new_col_names = [] 
for i in jeopardy.columns.values: 
    new_col_names.append(i.strip())


In [3]:
jeopardy.columns = new_col_names

# Clean Q&A values 

In [4]:
# Normalizes question & answer text 
import re 

def clean_answer(text): 
    new_text = text.lower()
    new_text = re.sub(r'[^\w\s]','',new_text)
    return new_text 

jeopardy['clean_question'] = jeopardy['Question'].apply(clean_answer)

In [5]:
jeopardy['clean_answer'] = jeopardy['Answer'].apply(clean_answer)

# Clean Value and Air Date values

In [6]:
def clean_dollars(text): 
    new_text = re.sub('\$', '', text)
    try: 
        new_text = int(new_text)
    except ValueError: 
        new_text = 0 
    return new_text

jeopardy['clean_value'] = jeopardy['Value'].apply(clean_dollars)

In [7]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

# Analyze words in Q&A

In [8]:
# calculates how many words from the answer are in the question 

def compare_q_a(row): 
    split_answer = row['clean_answer'].split(" ")
    split_question = row['clean_question'].split(" ")
    
    match_count = 0 
    if "the" in split_answer: 
        split_answer.remove("the")
    if len(split_answer) == 0: 
        return 0
    
    for word in split_answer: 
        if word in split_question:
            match_count += 1 
    return (match_count / len(split_answer))

#applies the function to all rows 
jeopardy['answer_in_question'] = jeopardy.apply(compare_q_a, axis=1)

jeopardy['answer_in_question'].mean()
    
    

0.060493257069335914

With a mean of .06 for answer_in_question, it would not be wise for a person to assume that the answer is in the question.

# Investigate if new questions repeats of older ones

In [9]:
# create a term used 

jeopardy.sort_values('Air Date', ascending=False)

question_overlap = [] 
terms_used = set()

for indx, row in jeopardy.iterrows():
    split_question = row['clean_question'].split()
    split_questions = [word for word in split_question if len(word) > 6]
    match_count = 0
    
    for word in split_question: 
        if word in terms_used: 
            match_count += 1 
        terms_used.add(word)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
        question_overlap.append(match_count)


        

In [10]:
jeopardy['question_overlap'] = question_overlap 

jeopardy['question_overlap'].mean()

0.8684200779201654

There are lots of times when questions are being recycled, which might mean that studying previous questions would be a great way to study.

# Investigate high/low value terms using chi_squared

In [11]:
# determines which questions are high and low 

def check_high_low(row): 
    value = None 
    if row['clean_value'] > 800: 
        value = 1
        return value 
    else: 
        value = 0 
        return value 
        
jeopardy['high_value'] = jeopardy.apply(check_high_low, axis=1)

In [None]:
# assign words to low and high values 

def high_low_word(word): 
    low_count = 0 
    high_count = 0
    for indx, row in jeopardy.iterrows():
        split_question = row['clean_question'].split(' ')
        if word in split_question: 
            if row['high_value'] == 1: 
                high_count += 1 
            else: 
                low_count += 1 
    return high_count, low_count

observed_expected = []

In [None]:
comparison_terms = list(terms_used)[0:5]

for word in comparison_terms: 
    observed_expected.append(high_low_word(word))

In [None]:
observed_expected

In [None]:
from scipy.stats import chisquare
import numpy as np
    
high_value_count = sum(jeopardy.high_value == 1) 
low_value_count = sum(jeopardy.high_value == 0)

chi_squared = []

for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))


In [None]:
chi_squared

# Chi-squared results

None of the terms had a significant difference in usage between high value and low value rows. Additionally, the frequencies were all lower than 5, so the chi-squared test isn't as valid. It would be better to run this test with only terms that have higher frequencies