## Exploratory Data Analysis of Jeopardy

In [95]:
#import libraries and set displays

import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 200)



In [109]:
#import Jeopardy data
jeopardy_data = pd.read_csv("jeopardy.csv")

display(jeopardy_data.head())

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,"No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves",Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,"The city of Yuma in this state has a record average of 4,055 hours of sunshine each year",Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", this company served its billionth burger",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Constitution of Mass., second President of the United States",John Adams


### Clean-up the column names

In [97]:
list(jeopardy_data.columns)

['Show Number',
 ' Air Date',
 ' Round',
 ' Category',
 ' Value',
 ' Question',
 ' Answer']

In [110]:
jeopardy_data.columns = jeopardy_data.columns.str.strip()
jeopardy_data.columns = jeopardy_data.columns.str.replace(" ","_")
list(jeopardy_data.columns)

['Show_Number', 'Air_Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

## Define a function to filter for Jeopardy questions containing specific words

In [99]:
#function searches a provided column in a dataset for a list of words - it must contain all the words in the list
#function is not sensitive to capitalization and the word must not be a substring of another word

def filter_questions(mylist, mycolumn, mydf):
    return mydf[mycolumn.apply(lambda x: all(" "+ element.lower()+" " in x.lower() for element in mylist))]
    

In [100]:
# test the function with a list of words - "King" and "England"
filter_questions(["King","England"], jeopardy_data.Question, jeopardy_data)

Unnamed: 0,Show_Number,Air_Date,Round,Category,Value,Question,Answer
6337,3517,1999-12-14,Double Jeopardy!,Y1K,$800,"In retaliation for Viking raids, this ""Unready"" king of England attacks Norse areas of the Isle of Man",Ethelred
9191,3907,2001-09-04,Double Jeopardy!,WON THE BATTLE,$800,This king of England beat the odds to trounce the French in the 1415 Battle of Agincourt,Henry V
13454,4726,2005-03-07,Jeopardy!,A NUMBER FROM 1 TO 10,$1000,It's the number that followed the last king of England named William,4
18076,3227,1998-09-22,Double Jeopardy!,WORLD HISTORY,$1000,In 1199 this crusader king of England was mortally wounded while besieging the castle of Chalus,Richard the Lionhearted
19168,3109,1998-02-19,Jeopardy!,HISTORIC WORLD LEADERS,$300,"He was the only king of England to have ""The Great"" tacked on to his name",Alfred
21511,4650,2004-11-19,Jeopardy!,"THE ""O.C.""",$1000,this man and his son ruled England following the execution of King Charles I,Oliver Cromwell
23810,4862,2005-11-01,Jeopardy!,NAME THE YEAR,$400,William the Conqueror was crowned King of England in Westminster Abbey on Christmas Day in this year,1066
23979,4664,2004-12-09,Double Jeopardy!,MEDIEVAL TIMES,$2000,"This ""unready"" king of England lost most of his country to Sven Forkbeard, the king of Denmark",Aethelred the Unready
26780,2118,1993-11-17,Double Jeopardy!,THE MIDDLE AGES,"$1,200",This king of England was killed by a Norman arrow at the Battle of Hastings,Harold II
33174,1333,1990-05-23,Jeopardy!,THE CRUSADES,$200,This king of England was a leader of the Third Crusade,Richard I (Richard the Lionhearted)


## What is the value of questions that contain specific words?

In [112]:
# Update the value column to be a float
# First, remove non-numeric characters from the data set
# Set values with no numbers to null

jeopardy_data["Value"] = jeopardy_data["Value"].apply(lambda x: x.replace("$","").replace(",","").replace("None",""))
jeopardy_data.Value = pd.to_numeric(jeopardy_data.Value, errors = 'coerce')

In [126]:
#jeopardy_data[jeopardy_data["Value"] == ""].Value.appy(lamda x: )


In [130]:
jeopardy_data[jeopardy_data.Value.isna()]

Unnamed: 0,Show_Number,Air_Date,Round,Category,Value,Question,Answer
55,4680,2004-12-31,Final Jeopardy!,THE SOLAR SYSTEM,,Objects that pass closer to the sun than Mercury have been named for this mythological figure,Icarus
116,5957,2010-07-06,Final Jeopardy!,HISTORIC WOMEN,,"She was born in Virginia around 1596 & died in Kent, England in 1617",Pocahontas
174,3751,2000-12-18,Final Jeopardy!,SPORTS LEGENDS,,"If Joe DiMaggio's hitting streak had gone one more game in 1941, this company would have given him a $10,000 contract",H.J. Heinz (Heinz 57 Varieties)
235,3673,2000-07-19,Final Jeopardy!,THE MAP OF EUROPE,,"Bordering Italy, Austria, Hungary & Croatia, it's one of the world's newest independent countries",Slovenia
296,4931,2006-02-06,Final Jeopardy!,FAMOUS SHIPS,,"On December 27, 1831 it departed Plymouth, England to map the coastline of South America",the HMS Beagle
...,...,...,...,...,...,...,...
216686,3940,2001-10-19,Final Jeopardy!,MAJOR LEAGUE BASEBALL TEAM NAMES,,"This team received its name after an 1890 incident in which it ""stole"" away an important player from another team",Pittsburgh Pirates
216746,6044,2010-12-16,Final Jeopardy!,SKYSCRAPERS,,"After a construction boom fueled by oil & gas money, this capital city now has Europe's tallest building",Moscow
216807,5070,2006-09-29,Final Jeopardy!,NATIONAL CAPITALS,,"This city's website calls it ""the last divided capital in Europe""",Nicosia
216868,5195,2007-03-23,Final Jeopardy!,BESTSELLING AUTHORS,,"He had the year's bestselling novel a record 7 years in a row with 7 different titles, ending in 2000",John Grisham


In [137]:
# What is the average value of questions that contain the word "king"?

avg_king_question = filter_questions(["King"], jeopardy_data.Question, jeopardy_data).Value.mean()
print("The average value of questions containing the word \"king\" is:", round(avg_king_question, ndigits = 0))

The average value of questions containing the word "king" is: 821.0


## What is the number of unique answers to all of the questions in our dataset?

In [240]:
def unique_answers(mydf, mycolumn, mycolumn_counter):
    return mydf.groupby(by = mydf[mycolumn], as_index = True)[mycolumn_counter].count().reset_index().sort_values(mycolumn_counter, ascending = False)

In [242]:
unique_answers(filter_questions(["King","England"], jeopardy_data.Question, jeopardy_data), "Answer","Question")

Unnamed: 0,Answer,Question
42,William the Conqueror,3
35,Richard the Lionhearted,3
20,Henry VIII,2
34,Richard the Lionheart,2
24,King Edward VIII,2
3,Alfred,2
39,William II,1
30,Richard Branson,1
41,William of Orange roughy,1
25,King Hussein,1


## Do we see more questions with the word "computer" over time?

In [244]:
#Update the Air_Date column to a date-time data type

jeopardy_data["Air_Date"] = pd.to_datetime(jeopardy_data["Air_Date"])

In [250]:
#Add a year column to investigate trends in questions over the years
jeopardy_data["Air_Year"] = pd.DatetimeIndex(jeopardy_data["Air_Date"]).year

In [262]:
# Determine the total number of questions asked per year

total_questions_by_year = jeopardy_data.groupby(["Air_Year"],as_index = False)["Question"].count()
total_questions_by_year.rename(columns = {"Question": "Total_number_of_questions"}, inplace = True)

In [266]:
# Search for questions with the word "computer"

computer_subset_df = filter_questions(["computer"], jeopardy_data.Question, jeopardy_data)
questions_with_comp_by_year = computer_subset_df.groupby(["Air_Year"],as_index = False)["Question"].count()
questions_with_comp_by_year.rename(columns = {"Question": "Total_number_of_questions_with_computer"}, inplace = True)

total_questions_by_year = total_questions_by_year.merge(questions_with_comp_by_year, how = "inner", on = "Air_Year", suffixes = ("_total","_withcomputer"))
total_questions_by_year["perc_questions_with_computers"] = round(total_questions_by_year["Total_number_of_questions_with_computer"]/total_questions_by_year["Total_number_of_questions"]*100,2)

In [281]:
total_questions_by_year

Unnamed: 0,Air_Year,Total_number_of_questions,Total_number_of_questions_with_computer,perc_questions_with_computers
0,1984,1179,1,0.08
1,1986,1409,1,0.07
2,1987,1275,1,0.08
3,1989,2067,1,0.05
4,1990,4337,2,0.05
5,1991,1444,1,0.07
6,1993,2132,1,0.05
7,1995,1138,2,0.18
8,1996,4891,3,0.06
9,1997,13099,14,0.11


## Create an at-home Jeopardy Game with real questions!

In [342]:
## Define function to get a random question

def lets_play(df):
    my_question = jeopardy_data.sample(n=1)
    my_answer = input("The question:" + str(my_question["Question"].to_list()[0]) + " \nMy Answer:")
    if my_answer.lower() ==  str(my_question["Answer"].to_list()[0]).lower():
        return print("You got it!")
    else:
        return print("Sorry, the correct answer is:" + str(my_question["Answer"].to_list()[0]))
    

In [343]:
lets_play(jeopardy_data)

The question:In 1879 the discovery of this artificial sweetener was announced 
My Answer: saccharin


You got it!
