# This is Jeopardy!
### Author: Carlos Paiva

### Project Description
This assignment was completed as a practice exercise while enrolled in the Codecademy Data Scientist Path (chapter: Data Manipulation with Pandas).

### Setup and Data Exploration

In [48]:
#Importing packages
import pandas as pd

#Showing full column contents: None means "no truncation" (the old -1 sentinel is deprecated
#and rejected by newer pandas releases)
pd.set_option('display.max_colwidth', None)

#Reading CSV file
jeopardy_dataset = pd.read_csv('jeopardy.csv')
print(jeopardy_dataset.head(10))
print(jeopardy_dataset.columns)

#Changing names of columns for easier coding (most raw headers carry a leading space)
jeopardy_dataset = jeopardy_dataset.rename(columns={'Show Number': 'show_number',' Air Date': 'air_date',
                                                    ' Round': 'round', ' Category': 'category', ' Value': 'value',
                                                    ' Question': 'question', ' Answer': 'answer'})
print(jeopardy_dataset.columns)

#Finding total length of full dataset
print('\nThe total length of the dataset is ' + str(len(jeopardy_dataset)) + ' rows.')

  pd.set_option('display.max_colwidth', -1)


   Show Number    Air Date      Round                         Category  Value  \
0  4680         2004-12-31  Jeopardy!  HISTORY                          $200    
1  4680         2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES  $200    
2  4680         2004-12-31  Jeopardy!  EVERYBODY TALKS ABOUT IT...      $200    
3  4680         2004-12-31  Jeopardy!  THE COMPANY LINE                 $200    
4  4680         2004-12-31  Jeopardy!  EPITAPHS & TRIBUTES              $200    
5  4680         2004-12-31  Jeopardy!  3-LETTER WORDS                   $200    
6  4680         2004-12-31  Jeopardy!  HISTORY                          $400    
7  4680         2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES  $400    
8  4680         2004-12-31  Jeopardy!  EVERYBODY TALKS ABOUT IT...      $400    
9  4680         2004-12-31  Jeopardy!  THE COMPANY LINE                 $400    

                                                                                                        Ques

### 1. Write a function that filters the dataset for questions that contain all of the words in a list of words

In [37]:
#Defining the word filter function
def word_filter(dataset, word_list):
    """Return the rows of `dataset` whose question contains every word in `word_list`.

    Matching is case-insensitive and substring-based, so 'King' also matches
    'Viking' or 'kingdom'.

    Parameters:
        dataset: DataFrame with a "question" column.
        word_list: iterable of strings that must all appear in the question.

    Returns:
        The matching rows of `dataset`, with their original index labels.
    """
    # Lower-case the search words once instead of once per row.
    lowered_words = [word.lower() for word in word_list]

    def contains_all_words(question):
        # Missing / non-string questions are treated as empty text rather than
        # raising AttributeError on .lower().
        text = question.lower() if isinstance(question, str) else ""
        return all(word in text for word in lowered_words)

    # astype(bool) guarantees a boolean mask even when the dataset is empty.
    mask = dataset["question"].apply(contains_all_words).astype(bool)
    return dataset.loc[mask]

#Trying the filter out: questions mentioning both "King" and "England"
word_filter_test = word_filter(jeopardy_dataset, ['King', 'England'])
print(word_filter_test['question'])

4953      Both England's King George V & FDR put their stamp of approval on this "King of Hobbies"              
6337      In retaliation for Viking raids, this "Unready" king of England attacks Norse areas of the Isle of Man
9191      This king of England beat the odds to trounce the French in the 1415 Battle of Agincourt              
11710     This Scotsman, the first Stuart king of England, was called "The Wisest Fool in Christendom"          
13454     It's the number that followed the last king of England named William                                  
                                          ...                                                                   
208295    In 1066 this great-great grandson of Rollo made what some call the last Viking invasion of England    
208742    Dutch-born king who ruled England jointly with Mary II & is a tasty New Zealand fish                  
213870    In 1781 William Herschel discovered Uranus & initially named it after this king of Eng

### 2. Convert the " Value" column to floats in a new column. Then, calculate the average value of questions that contain the word "King"

In [38]:
#Adding new column to the Jeopardy dataset with the values from 'value' as floats
def _dollars_to_float(raw_value):
    """Convert a dollar string such as '$1,200' to 1200.0.

    Anything that is not a string starting with '$' (other text, an empty
    string, a missing value) maps to 0.0, so the resulting column is always float.
    """
    if isinstance(raw_value, str) and raw_value.startswith('$'):
        return float(raw_value[1:].replace(',', ''))
    return 0.0

jeopardy_dataset['float_values'] = jeopardy_dataset['value'].apply(_dollars_to_float)

#Using the word filter function for obtaining a sub-dataframe of questions that contain the word "King"
questions_king_dataset = word_filter(jeopardy_dataset, ['King'])
print('The total length of the new dataset is ' + str(len(questions_king_dataset)) + ' rows.\n')

#Calculating the average value for the new dataframe
#NOTE: rows whose value had no '$' amount count as 0 in this mean
average_value_king = questions_king_dataset['float_values'].mean()
print(average_value_king)

The total length of the new dataset is 7409 rows.

771.8833850722094


### 3. Write a function that returns the count of the unique answers to all of the questions in a dataset

In [39]:
#Writing function for calculating number of unique answers to question:
def count_unique_ans(dataset):
    """Return how many times each distinct answer occurs in `dataset`.

    The result is the Series produced by `value_counts()` on the "answer" column.
    """
    return dataset["answer"].value_counts()

#Applying the counter to the sub-dataframe of questions that contain the word "King"
count_king_answer = count_unique_ans(dataset=questions_king_dataset)
print(count_king_answer)

Henry VIII     55
Solomon        35
Richard III    33
Louis XIV      31
David          30
               ..
Dan Aykroyd    1 
Dorothy        1 
the boom       1 
a sandwich     1 
"Hound Dog"    1 
Name: answer, Length: 5268, dtype: int64


### 4. Investigate the ways in which questions change over time by filtering by the date. E.g.: How many questions from the 90s use the word "Computer" compared to questions from the 2000s?

In [54]:
#Collecting every question that mentions the word "Computer" through the word filter function
questions_computer_dataset = word_filter(jeopardy_dataset, ['Computer'])
computer_row_count = len(questions_computer_dataset)
print('The total length of the dataset containing the word "Computer" in the questions field is ' + str(computer_row_count) + ' rows.\n')

#Defining the 'Air Date' filter function
def date_filter(dataset, decade):
    """Return the rows of `dataset` that aired in the given decade.

    Parameters:
        dataset: DataFrame with a string "air_date" column that starts with
            the four-digit year (e.g. '2004-12-31').
        decade: the decade as a string such as '1990s' or '1990', or as an
            integer such as 1990. Only its first three characters are used.

    Returns:
        The rows whose air date starts with the same three characters.
    """
    # str() lets callers pass 1990 as well as '1990s'.
    decade_prefix = str(decade)[:3]
    in_decade = dataset['air_date'].str[:3] == decade_prefix
    return dataset[in_decade]

#Using the date filter function for obtaining, per decade, the sub-dataframe of "Computer" questions
#(one loop instead of four copy-pasted statement pairs)
computer_by_decade = {
    decade: date_filter(questions_computer_dataset, decade)
    for decade in ('1980s', '1990s', '2000s', '2010s')
}
for decade, decade_dataset in computer_by_decade.items():
    print('The total length of the dataset containing the word "Computer" in the questions field and belonging to the \n' + decade + ' is ' + str(len(decade_dataset)) + ' rows.\n')

#Keeping the individual per-decade names available for later cells
computer_1980_dataset = computer_by_decade['1980s']
computer_1990_dataset = computer_by_decade['1990s']
computer_2000_dataset = computer_by_decade['2000s']
computer_2010_dataset = computer_by_decade['2010s']

The total length of the dataset containing the word "Computer" in the questions field is 431 rows.

The total length of the dataset containing the word "Computer" in the questions field and belonging to the 
1980s is 6 rows.

The total length of the dataset containing the word "Computer" in the questions field and belonging to the 
1990s is 98 rows.

The total length of the dataset containing the word "Computer" in the questions field and belonging to the 
2000s is 268 rows.

The total length of the dataset containing the word "Computer" in the questions field and belonging to the 
2010s is 59 rows.



### 5. Find out if there is a connection between the round and the category. Are you more likely to find certain categories, like "Literature" in Single Jeopardy or Double Jeopardy?

In [68]:
#Counting questions per (category, round) pair and reshaping the counts into a category x round table
print(len(jeopardy_dataset['category'].unique()))
round_category = (
    jeopardy_dataset
    .groupby(['category', 'round'])['question']
    .count()
    .reset_index()
)
print(round_category)
round_category_pivot = round_category.pivot(index='category', columns='round', values='question')
print(round_category_pivot)

#Checking for the "Literature" category
literature = round_category[round_category.category == 'LITERATURE']
print(literature)

#Reading the per-round counts from the data rather than hard-coding them, so the
#answer below always agrees with the table printed above (a missing round counts as 0)
literature_counts = literature.set_index('round')['question']
literature_single = int(literature_counts.get('Jeopardy!', 0))
literature_double = int(literature_counts.get('Double Jeopardy!', 0))
literature_final = int(literature_counts.get('Final Jeopardy!', 0))
print('\nAnswer: There are ' + str(literature_single) + ' questions of the LITERATURE category in the Jeopardy! round, '
      + str(literature_double) + ' in the Double \nJeopardy! round, and '
      + str(literature_final) + ' in the Final Jeopardy! round.')

27995
                          category             round  question
0       A JIM CARREY FILM FESTIVAL  Jeopardy!         5       
1      "!"                          Jeopardy!         5       
2      "-ARES"                      Double Jeopardy!  5       
3      "-ICIAN" EXPEDITION          Jeopardy!         5       
4      "...OD" WORDS                Double Jeopardy!  5       
...              ...                             ... ..       
31681  “R” MOVIES                   Double Jeopardy!  5       
31682  “SAINTS”                     Double Jeopardy!  4       
31683  “SOUTH”                      Double Jeopardy!  5       
31684  “STREETS”                    Jeopardy!         5       
31685  “WH”AT IS IT?                Double Jeopardy!  5       

[31686 rows x 3 columns]
round                        Double Jeopardy!  Final Jeopardy!  Jeopardy!  \
category                                                                    
 A JIM CARREY FILM FESTIVAL NaN               NaN         