In [311]:
import pandas as pd
import datetime
import random

In [312]:
pd.set_option('display.max_colwidth', None)

    We'll begin by importing the csv file and printing the header

In [313]:
# Load the question set; the path is relative to the notebook's working directory
df = pd.read_csv('jeopardy.csv')

In [314]:
print(df.head())

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                                                                                      Question  \
0             For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory   
1  No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves   
2                     The city of Yuma in this state has a record average of 4,055 hours of sunshine each year   
3                         In 1963, live on "The Art Linkl

In [315]:
print(df.columns)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


### Cleaning the Columns

        Printing the columns, it becomes evident that some have unnecessary
        leading whitespace

In [316]:
# Trim the padding around each header, then join multi-word names with underscores
df.columns = df.columns.str.strip().str.replace(' ', "_")

In [317]:
df.columns

Index(['Show_Number', 'Air_Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

        This removed the white spaces from our column titles 

### Filtering Our Data based on Question Content

        We'll begin by creating a function that'll enable us to filter questions 
        containing a pre-defined list of words

In [318]:
def filter_data(dataset, filter_words):
    """Return the rows of ``dataset`` whose 'Question' text contains every entry of ``filter_words``.

    Matching is case-insensitive and substring-based, so 'King' also matches
    'viking' or 'kingdom'. An empty ``filter_words`` keeps every row. A question
    that is missing or not a string is treated as empty text instead of raising.

    Parameters:
        dataset: DataFrame with a 'Question' column.
        filter_words: iterable of strings that must all appear in the question.

    Returns:
        The matching rows of ``dataset``, selected with ``.loc``.
    """
    # Lowercase the search terms once up front rather than once per row
    wanted = [word.lower() for word in filter_words]

    def contains_all(question):
        # Missing questions (None / NaN) have no .lower(); compare against empty text
        text = question.lower() if isinstance(question, str) else ''
        return all(word in text for word in wanted)

    # astype(bool) keeps the mask boolean even when the column is empty
    mask = dataset['Question'].apply(contains_all).astype(bool)
    return dataset.loc[mask]

        all(list comprehension) goes through the various words in filter_words and 
        evaluates them to see if they're in the question.
        
        It'll return true if all of our words are within the question; else it returns
        false

In [319]:
# Rows whose question mentions both 'king' and 'england', in any capitalisation
filter_applied = filter_data(df,['King', 'England'])

        Any returned rows must contain 'King' and 'England' within their questions

In [320]:
len(filter_applied)

152

        This is used as a final check to examine how many rows met our condition list above

### Converting Numbers to Numerical Values

In [321]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Show_Number  216930 non-null  int64 
 1   Air_Date     216930 non-null  object
 2   Round        216930 non-null  object
 3   Category     216930 non-null  object
 4   Value        216930 non-null  object
 5   Question     216930 non-null  object
 6   Answer       216928 non-null  object
dtypes: int64(1), object(6)
memory usage: 11.6+ MB


        This gives us a better sense of our columns and their data types. If we want to 
        have greater functionality of the Value column we'd need to change it to the float
        datatype

#### Removing the $

In [322]:
# Drop the leading dollar sign so only the numeric text remains
df['Value'] = df['Value'].str.lstrip('$')

#### Converting Values to Float

In [323]:
def conversion(x):
    """Turn one cleaned 'Value' string into a float; the literal 'None' becomes 0.0."""
    if x == 'None':
        return float(0)
    # Thousands separators such as '1,000' must go before float() can parse the text
    return float(x.replace(',', ''))


df['Value'] = df['Value'].map(conversion)

        After completing this step our value for the 'Value' column will be float64

### Average Value of Questions 

        Let's say we're very knowledgeable about 'Kings'. We can calculate the mean of 
        questions related to our topic of expertise.

In [324]:
# Keep only the questions whose text mentions 'king', then average their dollar value
topic_expertise = filter_data(df, ['King'])
topic_expertise['Value'].mean()

771.8833850722094

        Here we can see the average value of questions related to 'Kings' is $771.88

#### Examining Different Answers to Questions

        Since we're an expert on the topic of 'king' let's explore what different
        answers there are to questions related to our expertise. 

In [325]:
def answers(filtered_dataset):
    """Tally how often each distinct value occurs in the 'Answer' column.

    Returns the Series produced by ``value_counts()``: one entry per distinct
    answer, most frequent first.
    """
    answer_column = filtered_dataset.loc[:, 'Answer']
    return answer_column.value_counts()

In [326]:
answers(topic_expertise)

Henry VIII           55
Solomon              35
Richard III          33
Louis XIV            31
David                30
                     ..
an oratory            1
void                  1
the Blarney Stone     1
Constantine (II)      1
the crown             1
Name: Answer, Length: 5268, dtype: int64

### Changing the Date to DateTime

In [327]:
# Parse the whole column in one vectorised call instead of invoking the parser once per row
df['Date'] = pd.to_datetime(df['Air_Date'])

        This converts the values into the datetime datatype

### Popularity of Computers

        We'll analyze the popularity of computers using the number of questions that 
        explicitly contain 'computer' as a proxy

In [328]:
computer_filter = filter_data(df,['Computer'])
# New dataframe holding only the questions whose text contains 'computer' (any capitalisation)

        We'll divide this analysis into two time periods.
            Period One: January 1, 1990 - December 31, 1999
            Period Two: January 1, 2000 - December 31, 2009

In [329]:
# Period one: 1 January 1990 through 31 December 1999.
# Series.between includes both endpoints, so shows aired on the boundary dates are counted.
computer_filter_90s = computer_filter[
    computer_filter['Date'].between(datetime.datetime(1990, 1, 1), datetime.datetime(1999, 12, 31))
]

In [330]:
# Period two: 1 January 2000 through 31 December 2009.
# Series.between includes both endpoints, so shows aired on the boundary dates are counted.
computer_filter_20s = computer_filter[
    computer_filter['Date'].between(datetime.datetime(2000, 1, 1), datetime.datetime(2009, 12, 31))
]

In [331]:
# Report the two period counts side by side; format() stringifies the integers itself
print(
    (
        'In the period from 1990 - 1999 there were {} questions regarding computers.\n'
        'Whereas the period from 2000 - 2009 featured {} questions regarding computers'
    ).format(len(computer_filter_90s), len(computer_filter_20s))
)

In the period from 1990 - 1999 there were 98 questions regarding computers.
Whereas the period from 2000 - 2009 featured 267 questions regarding computers


### Further Analysis

        Let's say we wanted to investigate this further and check the popularity of 
        'computers' on a yearly basis

In [332]:
# Air_Date is text in YYYY-MM-DD form, so its first four characters are the year
df['Year'] = pd.to_numeric(df['Air_Date'].str.slice(stop=4))

In [333]:
computer_filter_yearly = filter_data(df,['Computer'])
# Filters df again for 'computer' so that the result includes the Year column added above

        Let's group the frequency of computer related questions by the year they appeared

In [334]:
# Count the computer questions aired in each year; reset_index turns Year back into a regular column
computer_year_sort = computer_filter_yearly.groupby('Year')['Show_Number'].count().reset_index()

        This returns the year with the number of questions explicitly mentioning
        'computer'

In [335]:
# Year with the greatest number of questions that explicitly contain 'computer'.
# DataFrame.max() works column by column, so max()[0] only gave the latest year in the data;
# instead locate the row with the highest count and read its Year.
computer_year_sort.loc[computer_year_sort['Show_Number'].idxmax(), 'Year']

2012

In [336]:
# Number of 'computer' questions aired in the most popular year.
# The count column is selected by name because positional [1] on a labelled Series is deprecated.
computer_year_sort['Show_Number'].max()

45

### Self-Quiz

        Let's build a short program that enables users to randomly test their 
        knowledge using our jeopardy questions

In [339]:
def _count_phrase(count, noun):
    """Return a phrase such as 'no wrong answers', '1 wrong answer' or '3 wrong answers'."""
    if count == 0:
        return 'no {}s'.format(noun)
    if count == 1:
        return '1 {}'.format(noun)
    return '{} {}s'.format(count, noun)


def question_generator(dataframe):
    """Run an interactive quiz over random rows of ``dataframe``.

    Each round prints a random 'Question', reads the user's answer with
    ``input()``, compares it case-insensitively with the row's 'Answer' and
    prints the running score. The user is then asked whether to continue:
    'yes' starts another round, 'no' says goodbye, and anything else ends the
    quiz with an error message.

    Parameters:
        dataframe: DataFrame with 'Question' and 'Answer' columns. Any index is
            accepted because rows are picked by position.

    Returns:
        None. All interaction happens through ``print`` and ``input``.
    """
    if len(dataframe) == 0:
        print('There are no questions to ask.')
        return

    correct = 0
    wrong = 0
    while True:
        # Pick by position: randrange covers 0 .. len-1, so the first row is
        # reachable, the upper bound is never out of range, and filtered frames
        # with gaps in their index labels work too.
        row = dataframe.iloc[random.randrange(len(dataframe))]
        print(row['Question'])

        # This provides users the ability to answer the question
        user_input = input()

        # An answer may be missing (NaN); such a row can never be answered correctly
        answer = row['Answer']
        has_answer = isinstance(answer, str)
        is_correct = has_answer and user_input.strip().lower() == answer.strip().lower()

        if is_correct:
            correct += 1
            verdict = 'Correct!'
        else:
            wrong += 1
            verdict = 'Wrong!'

        # One message covers every correct/wrong combination, with consistent plurals
        print('{} You have {} and {}'.format(
            verdict,
            _count_phrase(correct, 'correct answer'),
            _count_phrase(wrong, 'wrong answer'),
        ))
        if not is_correct:
            shown_answer = answer.lower() if has_answer else 'not recorded'
            print('Please try again! The correct answer was ' + shown_answer)

        print()
        print('If you\'d like another question please type \'yes\'. If not please type \'no\'')

        response = input().strip().lower()

        # Only an explicit 'yes' keeps the game running
        if response == 'yes':
            continue
        if response == 'no':
            print("Goodbye")
        else:
            print('Invalid response! Program will close')
        break

In [341]:
# question_generator(df)