## Final Project 

### UCLA 405 - Andrew Sang 505256314

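This notebook scrapes per-question contestant scores for Jeopardy! games from the J! Archive (j-archive.com), flattens them into a long-format table, and exports the results as CSV files for analysis.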

In [1]:
# scraping
import requests
from bs4 import BeautifulSoup

# times
import time
import datetime

# data
from collections import defaultdict
import glob
import json
import pandas as pd
from pandas.io.json import json_normalize #package for flattening json in pandas df

In [2]:
def get_bs4(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection, which would freeze a long scrape.

    Returns:
        A ``BeautifulSoup`` tree built from the raw response content with
        the ``'html.parser'`` backend.

    Raises:
        requests.exceptions.RequestException: On connection failure,
            timeout, or a 4xx/5xx HTTP status.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as data.
    page.raise_for_status()
    return BeautifulSoup(page.content, 'html.parser')

def parse_jeopardy():
    """Scrape per-question scores for every listed game and dump them to JSON.

    Reads the season list from ``listseasons.php`` on j-archive.com, walks
    every table row of each season page, and fetches
    ``showscores.php?game_id=...`` for each game.  One dict per game is
    collected with these keys:

    * ``season`` / ``game_id`` -- parsed from the page links,
    * ``contestant_{i}_name`` / ``_id`` / ``_fname`` -- from the links in the
      second cell of the contestants table,
    * ``player_{i}`` -- text of the classed cells in the first row of the
      first scores table,
    * ``scores`` -- a one-element list holding a dict keyed
      ``rnd{r}_qnum{q}_player{p}``; values are the cell text with ``$`` and
      ``,`` removed.

    A game or season that fails to parse is reported with ``print`` and
    skipped; the scrape then continues.  ``KeyboardInterrupt`` is not
    swallowed, so a long run can still be stopped.

    Returns:
        str: Name of the file written,
        ``data_generated_<YYYYmmddHHMMSS>.json``.
    """
    base_url = 'http://www.j-archive.com/'
    rounds = ['jeopardy_round','double_jeopardy_round','final_jeopardy_round']

    all_seasons = get_bs4(base_url + 'listseasons.php')
    all_games = []
    # The last link in the content div is dropped -- presumably it is not a
    # season link; TODO confirm against the live page.
    seasons_list = all_seasons.find('div',id='content').find_all('a',href=True)[0:-1]

    for s, season in enumerate(seasons_list):
        # Bound before the try so the error report below can never itself
        # fail with NameError.
        seas_num = None
        try:
            started = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print('season {0} of {1}, @ {2}'.format(s+1, len(seasons_list), started))

            season_url = base_url + season['href']
            seas_num = season['href'].split('=')[1]
            episodes = (get_bs4(season_url).find('div', id='content')
                                           .find_all('tr'))

            for episode in episodes:
                # Reset per episode: the error report must describe THIS
                # episode, not values left over from the previous one.
                game_id = score_url = rnd = None
                try:
                    game_id = episode.find('td').find('a',href=True)['href'].split('game_id=')[1]
                    score_url = base_url + 'showscores.php?game_id=' + game_id
                    game_score = get_bs4(score_url)

                    episode_dict = {}
                    episode_dict['season'] = seas_num
                    episode_dict['game_id'] = game_id
                    episode_dict['scores'] = list()
                    scores_dict = dict()

                    # contestant data
                    contestants = (game_score.find('table',id='contestants_table')
                                             .find_all('td')[1]
                                             .find_all('a',href=True))
                    for c, contestant in enumerate(contestants):
                        episode_dict['contestant_{0}_name'.format(c)] = contestant.text
                        episode_dict['contestant_{0}_id'.format(c)] = contestant['href'].split('?player_id=')[1]
                        episode_dict['contestant_{0}_fname'.format(c)] = contestant.text.split(' ')[0]

                    # player data
                    players = game_score.find('table',class_="scores_table").find('tr').find_all('td',class_=True)
                    for p, player in enumerate(players):
                        episode_dict['player_{0}'.format(p)] = player.text

                    # score data
                    for r, rnd in enumerate(rounds):
                        # Skip the first row of each round's table.
                        questions = game_score.find('div', id=rnd).find('table').find_all('tr')[1:]
                        if rnd == 'final_jeopardy_round':
                            # Only the first data row is read for the final
                            # round; it is stored as question number 0.
                            rows = [(0, row.find_all('td')) for row in questions[:1]]
                        else:
                            # First cell is the question number, the next
                            # three cells are the score columns.
                            rows = [(row.find_all('td')[0].text, row.find_all('td')[1:4])
                                    for row in questions]
                        for qnum, cells in rows:
                            for sc, cell in enumerate(cells):
                                score = cell.text.replace('$','').replace(',','')
                                scores_dict['rnd{0}_qnum{1}_player{2}'.format(r, qnum, sc)] = score

                    # append
                    episode_dict['scores'].append(scores_dict)
                    all_games.append(episode_dict)
                except Exception as err:
                    print('ep error season {0} game {1}, round {2}, url {3}'.format(seas_num,
                                                                                    game_id,
                                                                                    rnd,
                                                                                    score_url))
                    print('    {0!r}'.format(err))
        except Exception as err:
            print('seas error season {0} fail'.format(seas_num))
            print('    {0!r}'.format(err))

    ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    out_name = 'data_generated_{0}.json'.format(ts)
    with open(out_name, 'w') as outfile:
        json.dump(all_games, outfile)
    # Returned so callers can find the dump and recover its timestamp.
    return out_name

In [14]:
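# References for the reshaping cells below:
# - flattening nested JSON with json_normalize:
#   https://www.kaggle.com/jboysen/quick-tutorial-flatten-nested-json-in-pandas
# - wide-to-long reshaping with pandas.melt:
#   https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.melt.html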

In [15]:
# `ts` is only ever assigned as a local inside parse_jeopardy(), so on a fresh
# kernel it is undefined at this point.  Recover it from the newest dump on
# disk instead: the %Y%m%d%H%M%S stamp in the file name sorts chronologically
# as plain text, so the last name in sorted order is the most recent run.
data_files = sorted(glob.glob('data_generated_*.json'))
if not data_files:
    raise FileNotFoundError('no data_generated_*.json found; run parse_jeopardy() first')
data_path = data_files[-1]

# Timestamp portion of the file name; reused by the export cells below.
ts = data_path[len('data_generated_'):-len('.json')]

with open(data_path) as json_file:
    data = json.load(json_file)

In [16]:
# Flatten the nested 'scores' records into a wide frame: one column per
# score key, with game_id and season carried along as metadata columns.
scores_wide = json_normalize(data, record_path=['scores'], meta=['game_id', 'season'])

# Reshape wide -> long: each score key becomes a row, with the key in
# 'variable' and its value in 'score'.
df = scores_wide.melt(id_vars=['game_id', 'season'], value_name='score')

In [17]:
# The melted 'variable' labels have the form 'rnd{r}_qnum{q}_player{p}';
# split them back into their three parts.
label_parts = df['variable'].str.split('_')
df = df.assign(
    rnd=pd.to_numeric(label_parts.str[0].str.split('rnd').str[1]),
    question=pd.to_numeric(label_parts.str[1].str.split('qnum').str[1]),
    player=label_parts.str[2],
)

# cleanup
# Drop rows with no score (the melt leaves NaN where a game lacks a key).
df = df.loc[df['score'].notna()]
# Drop rows whose score text contains 'lock'; na=True also drops any value
# that is not a string, matching the original `== False` filter.
df = df.loc[~df['score'].str.contains('lock', na=True)]
# Replace the column rather than writing through .loc[:, 'score']: an
# in-place write into the existing object column can leave the dtype as
# object on recent pandas, and assigning on a filtered slice triggers
# SettingWithCopyWarning.  assign() returns a new frame with a numeric column.
df = df.assign(score=pd.to_numeric(df['score']))

In [18]:
# Write the cleaned long-format scores next to the notebook, tagged with the
# same timestamp as the JSON dump they were built from.
output_csv = 'output_generated_{0}.csv'.format(ts)
df.to_csv(output_csv)

In [25]:
# One row per (season, game, round, question) with one score column per
# player.  pivot_table's default aggregation (mean) applies if a key repeats.
question_keys = ['season','game_id','rnd','question']
pivot = (
    df.pivot_table(index=question_keys, columns='player', values='score')
      .reset_index()
)

pivot_csv = 'pivot_generated_{0}.csv'.format(ts)
pivot.to_csv(pivot_csv)