# Tournaments

This notebook builds a list of the tournaments played in a given range of years, and then a list of the matches played in each of those tournaments.

In [1]:
import sys
import numpy as np
import pandas as pd
from functions import tournaments, matches, players

# Print wide DataFrames on one row of output instead of wrapping their columns.
# The fully-qualified option name is used: pandas resolves abbreviated names by
# pattern matching, which can break if a similarly named option is added later.
pd.set_option('display.expand_frame_repr', False)

### Configuration
Specify the first and last year (both inclusive) to generate tournament listings for.

In [2]:
# Site root handed to the scraper helpers together with site-relative paths.
base_url = "http://www.atpworldtour.com/"

# First and last year to scrape (inclusive).  Kept as strings; the scraping
# cell converts them with int() when it builds the year range.
start_year = "2018"
end_year = "2018"

In [3]:
# Ad-hoc call of the player scraper on a single profile path.
# NOTE(review): nothing later in this notebook uses the result; this looks like
# a leftover manual check — confirm and consider removing it.
players(
    base_url,
    ["/en/players/ricardo-ojeda-lara/o408/"],
)

AGE:25
DOB:1993.01.26
Kgs:76kg
Cms:180cm


[]

### Tournament Listing
Generate a list of the tournaments that occurred in the specified years.

For each tournament we store the following attributes:
 * **Year** - What year did the tournament take place in?
 * **TourNo** - An incrementing count of the tournaments in the year
 * **Name** - Name of the tournament
 * **Date** - Date the tournament started
 * **Conditions** - Outdoor or Indoor
 * **Surface** - Hard, Clay, Grass etc
 * **TournamentURL** - The URL where the tournament matches are listed
 
 The year and a tournament number, both taken from the tournament URL, are joined with a hyphen to create a unique index called **TournamentID**.
 
 The tournament listing is held in a Pandas DataFrame called dfTours.

In [None]:
# Iterate through the years and scrape tourney data

# Columns each scraped tournament row is loaded into
tournament_columns = ['Year', 'TourNo', 'Name', 'Date', 'Conditions', 'Surface', 'TournamentURL']

# Collect one frame per year and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the accumulated frame on every pass.
yearly_frames = []
for year_number in range(int(start_year), int(end_year) + 1):
    year = str(year_number)
    dfNewTour = pd.DataFrame(tournaments(year), columns=tournament_columns)
    if not dfNewTour.empty:
        yearly_frames.append(dfNewTour)

if yearly_frames:
    dfTours = pd.concat(yearly_frames)
else:
    # Nothing scraped (e.g. start_year > end_year): fall back to the same
    # empty, correctly-shaped frame the loop used to start from.
    dfTours = pd.DataFrame([], columns=tournament_columns)

# Create an index of year plus tournament id.
# The URL is split once on a literal '/' (the previous '\/' pattern was an
# invalid string escape that pandas interpreted as a regular expression);
# path segment 6 and segment 5 are joined as "<segment 6>-<segment 5>".
url_parts = dfTours['TournamentURL'].str.split('/')
dfTours['TournamentID'] = url_parts.str[6] + "-" + url_parts.str[5]
dfTours = dfTours.set_index('TournamentID')

print(dfTours)

## Match Results

Generate a list of the matches played in each tournament.

For each match the following attributes are recorded:
 * **TournamentID** - The unique index of dfTours
 * **MatchID** - An ID assigned to that match.  Only unique within a tournament.
 * **WinnerID** / **LoserID** - A unique player identifier.
 * **WinnerURL** / **LoserURL** - The URL of the player's details page.
 * **WinnerName** / **LoserName** - The player's name.
 * **WinnerSeed** / **LoserSeed** - The player's seed, if they are seeded.  "WC" represents a wild card.
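 * **MatchURL** - The URL where the detailed match statistics are available (not needed).
 * **MatchNum** - The position of the match counted back from the end of the scraped list (the last row is 0).  This column is added by the code below rather than returned by the scraper.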
 
A multi-index of **TournamentID** and **MatchID** can be used to uniquely identify a match.

The match results are stored in a Pandas DataFrame called dfMatches.

In [None]:
# Iterate through each tournament and find the URLs with the match details

# Columns each scraped match row is loaded into
match_columns = ['TournamentID', 'MatchID', 'WinnerID', 'WinnerURL', 'WinnerName', 'WinnerSeed',
                                            'LoserID',  'LoserURL',  'LoserName',  'LoserSeed',
                 'MatchURL']

# Collect one frame per tournament and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the accumulated frame on every pass.
match_frames = []
for index, tourney in dfTours.iterrows():
    tourney_url = tourney['TournamentURL']
    # Same "<segment 6>-<segment 5>" identifier used for the dfTours index
    url_parts = tourney_url.split('/')
    tourney_id = url_parts[6] + "-" + url_parts[5]

    dfNewMatches = pd.DataFrame(matches(base_url, tourney_url + "?", tourney_id), columns=match_columns)
    if dfNewMatches.empty:
        # No matches scraped for this tournament: nothing to number or keep
        continue

    # Number the rows from the end: the last row gets 0 and the first row
    # gets the highest number.
    dfNewMatches['MatchNum'] = dfNewMatches.index.max() - dfNewMatches.index
    match_frames.append(dfNewMatches)

if match_frames:
    dfMatches = pd.concat(match_frames)
else:
    # Keep the schema stable even when no matches were found at all
    dfMatches = pd.DataFrame([], columns=match_columns + ['MatchNum'])

dfMatches = dfMatches.set_index(['TournamentID', 'MatchID'])
dfMatches