# Tournaments

This notebook builds a list of the tournaments played in a given year, then collects the matches, player statistics and ranking histories for those tournaments.

In [1]:
import sys
import numpy as np
import pandas as pd
import time
from functions import getTournaments, getMatches, getPlayers, getRankings

# Print wide DataFrames on one line per row instead of wrapping the columns into several blocks
pd.set_option('expand_frame_repr', False)

### Configuration
Specify the year to generate tournament listings for (`start_year`) and the base URL of the site being scraped (`base_url`).

In [2]:
# Root of the site being scraped; passed to getMatches, getPlayers and getRankings below
base_url = "http://www.atpworldtour.com/"
# Year to download, kept as a string because it is concatenated into the output file names
start_year = "2002"

## Initialise the DataFrames

In [12]:
# Column layouts for the four tables built by this notebook.  Each table starts
# out as an empty DataFrame that already carries its columns.

# One row per tournament
tournament_columns = [
    'Year', 'TourNo', 'Name', 'Date', 'Conditions', 'Surface', 'TournamentURL',
]
dfTours = pd.DataFrame([], columns=tournament_columns)

# One row per match: identifiers, the winner fields, the matching loser fields,
# and finally the link to the match statistics
match_columns = (
    ['TournamentID', 'MatchID']
    + ['WinnerID', 'WinnerURL', 'WinnerName', 'WinnerSeed']
    + ['LoserID', 'LoserURL', 'LoserName', 'LoserSeed']
    + ['MatchURL']
)
dfMatches = pd.DataFrame([], columns=match_columns)

# One row per player: biographical fields followed by four win/loss counters
# (year-to-date and all-time) for every record category
player_columns = [
    'PlayerID', 'Age', 'DoB', 'Weight', 'Height', 'YearWentPro', 'Hand', 'Backhand',
] + [
    category + record
    for category in ('Total', 'Clay', 'Grass', 'Hard', 'Carpet', 'Indoor', 'Outdoor')
    for record in ('YTDWin', 'YTDLoss', 'AllWin', 'AllLoss')
]
dfPlayers = pd.DataFrame([], columns=player_columns)

# One row per player and ranking date
ranking_columns = ['PlayerID', 'MatchDate', 'Ranking']
dfRankings = pd.DataFrame([], columns=ranking_columns)

### Tournament Listing
Generate a list of the tournaments that occurred in the specified year.

For each tournament we store the following attributes:
 * **Year** - What year did the tournament take place in?
 * **TourNo** - An incrementing count of the tournaments in the year
 * **Name** - Name of the tournament
 * **Date** - Date the tournament started
 * **Conditions** - Outdoor or Indoor
 * **Surface** - Hard, Clay, Grass etc
 * **TournamentURL** - The URL where the tournament matches are listed
 
 **Year** and a tournament number are combined to create a unique index called **TournamentID**.
 
 The tournament listing is held in a Pandas DataFrame called dfTours.

In [4]:
# Scrape the tournament listing for a single year
def getTournamentData(year):
    """Download the tournament listing for one year and index it by TournamentID.

    Parameters
    ----------
    year : str or int
        Season to download.  It is converted with ``str()`` once, so both
        ``"2002"`` and ``2002`` work for the scrape and for the progress message.

    Returns
    -------
    pandas.DataFrame
        The rows returned by ``getTournaments`` laid out with the columns in
        ``tournament_columns`` and indexed by ``TournamentID``.
    """
    year = str(year)

    start_time = time.time()
    dfNewTours = pd.DataFrame(getTournaments(year), columns = tournament_columns)
    end_time = time.time()
    print("Year " + year + ", Elapsed " + str(end_time-start_time))

    # TournamentID is "<segment 6>-<segment 5>" of TournamentURL split on "/".
    # The saved match table shows IDs such as "2002-451".  The split is done once
    # and on a literal "/" (the previous '\/' pattern was an invalid string escape).
    url_parts = dfNewTours['TournamentURL'].str.split('/')
    dfNewTours['TournamentID'] = url_parts.str[6] + "-" + url_parts.str[5]
    dfNewTours = dfNewTours.set_index('TournamentID')

    return dfNewTours

## Match Results

Generate a list of the matches played in each tournament.

For each match the following attributes are recorded:
 * **TournamentID** - The unique index of dfTours
 * **MatchID** - An ID assigned to that match.  Only unique within a tournament.
 * **WinnerID** / **LoserID** - A unique player identifier.
 * **WinnerURL** / **LoserURL** - The URL of the player's details.
 * **WinnerName** / **LoserName** - The player's name.
 * **WinnerSeed** / **LoserSeed** - If the player is seeded, what seed were they?  "WC" represents wild-card.
 * **MatchURL** - The URL where the detailed match statistics are available (not needed).
 
A multi-index of **TournamentID** and **MatchID** can be used to uniquely identify a match.

The match results are stored in a Pandas DataFrame called dfMatches.

In [5]:
# Scrape the match list of every tournament in dfTours
def getMatchData(dfTours):
    """Collect the matches of every tournament listed in ``dfTours``.

    Parameters
    ----------
    dfTours : pandas.DataFrame
        Tournament table; only its ``TournamentURL`` column is read.

    Returns
    -------
    pandas.DataFrame
        All scraped matches with the columns in ``match_columns`` plus ``MatchNum``,
        indexed by ``(TournamentID, MatchID)``.  When nothing was scraped the result
        is an empty frame with the ``match_columns`` layout.
    """
    # Per-tournament frames are gathered and concatenated once at the end:
    # DataFrame.append was removed in pandas 2.0 and re-copied the table on every call.
    batches = []

    for counter, tourney_url in enumerate(dfTours['TournamentURL'], start=1):
        start_time = time.time()

        # The ID needs path segments 5 and 6 of the URL.  A missing or shorter URL
        # used to abort the whole download with an IndexError; it is now reported
        # and skipped.
        # NOTE(review): the saved 2004 run failed with "list index out of range";
        # this may be the cause, but no traceback was saved to confirm it.
        url_parts = tourney_url.split('/') if isinstance(tourney_url, str) else []
        if len(url_parts) < 7:
            print("Skipping tournament " + str(counter) + ", unexpected TournamentURL " + repr(tourney_url))
            continue
        tourney_id = url_parts[6] + "-" + url_parts[5]

        dfNewMatches = pd.DataFrame(getMatches(base_url, tourney_url+"?", tourney_id), columns = match_columns)

        if len(dfNewMatches) > 0:
            # Reverse row position: the first returned row gets the highest number, the last gets 0
            dfNewMatches['MatchNum'] = max(dfNewMatches.index) - dfNewMatches.index
            batches.append(dfNewMatches)

        end_time = time.time()
        print("Tournaments processed " + str(counter) + ", Elapsed " + str(end_time-start_time))

    if batches:
        dfAllMatches = pd.concat(batches)
    else:
        dfAllMatches = pd.DataFrame([], columns=match_columns)

    dfAllMatches = dfAllMatches.set_index(['TournamentID', 'MatchID'])
    return dfAllMatches

## Player URLs

Generate a list of all the URLs where we can find the individual player statistics.  This will be held in a numpy array called playerURLs.

In [6]:
def getPlayerURLs(dfMatches):
    """Return the sorted, de-duplicated player URLs found in ``dfMatches``.

    Parameters
    ----------
    dfMatches : pandas.DataFrame
        Match table with ``WinnerURL`` and ``LoserURL`` columns.

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding every distinct value of the two URL columns.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0, so each column is converted
    # directly.  Missing URLs are dropped first so that np.unique never has to order
    # NaN against strings, which raises TypeError.
    winnerURLs = np.asarray(dfMatches["WinnerURL"].dropna(), dtype=object)
    loserURLs  = np.asarray(dfMatches["LoserURL"].dropna(), dtype=object)

    return np.unique(np.concatenate((winnerURLs, loserURLs)))

## Player Statistics

Given a list of player URLs extract the following attributes for each player:
 * **PlayerID**
 * **Age**, **DoB**
 * **Weight**
 * **Height**
 * **YearWentPro**
 * **Hand**
 * **Backhand**
 * **TotalYTDWin**, **TotalYTDLoss**, **TotalAllWin**, **TotalAllLoss**
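 * **ClayYTDWin**, **ClayYTDLoss**, **ClayAllWin**, **ClayAllLoss**
 * **GrassYTDWin**, **GrassYTDLoss**, **GrassAllWin**, **GrassAllLoss**
 * **HardYTDWin**, **HardYTDLoss**, **HardAllWin**, **HardAllLoss**
 * **CarpetYTDWin**, **CarpetYTDLoss**, **CarpetAllWin**, **CarpetAllLoss**
 * **IndoorYTDWin**, **IndoorYTDLoss**, **IndoorAllWin**, **IndoorAllLoss**
 * **OutdoorYTDWin**, **OutdoorYTDLoss**, **OutdoorAllWin**, **OutdoorAllLoss**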
 
 This data is held in a DataFrame called **dfPlayers**.

In [7]:
def getPlayerData(playerURLs):
    """Download the player statistics for ``playerURLs`` in batches of 50.

    Parameters
    ----------
    playerURLs : sequence
        Sliceable collection of player URLs; each slice is handed to ``getPlayers``
        together with ``base_url``.

    Returns
    -------
    pandas.DataFrame
        All downloaded rows with the columns in ``player_columns``, indexed by
        ``PlayerID``.  Empty (but with the same layout) when nothing was downloaded.
    """
    num_players = len(playerURLs)
    batch_size = 50

    # Batches are gathered and concatenated once at the end: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole table on every call.
    batches = []

    for player_counter, start_from in enumerate(range(0, num_players, batch_size), start=1):
        start_time = time.time()
        end_at = min(start_from + batch_size, num_players)

        dfNewPlayers = pd.DataFrame(getPlayers(base_url, playerURLs[start_from:end_at]), columns = player_columns)

        mid_time = time.time()
        if len(dfNewPlayers) > 0:
            batches.append(dfNewPlayers)
        end_time = time.time()
        print("Loop " + str(player_counter) + ", Start " + str(start_from) + ", Mid " + str(mid_time-start_time) + ", Elapsed " + str(end_time-start_time))

    if batches:
        dfAllPlayers = pd.concat(batches)
    else:
        dfAllPlayers = pd.DataFrame([], columns = player_columns)

    dfAllPlayers = dfAllPlayers.set_index('PlayerID')
    return dfAllPlayers

## Ranking Data

Player world ranking varies over time.  For each player extract the history of their ranking:
 * **PlayerID**
 * **MatchDate**
 * **Ranking**
 
This data is held in a DataFrame called **dfRankings**.

In [8]:
def getRankingData(playerURLs):
    """Download the ranking history for ``playerURLs`` in batches of 100.

    Parameters
    ----------
    playerURLs : sequence
        Sliceable collection of player URLs; each slice is handed to ``getRankings``
        together with ``base_url``.

    Returns
    -------
    pandas.DataFrame
        All downloaded rows with the columns in ``ranking_columns``, indexed by
        ``(PlayerID, MatchDate)``.  Empty (but with the same layout) when nothing
        was downloaded.
    """
    num_players = len(playerURLs)
    batch_size = 100

    # Batches are gathered and concatenated once at the end: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole table on every call.
    batches = []

    for counter, start_from in enumerate(range(0, num_players, batch_size), start=1):
        start_time = time.time()
        end_at = min(start_from + batch_size, num_players)
        dfNewRankings = pd.DataFrame(getRankings(base_url, playerURLs[start_from:end_at]), columns = ranking_columns)

        if len(dfNewRankings) > 0:
            batches.append(dfNewRankings)

        end_time = time.time()
        print("Loop " + str(counter) + ", Start " + str(start_from) + ", Elapsed " + str(end_time-start_time))

    if batches:
        dfAllRankings = pd.concat(batches)
    else:
        dfAllRankings = pd.DataFrame([], columns = ranking_columns)

    dfAllRankings = dfAllRankings.set_index(['PlayerID', 'MatchDate'])
    return dfAllRankings

# Run Data Download

In [None]:
# Scrape the tournament listing for start_year, print it, and save it under ./data
dfTours = getTournamentData(start_year)
print(dfTours)
dfTours.to_csv(path_or_buf="./data/dfTours_" + start_year + ".csv")

In [None]:
# Scrape the matches of every tournament in dfTours, print them, and save them under ./data
dfMatches = getMatchData(dfTours)
print(dfMatches)
dfMatches.to_csv(path_or_buf="./data/dfMatches_" + start_year + ".csv")

In [None]:
# Distinct WinnerURL / LoserURL values from dfMatches; input for the player and ranking downloads
playerURLs = getPlayerURLs(dfMatches)

In [12]:
# Download the statistics of every player in playerURLs and save them under ./data
dfPlayers = getPlayerData(playerURLs)
dfPlayers.to_csv(path_or_buf="./data/dfPlayers_" + start_year + ".csv")

Loop 1, Start 0, Mid 10.815720081329346, Elapsed 10.815720081329346
Loop 2, Start 50, Mid 11.063208818435669, Elapsed 11.063208818435669
Loop 3, Start 100, Mid 10.502323627471924, Elapsed 10.502323627471924
Loop 4, Start 150, Mid 10.884016275405884, Elapsed 10.885016918182373
Loop 5, Start 200, Mid 11.09583044052124, Elapsed 11.096831321716309
Loop 6, Start 250, Mid 10.580931901931763, Elapsed 10.581931591033936
Loop 7, Start 300, Mid 3.39494252204895, Elapsed 3.39494252204895


In [14]:
# Download the ranking history of every player in playerURLs and save it under ./data
dfRankings = getRankingData(playerURLs)
dfRankings.to_csv(path_or_buf="./data/dfRankings_" + start_year + ".csv")

Loop 1, Start 0, Elapsed 121.78270888328552
Loop 2, Start 100, Elapsed 123.66412377357483
Loop 3, Start 200, Elapsed 122.18555736541748
Loop 4, Start 300, Elapsed 20.09598469734192


# Get data for a year

In [9]:
def getDataForYear(year, data_dir="./data"):
    """Download tournaments, matches, players and rankings for one year and save each as CSV.

    Each table is written as soon as it has been downloaded, to
    ``<data_dir>/dfTours_<year>.csv``, ``dfMatches_<year>.csv``,
    ``dfPlayers_<year>.csv`` and ``dfRankings_<year>.csv``.

    Parameters
    ----------
    year : str or int
        Season to download; converted with ``str()`` so it can be used in file names.
    data_dir : str, optional
        Directory that receives the CSV files.  Defaults to ``"./data"``, the
        location used before this parameter existed.

    Returns
    -------
    None
    """
    import os  # local import so the function does not depend on edits to the import cell

    year = str(year)

    # Create the target directory up front so a missing folder cannot fail a save
    os.makedirs(data_dir, exist_ok=True)

    def csv_path(table_name):
        # e.g. ./data/dfTours_2004.csv
        return os.path.join(data_dir, table_name + "_" + year + ".csv")

    dfTours = getTournamentData(year)
    dfTours.to_csv(path_or_buf=csv_path("dfTours"))

    dfMatches = getMatchData(dfTours)
    dfMatches.to_csv(path_or_buf=csv_path("dfMatches"))

    playerURLs = getPlayerURLs(dfMatches)

    dfPlayers = getPlayerData(playerURLs)
    dfPlayers.to_csv(path_or_buf=csv_path("dfPlayers"))

    dfRankings = getRankingData(playerURLs)
    dfRankings.to_csv(path_or_buf=csv_path("dfRankings"))

In [13]:
# NOTE(review): the output saved with this cell is "IndexError: list index out of range",
# so this 2004 run did not complete.
getDataForYear("2004")

IndexError: list index out of range

# Load Datasets from File

In [8]:
# Reload the 2002 tables from the CSV files under ./data.
# pd.read_csv replaces DataFrame.from_csv, which was deprecated in pandas 0.21 and
# removed in 1.0.  The files are written by DataFrame.to_csv without an explicit
# encoding (UTF-8 by default on Python 3), so they are read back as UTF-8 instead of
# with the Windows-only "mbcs" codec.  from_csv also tried to parse the index as
# dates; that is not carried over because the index holds IDs such as "2002-451".
dfTours = pd.read_csv("./data/dfTours_2002.csv", encoding="utf-8", index_col=0)
dfMatches = pd.read_csv("./data/dfMatches_2002.csv", encoding="utf-8", index_col=[0,1])
#dfPlayers = pd.read_csv("./dfPlayers.csv", encoding="utf-8", index_col=0)
#dfRankings = pd.read_csv("./dfRankings.csv", encoding="utf-8", index_col=[0,1])

#playerURLs = getPlayerURLs(dfMatches)

In [10]:
# Bare last expression: the notebook renders the loaded match table as the cell output
dfMatches

Unnamed: 0_level_0,Unnamed: 1_level_0,LoserID,LoserName,LoserSeed,LoserURL,MatchNum,MatchURL,WinnerID,WinnerName,WinnerSeed,WinnerURL
TournamentID,MatchID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2002-451,MS001,felix-mantilla:m535,Felix Mantilla,8,/en/players/felix-mantilla/m535,30.0,/en/scores/2002/451/MS001/match-stats?isLive=F...,younes-el-aynaoui:e121,Younes El Aynaoui,6,/en/players/younes-el-aynaoui/e121
2002-451,MS003,bohdan-ulihrach:u032,Bohdan Ulihrach,5,/en/players/bohdan-ulihrach/u032,29.0,/en/scores/2002/451/MS003/match-stats?isLive=F...,felix-mantilla:m535,Felix Mantilla,8,/en/players/felix-mantilla/m535
2002-451,MS002,rainer-schuettler:s636,Rainer Schuettler,7,/en/players/rainer-schuettler/s636,28.0,/en/scores/2002/451/MS002/match-stats?isLive=F...,younes-el-aynaoui:e121,Younes El Aynaoui,6,/en/players/younes-el-aynaoui/e121
2002-451,MS004,yevgeny-kafelnikov:k267,Yevgeny Kafelnikov,1,/en/players/yevgeny-kafelnikov/k267,27.0,/en/scores/2002/451/MS004/match-stats?isLive=F...,rainer-schuettler:s636,Rainer Schuettler,7,/en/players/rainer-schuettler/s636
2002-451,MS005,jiri-novak:n254,Jiri Novak,4,/en/players/jiri-novak/n254,26.0,/en/scores/2002/451/MS005/match-stats?isLive=F...,younes-el-aynaoui:e121,Younes El Aynaoui,6,/en/players/younes-el-aynaoui/e121
2002-451,MS006,fernando-vicente:v195,Fernando Vicente,,/en/players/fernando-vicente/v195,25.0,/en/scores/2002/451/MS006/match-stats?isLive=F...,bohdan-ulihrach:u032,Bohdan Ulihrach,5,/en/players/bohdan-ulihrach/u032
2002-451,MS007,radek-stepanek:s694,Radek Stepanek,Q,/en/players/radek-stepanek/s694,24.0,/en/scores/2002/451/MS007/match-stats?isLive=F...,felix-mantilla:m535,Felix Mantilla,8,/en/players/felix-mantilla/m535
2002-451,MS008,nikolay-davydenko:d402,Nikolay Davydenko,,/en/players/nikolay-davydenko/d402,23.0,/en/scores/2002/451/MS008/match-stats?isLive=F...,yevgeny-kafelnikov:k267,Yevgeny Kafelnikov,1,/en/players/yevgeny-kafelnikov/k267
2002-451,MS015,goran-ivanisevic:i034,Goran Ivanisevic,2,/en/players/goran-ivanisevic/i034,22.0,/en/scores/2002/451/MS015/match-stats?isLive=F...,radek-stepanek:s694,Radek Stepanek,Q,/en/players/radek-stepanek/s694
2002-451,MS010,antony-dupuis:d272,Antony Dupuis,,/en/players/antony-dupuis/d272,21.0,/en/scores/2002/451/MS010/match-stats?isLive=F...,jiri-novak:n254,Jiri Novak,4,/en/players/jiri-novak/n254
