# Tournaments

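This notebook scrapes ATP World Tour data for a given year (tournaments, match results, player statistics and ranking histories) and saves each dataset to a CSV file.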

In [1]:
import sys
import numpy as np
import pandas as pd
import time
from functions import getTournaments, getMatches, getPlayers, getRankings, getTournaments2

# Keep wide DataFrames on a single printed line instead of wrapping their columns
# across several blocks (the print(dfTours) output further down relies on this).
pd.set_option('expand_frame_repr', False)

### Configuration
Specify the site to scrape and the year to generate tournament listings for.

In [2]:
# Site root; passed to getMatches, getPlayers and getRankings by the functions below.
base_url = "http://www.atpworldtour.com/"
# Year used by the step-by-step "Run Data Download" cells. Kept as a string because
# it is concatenated directly into the output CSV file names.
start_year = "2004"

## Initialise the DataFrames

In [3]:
# Column layout of each dataset, followed by an empty DataFrame of that shape.

# Tournaments
tournament_columns = [
    'Year', 'TourNo', 'Name', 'Date',
    'Conditions', 'Surface', 'Prize', 'TournamentURL',
]
dfTours = pd.DataFrame([], columns=tournament_columns)

# Matches: the winner and loser carry the same four attributes
match_columns = (
    ['TournamentID', 'MatchID']
    + [role + field for role in ('Winner', 'Loser') for field in ('ID', 'URL', 'Name', 'Seed')]
    + ['MatchURL']
)
dfMatches = pd.DataFrame([], columns=match_columns)

# Players: biographical fields, then year-to-date / career win-loss counts
# overall and per surface / playing condition
player_columns = ['PlayerID', 'Age', 'DoB', 'Weight', 'Height', 'YearWentPro', 'Hand', 'Backhand'] + [
    category + statistic
    for category in ('Total', 'Clay', 'Grass', 'Hard', 'Carpet', 'Indoor', 'Outdoor')
    for statistic in ('YTDWin', 'YTDLoss', 'AllWin', 'AllLoss')
]
dfPlayers = pd.DataFrame([], columns=player_columns)

# Rankings
ranking_columns = ['PlayerID', 'MatchDate', 'Ranking']
dfRankings = pd.DataFrame([], columns=ranking_columns)

### Tournament Listing
Generate a list of the tournaments that occurred in the specified year.

For each tournament we store the following attributes:
 * **Year** - What year did the tournament take place in?
 * **TourNo** - An incrementing count of the tournaments in the year
 * **Name** - Name of the tournament
 * **Date** - Date the tournament started
 * **Conditions** - Outdoor or Indoor
 * **Surface** - Hard, Clay, Grass etc
 * **Prize** - Prize money at stake
 * **TournamentURL** - The URL where the tournament matches are listed
 
 The year and the tournament code taken from the **TournamentURL** are combined (for example `2004-451`) to create a unique index called **TournamentID**.
 
 The tournament listing is held in a Pandas DataFrame called dfTours.

In [4]:
# Scrape the tournament listing for a single year
def getTournamentData(year):
    """Scrape the tournaments of one year and index them by TournamentID.

    Parameters
    ----------
    year : str or int
        Year to scrape. It is converted to a string once, so both "2004" and
        2004 are accepted.

    Returns
    -------
    pandas.DataFrame
        One row per tournament with the columns in ``tournament_columns``,
        indexed by ``TournamentID``.
    """
    # The progress message is built by string concatenation, so normalise up front.
    year = str(year)

    start_time = time.time()
    dfNewTours = pd.DataFrame(getTournaments(year), columns = tournament_columns)
    end_time = time.time()
    print("Year " + year + ", Elapsed " + str(end_time-start_time))

    # TournamentURL looks like /en/scores/archive/<name>/<code>/<year>/results, so
    # splitting on '/' puts the tournament code at position 5 and the year at
    # position 6. The ID is "<year>-<code>", e.g. "2004-451".
    url_parts = dfNewTours['TournamentURL'].str.split('/')
    dfNewTours['TournamentID'] = url_parts.str[6] + "-" + url_parts.str[5]

    return dfNewTours.set_index('TournamentID')

## Match Results

Generate a list of the matches played in each tournament.

For each match the following attributes are recorded:
 * **TournamentID** - The unique index of dfTours
 * **MatchID** - An ID assigned to that match.  Only unique within a tournament.
 * **WinnerID** / **LoserID** - A unique player identifier.
 * **WinnerURL** / **LoserURL** - The URL of the player's details.
 * **WinnerName** / **LoserName** - The player's name.
 * **WinnerSeed** / **LoserSeed** - If the player is seeded what seed were they?  "WC" represents wild-card.
 * **MatchURL** - The URL where the detailed match statistics are available (not needed).
 
A multi-index of **TournamentID** and **MatchID** can be used to uniquely identify a match.

The match results are stored in a Pandas DataFrame called dfMatches.

In [5]:
# Iterate through each tournament and collect its match results
def getMatchData(dfTours):
    """Scrape the matches of every tournament listed in ``dfTours``.

    Parameters
    ----------
    dfTours : pandas.DataFrame
        Tournament listing; only its ``TournamentURL`` column is read.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns in ``match_columns`` plus
        ``MatchNum``, indexed by ``(TournamentID, MatchID)``. ``MatchNum`` is
        the largest row position within the tournament minus the row's own
        position.
    """
    # Collect one frame per tournament and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and re-copied everything on each call.
    tournament_frames = []

    for counter, tourney_url in enumerate(dfTours['TournamentURL'], start=1):
        start_time = time.time()

        # Same "<year>-<code>" identifier that indexes the tournament listing
        url_parts = tourney_url.split('/')
        tourney_id = url_parts[6] + "-" + url_parts[5]

        dfNewMatches = pd.DataFrame(getMatches(base_url, tourney_url+"?", tourney_id), columns = match_columns)

        # Tournaments that yield no matches contribute nothing to the result
        if len(dfNewMatches) > 0:
            dfNewMatches['MatchNum'] = max(dfNewMatches.index) - dfNewMatches.index
            tournament_frames.append(dfNewMatches)

        end_time = time.time()
        print("Tournaments processed " + str(counter) + ", Elapsed " + str(end_time-start_time))

    if tournament_frames:
        dfAllMatches = pd.concat(tournament_frames)
    else:
        # Nothing scraped: return an empty frame with the same schema
        dfAllMatches = pd.DataFrame([], columns = match_columns + ['MatchNum'])

    return dfAllMatches.set_index(['TournamentID', 'MatchID'])

## Player URLs

Generate a list of all the URLs where we can find the individual player statistics.  This will be held in a numpy array called playerURLs.

In [6]:
def getPlayerURLs(dfMatches):
    """Return the sorted, de-duplicated player URLs appearing in ``dfMatches``.

    Parameters
    ----------
    dfMatches : pandas.DataFrame
        Match results with ``WinnerURL`` and ``LoserURL`` columns.

    Returns
    -------
    numpy.ndarray
        1-D array holding each distinct winner/loser URL once, in sorted order.
        Missing URLs are skipped.
    """
    # DataFrame.as_matrix was removed in pandas 1.0, so read the columns directly.
    # Missing values are dropped first, because np.unique cannot sort a mix of
    # str and float NaN.
    winnerURLs = np.asarray(dfMatches["WinnerURL"].dropna(), dtype=object)
    loserURLs  = np.asarray(dfMatches["LoserURL"].dropna(), dtype=object)

    return np.unique(np.concatenate((winnerURLs, loserURLs)))

## Player Statistics

Given a list of player URLs extract the following attributes for each player:
 * **PlayerID**
 * **Age**, **DoB**
 * **Weight**
 * **Height**
 * **YearWentPro**
 * **Hand**
 * **Backhand**
 * **TotalYTDWin**, **TotalYTDLoss**, **TotalAllWin**, **TotalAllLoss**
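 * **ClayYTDWin**, **ClayYTDLoss**, **ClayAllWin**, **ClayAllLoss**
 * **GrassYTDWin**, **GrassYTDLoss**, **GrassAllWin**, **GrassAllLoss**
 * **HardYTDWin**, **HardYTDLoss**, **HardAllWin**, **HardAllLoss**
 * **CarpetYTDWin**, **CarpetYTDLoss**, **CarpetAllWin**, **CarpetAllLoss**
 * **IndoorYTDWin**, **IndoorYTDLoss**, **IndoorAllWin**, **IndoorAllLoss**
 * **OutdoorYTDWin**, **OutdoorYTDLoss**, **OutdoorAllWin**, **OutdoorAllLoss**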
 
 This data is held in a DataFrame called **dfPlayers**.

In [7]:
def getPlayerData(playerURLs):
    """Scrape the statistics of every player in ``playerURLs``, 50 at a time.

    Parameters
    ----------
    playerURLs : sequence of str
        Player page URLs. Must support ``len`` and slicing (the notebook passes
        the numpy array returned by ``getPlayerURLs``).

    Returns
    -------
    pandas.DataFrame
        One row per scraped player with the columns in ``player_columns``,
        indexed by ``PlayerID``.
    """
    batch_size = 50
    num_players = len(playerURLs)

    # Collect the batches and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0.
    batches = []

    for loop_number, start_from in enumerate(range(0, num_players, batch_size), start=1):
        start_time = time.time()
        end_at = min(start_from + batch_size, num_players)

        dfNewPlayers = pd.DataFrame(getPlayers(base_url, playerURLs[start_from:end_at]), columns = player_columns)

        # "Mid" is the time spent scraping; "Elapsed" also includes the bookkeeping below
        mid_time = time.time()
        if len(dfNewPlayers) > 0:
            batches.append(dfNewPlayers)
        end_time = time.time()
        print("Loop " + str(loop_number) + ", Start " + str(start_from) + ", Mid " + str(mid_time-start_time) + ", Elapsed " + str(end_time-start_time))

    if batches:
        dfAllPlayers = pd.concat(batches)
    else:
        # No players scraped: return an empty frame with the expected columns
        dfAllPlayers = pd.DataFrame([], columns = player_columns)

    return dfAllPlayers.set_index('PlayerID')

## Ranking Data

Player world ranking varies over time.  For each player extract the history of their ranking:
 * **PlayerID**
 * **MatchDate**
 * **Ranking**
 
This data is held in a DataFrame called **dfRankings**.

In [8]:
def getRankingData(playerURLs):
    """Scrape the ranking history of every player in ``playerURLs``, 100 at a time.

    Parameters
    ----------
    playerURLs : sequence of str
        Player page URLs. Must support ``len`` and slicing (the notebook passes
        the numpy array returned by ``getPlayerURLs``).

    Returns
    -------
    pandas.DataFrame
        Ranking records with the columns in ``ranking_columns``, indexed by
        ``(PlayerID, MatchDate)``.
    """
    batch_size = 100
    num_players = len(playerURLs)

    # Collect the batches and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0.
    batches = []

    for loop_number, start_from in enumerate(range(0, num_players, batch_size), start=1):
        start_time = time.time()
        end_at = min(start_from + batch_size, num_players)

        dfNewRankings = pd.DataFrame(getRankings(base_url, playerURLs[start_from:end_at]), columns = ranking_columns)
        if len(dfNewRankings) > 0:
            batches.append(dfNewRankings)

        end_time = time.time()
        print("Loop " + str(loop_number) + ", Start " + str(start_from) + ", Elapsed " + str(end_time-start_time))

    if batches:
        dfAllRankings = pd.concat(batches)
    else:
        # No rankings scraped: return an empty frame with the expected columns
        dfAllRankings = pd.DataFrame([], columns = ranking_columns)

    return dfAllRankings.set_index(['PlayerID', 'MatchDate'])

# Get data for a year

In [9]:
def getDataForYear(year):
    """Scrape one year of data and save each dataset under ``./data``.

    Writes ``dfTours_<year>.csv``, ``dfMatches_<year>.csv``,
    ``dfPlayers_<year>.csv`` and ``dfRankings_<year>.csv``. Each file is written
    as soon as its dataset is ready, so earlier files survive a failure in a
    later step.

    Parameters
    ----------
    year : str or int
        Year to scrape. It is converted to a string because it is concatenated
        into the file names.

    Returns
    -------
    None
    """
    # Imported here so the function does not depend on the notebook's import cell
    import os

    year = str(year)
    # to_csv does not create missing directories
    os.makedirs("./data", exist_ok=True)

    dfTours = getTournamentData(year)
    dfTours.to_csv(path_or_buf="./data/dfTours_" + year + ".csv")

    dfMatches = getMatchData(dfTours)
    dfMatches.to_csv(path_or_buf="./data/dfMatches_" + year + ".csv")

    # The same URL list drives both the player and the ranking scrape
    playerURLs = getPlayerURLs(dfMatches)

    dfPlayers = getPlayerData(playerURLs)
    dfPlayers.to_csv(path_or_buf="./data/dfPlayers_" + year + ".csv")

    dfRankings = getRankingData(playerURLs)
    dfRankings.to_csv(path_or_buf="./data/dfRankings_" + year + ".csv")

In [10]:
# Scrape the 2016 tournaments, matches, players and rankings and write them to ./data/*_2016.csv
getDataForYear("2016")

Year 2016, Elapsed 2.395132541656494
Tournaments processed 1, Elapsed 0.2556595802307129
Tournaments processed 2, Elapsed 0.24715399742126465
Tournaments processed 3, Elapsed 0.20712542533874512
Tournaments processed 4, Elapsed 0.20864152908325195
Tournaments processed 5, Elapsed 0.2373206615447998
Tournaments processed 6, Elapsed 1.9063615798950195
Tournaments processed 7, Elapsed 0.2480926513671875
Tournaments processed 8, Elapsed 0.2916078567504883
Tournaments processed 9, Elapsed 0.23090577125549316
Tournaments processed 10, Elapsed 0.22601842880249023
Tournaments processed 11, Elapsed 0.23982024192810059
Tournaments processed 12, Elapsed 0.24078631401062012
Tournaments processed 13, Elapsed 0.25604844093322754
Tournaments processed 14, Elapsed 0.2186870574951172
Tournaments processed 15, Elapsed 0.26115918159484863
Tournaments processed 16, Elapsed 0.21389460563659668
Tournaments processed 17, Elapsed 0.23288393020629883
Tournaments processed 18, Elapsed 0.24731731414794922
Tourna

In [11]:
# Scrape the 2017 tournaments, matches, players and rankings and write them to ./data/*_2017.csv
getDataForYear("2017")

Year 2017, Elapsed 1.1111490726470947
Tournaments processed 1, Elapsed 0.2058241367340088
Tournaments processed 2, Elapsed 0.4523494243621826
Tournaments processed 3, Elapsed 0.2268376350402832
Tournaments processed 4, Elapsed 0.21000957489013672
Tournaments processed 5, Elapsed 0.22619128227233887
Tournaments processed 6, Elapsed 1.6707687377929688
Tournaments processed 7, Elapsed 0.2079932689666748
Tournaments processed 8, Elapsed 0.228302001953125
Tournaments processed 9, Elapsed 0.2393498420715332
Tournaments processed 10, Elapsed 0.20876121520996094
Tournaments processed 11, Elapsed 0.236907958984375
Tournaments processed 12, Elapsed 0.21512460708618164
Tournaments processed 13, Elapsed 0.2507929801940918
Tournaments processed 14, Elapsed 0.23708462715148926
Tournaments processed 15, Elapsed 0.20755839347839355
Tournaments processed 16, Elapsed 0.9693701267242432
Tournaments processed 17, Elapsed 0.20932412147521973
Tournaments processed 18, Elapsed 0.23791027069091797
Tournaments

In [12]:
# Scrape the 2018 tournaments, matches, players and rankings and write them to ./data/*_2018.csv
getDataForYear("2018")

Year 2018, Elapsed 0.7504823207855225
Tournaments processed 1, Elapsed 0.2162644863128662
Tournaments processed 2, Elapsed 0.2238025665283203
Tournaments processed 3, Elapsed 0.24199628829956055
Tournaments processed 4, Elapsed 0.24570083618164062
Tournaments processed 5, Elapsed 0.23999381065368652
Tournaments processed 6, Elapsed 0.2627863883972168
Tournaments processed 7, Elapsed 0.20903587341308594
Tournaments processed 8, Elapsed 0.23115181922912598
Tournaments processed 9, Elapsed 0.23227858543395996
Tournaments processed 10, Elapsed 0.25591039657592773
Tournaments processed 11, Elapsed 0.2123575210571289
Tournaments processed 12, Elapsed 0.2212657928466797
Tournaments processed 13, Elapsed 0.23500514030456543
Tournaments processed 14, Elapsed 0.23789572715759277
Tournaments processed 15, Elapsed 0.23879504203796387
Tournaments processed 16, Elapsed 0.22511076927185059
Tournaments processed 17, Elapsed 0.20586562156677246
Tournaments processed 18, Elapsed 0.2317655086517334
Tourn

In [10]:
# Scrape the 2019 tournaments, matches, players and rankings and write them to ./data/*_2019.csv
getDataForYear("2019")

Year 2019, Elapsed 2.3878793716430664
Tournaments processed 1, Elapsed 0.22399520874023438
Tournaments processed 2, Elapsed 0.21076107025146484
Tournaments processed 3, Elapsed 0.207627534866333
Tournaments processed 4, Elapsed 0.23376917839050293
Tournaments processed 5, Elapsed 0.24065375328063965
Tournaments processed 6, Elapsed 0.2195582389831543
Tournaments processed 7, Elapsed 0.2166426181793213
Tournaments processed 8, Elapsed 0.23970270156860352
Tournaments processed 9, Elapsed 0.2239069938659668
Tournaments processed 10, Elapsed 0.21102452278137207
Tournaments processed 11, Elapsed 0.22275447845458984
Tournaments processed 12, Elapsed 0.2181382179260254
Tournaments processed 13, Elapsed 0.22945237159729004
Tournaments processed 14, Elapsed 0.1970973014831543
Tournaments processed 15, Elapsed 0.2325754165649414
Tournaments processed 16, Elapsed 0.22622036933898926
Tournaments processed 17, Elapsed 0.24321937561035156
Tournaments processed 18, Elapsed 0.24246692657470703
Tournam

# Run Data Download

In [20]:
# Step 1 of the manual run: scrape the tournament listing for start_year,
# show it, and save it to ./data/dfTours_<start_year>.csv
dfTours = getTournamentData(start_year)
print(dfTours)
dfTours.to_csv(path_or_buf="./data/dfTours_" + start_year + ".csv")

Year 2004, Elapsed 0.2774839401245117
              Year TourNo                           Name        Date Conditions Surface       Prize                                      TournamentURL
TournamentID                                                                                                                                          
2004-451      2004      1                           Doha  2004.01.05    Outdoor    Hard  $1,000,000           /en/scores/archive/doha/451/2004/results
2004-7308     2004      2                       Adelaide  2004.01.05    Outdoor    Hard    $380,000      /en/scores/archive/adelaide/7308/2004/results
2004-891      2004      3                        Chennai  2004.01.05    Outdoor    Hard    $380,000        /en/scores/archive/chennai/891/2004/results
2004-301      2004      4                       Auckland  2004.01.12    Outdoor    Hard    $404,000       /en/scores/archive/auckland/301/2004/results
2004-338      2004      5                         Sydney

In [10]:
# Step 2: scrape the matches of every tournament in dfTours,
# show them, and save them to ./data/dfMatches_<start_year>.csv
dfMatches = getMatchData(dfTours)
print(dfMatches)
dfMatches.to_csv(path_or_buf="./data/dfMatches_" + start_year + ".csv")

Tournaments processed 1, Elapsed 1.351186752319336
Tournaments processed 2, Elapsed 1.203507423400879
Tournaments processed 3, Elapsed 0.897174596786499
Tournaments processed 4, Elapsed 1.2089507579803467
Tournaments processed 5, Elapsed 1.163109302520752
Tournaments processed 6, Elapsed 1.4121582508087158
Tournaments processed 7, Elapsed 1.317711591720581
Tournaments processed 8, Elapsed 1.350632667541504
Tournaments processed 9, Elapsed 1.5317957401275635
Tournaments processed 10, Elapsed 1.1874752044677734
Tournaments processed 11, Elapsed 3.3351454734802246
Tournaments processed 12, Elapsed 1.2302637100219727
Tournaments processed 13, Elapsed 1.3520641326904297
Tournaments processed 14, Elapsed 1.3263823986053467
Tournaments processed 15, Elapsed 1.2967503070831299
Tournaments processed 16, Elapsed 1.4118127822875977
Tournaments processed 17, Elapsed 1.303405523300171
Tournaments processed 18, Elapsed 1.435990333557129
Tournaments processed 19, Elapsed 1.691483497619629
Tournaments

In [11]:
# Step 3: collect the distinct winner/loser URLs; steps 4 and 5 both consume this list
playerURLs = getPlayerURLs(dfMatches)

In [12]:
# Step 4: scrape the player statistics and save them to ./data/dfPlayers_<start_year>.csv
dfPlayers = getPlayerData(playerURLs)
dfPlayers.to_csv(path_or_buf="./data/dfPlayers_" + start_year + ".csv")

Loop 1, Start 0, Mid 55.430259227752686, Elapsed 55.430259227752686
Loop 2, Start 50, Mid 57.554909229278564, Elapsed 57.554909229278564
Loop 3, Start 100, Mid 55.859797954559326, Elapsed 55.859797954559326
Loop 4, Start 150, Mid 55.57727241516113, Elapsed 55.57727241516113
Loop 5, Start 200, Mid 54.706568479537964, Elapsed 54.707566022872925
Loop 6, Start 250, Mid 53.294530153274536, Elapsed 53.294530153274536
Loop 7, Start 300, Mid 40.586875677108765, Elapsed 40.586875677108765


In [13]:
# Step 5: scrape the ranking histories and save them to ./data/dfRankings_<start_year>.csv
dfRankings = getRankingData(playerURLs)
dfRankings.to_csv(path_or_buf="./data/dfRankings_" + start_year + ".csv")

Loop 1, Start 0, Elapsed 228.5743372440338
Loop 2, Start 100, Elapsed 128.24384951591492
Loop 3, Start 200, Elapsed 125.87894868850708
Loop 4, Start 300, Elapsed 46.32646441459656


# Load Datasets from File

In [8]:
# Reload previously saved datasets instead of scraping them again.
# NOTE(review): the year is hard-coded to 2002 while start_year is "2004" - confirm which files are intended.
#
# pd.DataFrame.from_csv was removed in pandas 1.0; pd.read_csv with index_col is its replacement.
# The files are written by to_csv without an explicit encoding (UTF-8 by default on Python 3),
# so they are read back as UTF-8 rather than with the Windows-only "mbcs" codec.
# NOTE(review): if the 2002 files were produced with a different encoding, adjust it here.
dfTours = pd.read_csv("./data/dfTours_2002.csv", encoding="utf-8", index_col=0)
dfMatches = pd.read_csv("./data/dfMatches_2002.csv", encoding="utf-8", index_col=[0,1])
#dfPlayers = pd.read_csv("./dfPlayers.csv", encoding="utf-8", index_col=0)
#dfRankings = pd.read_csv("./dfRankings.csv", encoding="utf-8", index_col=[0,1])

#playerURLs = getPlayerURLs(dfMatches)

In [10]:
dfMatches

Unnamed: 0_level_0,Unnamed: 1_level_0,LoserID,LoserName,LoserSeed,LoserURL,MatchNum,MatchURL,WinnerID,WinnerName,WinnerSeed,WinnerURL
TournamentID,MatchID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2002-451,MS001,felix-mantilla:m535,Felix Mantilla,8,/en/players/felix-mantilla/m535,30.0,/en/scores/2002/451/MS001/match-stats?isLive=F...,younes-el-aynaoui:e121,Younes El Aynaoui,6,/en/players/younes-el-aynaoui/e121
2002-451,MS003,bohdan-ulihrach:u032,Bohdan Ulihrach,5,/en/players/bohdan-ulihrach/u032,29.0,/en/scores/2002/451/MS003/match-stats?isLive=F...,felix-mantilla:m535,Felix Mantilla,8,/en/players/felix-mantilla/m535
2002-451,MS002,rainer-schuettler:s636,Rainer Schuettler,7,/en/players/rainer-schuettler/s636,28.0,/en/scores/2002/451/MS002/match-stats?isLive=F...,younes-el-aynaoui:e121,Younes El Aynaoui,6,/en/players/younes-el-aynaoui/e121
2002-451,MS004,yevgeny-kafelnikov:k267,Yevgeny Kafelnikov,1,/en/players/yevgeny-kafelnikov/k267,27.0,/en/scores/2002/451/MS004/match-stats?isLive=F...,rainer-schuettler:s636,Rainer Schuettler,7,/en/players/rainer-schuettler/s636
2002-451,MS005,jiri-novak:n254,Jiri Novak,4,/en/players/jiri-novak/n254,26.0,/en/scores/2002/451/MS005/match-stats?isLive=F...,younes-el-aynaoui:e121,Younes El Aynaoui,6,/en/players/younes-el-aynaoui/e121
2002-451,MS006,fernando-vicente:v195,Fernando Vicente,,/en/players/fernando-vicente/v195,25.0,/en/scores/2002/451/MS006/match-stats?isLive=F...,bohdan-ulihrach:u032,Bohdan Ulihrach,5,/en/players/bohdan-ulihrach/u032
2002-451,MS007,radek-stepanek:s694,Radek Stepanek,Q,/en/players/radek-stepanek/s694,24.0,/en/scores/2002/451/MS007/match-stats?isLive=F...,felix-mantilla:m535,Felix Mantilla,8,/en/players/felix-mantilla/m535
2002-451,MS008,nikolay-davydenko:d402,Nikolay Davydenko,,/en/players/nikolay-davydenko/d402,23.0,/en/scores/2002/451/MS008/match-stats?isLive=F...,yevgeny-kafelnikov:k267,Yevgeny Kafelnikov,1,/en/players/yevgeny-kafelnikov/k267
2002-451,MS015,goran-ivanisevic:i034,Goran Ivanisevic,2,/en/players/goran-ivanisevic/i034,22.0,/en/scores/2002/451/MS015/match-stats?isLive=F...,radek-stepanek:s694,Radek Stepanek,Q,/en/players/radek-stepanek/s694
2002-451,MS010,antony-dupuis:d272,Antony Dupuis,,/en/players/antony-dupuis/d272,21.0,/en/scores/2002/451/MS010/match-stats?isLive=F...,jiri-novak:n254,Jiri Novak,4,/en/players/jiri-novak/n254
