This file processes the raw CSV files downloaded from https://github.com/JeffSackmann.

In [1]:
import os
import csv
from pandas import read_csv

### Initialization

In [2]:
# settings to fill in before running the notebook
# tour prefix shared by every input/output file name, ex. of match file name: "atp_matches_2019.csv"
tour = "atp"
# one match file per season so data from multiple years can be combined
f_name = [f"{tour}_matches_{year}.csv" for year in (2011, 2012, 2013)]
# csv holding the player ids (lives under the results path defined below)
f_id = f"{tour}_players.csv"
# name of csv file to return results in
r_name = f"2011-2013_{tour}_individual.csv"
# fields to extract: the player id followed by five counters for each surface
f_fields = ["player_id"] + [
    f"{surface}_{stat}"
    for surface in ("grass", "hard", "clay", "carpet")
    for stat in ("pts_served", "service_pts_won", "pts_received", "receiving_pts_won", "aces")
]

In [3]:
# build the data directories relative to the current working directory
path = os.getcwd()
# path[:-4] drops the last 4 characters of the working directory
# NOTE(review): presumably this strips a 4-letter folder name to reach its parent - confirm
results_path = f"{path[:-4]}New Data\\{tour}"
# the raw match csv's sit in a "raw" folder directly under the results folder
matches_path = f"{results_path}\\raw"

In [4]:
# the players csv sits directly under the results folder
id_path = f"{results_path}\\{f_id}"
# only the player_id column is read; it becomes a plain python list of ids
ids = read_csv(id_path, usecols=["player_id"])["player_id"].tolist()

In [5]:
# nested dictionaries: the outer dict is keyed by player id, and each inner dict
# holds one entry per field in f_fields, with every counter starting at 0
ovr_dict = {}
for player_id in ids:
    # fromkeys zero-fills every field; player_id is then overwritten with the real id
    ovr_dict[player_id] = {**dict.fromkeys(f_fields, 0), "player_id": player_id}

### Extraction

In [6]:
def _my_int(val):
    """
    convert one raw csv cell to an int
    some of these entries will be missing, so if an empty (or otherwise
    non-numeric) string is encountered, want it just to be 0
    """
    try:
        return int(val)
    except ValueError:
        return 0


def _update_player_stats(match, fields, ovr_dict, id_field, own, opp):
    """
    shared implementation behind update_w_stats and update_l_stats
    match: one csv row (list of strings)
    fields: the csv header row, used to locate columns by name
    ovr_dict: dict of per-player stat dicts keyed by int player id, updated in place
    id_field: header name of the column holding this player's id
    own: column prefix of this player's stats ("w" or "l")
    opp: column prefix of the opponent's stats ("l" or "w")

    rows whose surface has no counters in the player's dict (ex. a blank
    surface cell) are skipped, since the points cannot be assigned to a surface

    raises ValueError if a needed column is missing from fields or the id cell
    is not an integer, and KeyError if the player id is not in ovr_dict
    """

    # indexes of fields needed for statistics, resolved before anything is modified
    needed = ("surface", id_field,
              own+"_ace", own+"_svpt", own+"_1stWon", own+"_2ndWon",
              opp+"_svpt", opp+"_1stWon", opp+"_2ndWon")
    col = {name: fields.index(name) for name in needed}

    surface_str = match[col["surface"]].lower()
    player = ovr_dict[int(match[col[id_field]])]
    if surface_str+"_pts_served" not in player:
        # untracked surface: skip the row instead of crashing with a KeyError
        return

    served = _my_int(match[col[own+"_svpt"]])
    serve_won = _my_int(match[col[own+"_1stWon"]]) + _my_int(match[col[own+"_2ndWon"]])
    received = _my_int(match[col[opp+"_svpt"]])
    # points won on return = opponent's service points minus the ones the opponent won
    receive_won = received - _my_int(match[col[opp+"_1stWon"]]) - _my_int(match[col[opp+"_2ndWon"]])
    aces = _my_int(match[col[own+"_ace"]])

    player[surface_str+"_pts_served"] += served
    player[surface_str+"_service_pts_won"] += serve_won
    player[surface_str+"_pts_received"] += received
    player[surface_str+"_receiving_pts_won"] += receive_won
    player[surface_str+"_aces"] += aces


def update_w_stats(match, fields, ovr_dict):
    """
    helper function for extract_stats below
    extract stats when id is the winner of the match
    match: one csv row (list of strings)
    fields: the csv header row
    ovr_dict: dict of per-player stat dicts, updated in place for the winner
    """
    _update_player_stats(match, fields, ovr_dict, "winner_id", "w", "l")


def update_l_stats(match, fields, ovr_dict):
    """
    helper function for extract_stats below
    extract stats when id is the loser of the match
    match: one csv row (list of strings)
    fields: the csv header row
    ovr_dict: dict of per-player stat dicts, updated in place for the loser
    """
    _update_player_stats(match, fields, ovr_dict, "loser_id", "l", "w")

In [7]:
def extract_stats(file, ovr_dict):
    """
    function for getting info from 1 csv file
    file: string name (path) of csv file
    ovr_dict: the ovr_dict initialized above, updated in place

    every data row is passed to update_w_stats and update_l_stats, so each
    match is counted once for its winner and once for its loser
    a completely empty file and blank rows are skipped instead of raising
    """

    # newline="" is what the csv module expects for files handed to csv.reader
    # utf-8 is read explicitly rather than the locale default; errors="replace" keeps an
    # undecodable byte in a text column from aborting the run
    with open(file, 'r', newline="", encoding="utf-8", errors="replace") as f:
        csvreader = csv.reader(f)
        fields = next(csvreader, None) # get&burn first row which are the fields
        if fields is None:
            # empty file: no header and no matches, nothing to extract
            return
        for match in csvreader:
            if not match:
                # csv.reader yields [] for a blank line
                continue
            update_w_stats(match,fields,ovr_dict)
            update_l_stats(match,fields,ovr_dict)

In [8]:
# accumulate the stats of every listed match file into ovr_dict
for match_file in f_name:
    f_path = f"{matches_path}\\{match_file}"
    extract_stats(f_path, ovr_dict)

# write ovr_dict into a single csv for R analysis: one row per player, columns in f_fields order
r_path = f"{results_path}\\{r_name}"
with open(r_path, "w", newline="") as out_file:
    out_writer = csv.DictWriter(out_file, fieldnames=f_fields)
    out_writer.writeheader()
    out_writer.writerows(ovr_dict.values())