This file only needs to parse one year of training data for the respective test sets; the three-year versions have already been created for the main study.

In [None]:
import os
import csv
from pandas import read_csv
from copy import deepcopy

In [None]:
# run configuration: which tour and which test year the training data is built for
tour = "wta"
target_year = 2022
prior_year = target_year - 1 # the single year of training data parsed by this file
target_name = f"{tour}_testing_{target_year}.csv" # file containing matches the data is to be collected for
f_name = f"{tour}_matches_{prior_year}.csv"
f_id = f"{tour}_players.csv" # under results path below
# names of the csv files the results are returned in
co_name = f"{prior_year}_{tour}_co.csv"
r_name = f"{prior_year}_{tour}_individual.csv"

# individual stats: the player id followed by the same five counters for every surface
surface_names = ("grass", "hard", "clay", "carpet")
stat_suffixes = ("_pts_served", "_service_pts_won", "_pts_received", "_receiving_pts_won", "_aces")
f_fields = ["player_id"] + [name + suffix for name in surface_names for suffix in stat_suffixes]

# co2 and co2nsf live in the same csv as co, their surfaces being fixed reduces the # of columns they take up
# co2 refers to the surface stored in the "surface" field
co_groups = surface_names + ("co2", "co2nsf")
co_suffixes = stat_suffixes + ("_sample_size",)
# every column appears twice, once per player: 1(2) is the winner(loser) in the testing data
co_fields = ["surface"]
for player_no in ("1", "2"):
    co_fields.append("id" + player_no)
    co_fields.extend(group + suffix + player_no for group in co_groups for suffix in co_suffixes)

In [None]:
# get paths
path = os.getcwd()
# path[:-4] drops the last four characters of the working directory before "New Data" is appended
# NOTE(review): presumably this strips the name of the folder the notebook is run from - confirm
data_root = path[:-4] + "New Data\\"
matches_path = data_root + tour + "\\raw"
results_path = data_root + "misc\\" + tour
target_path = data_root + tour + "\\" + target_name # for co to know matches
id_path = data_root + tour + "\\" + f_id
ids = read_csv(id_path, usecols=["player_id"])["player_id"].tolist()

# dictionary of dictionaries: the outer key is a player id, the inner keys are the
# fields in f_fields; every counter starts at 0 and "player_id" holds the id itself
ovr_dict = {
    player_id: {**dict.fromkeys(f_fields, 0), "player_id": player_id}
    for player_id in ids
}

In [None]:
# Functions
def my_int(val):
    """
    convert one raw csv entry into an int count

    some of these entries will be missing, so if an empty string (or any other
    text that is not a whole number) is encountered, want it just to be 0

    a whole number written in float form (e.g. "5.0") is accepted as well, so
    that such an entry is not silently counted as 0; nan, inf and non-integral
    values such as "2.5" are not valid counts and still give 0

    val: the raw entry, normally a string read by csv.reader
    returns: the entry as an int, or 0 when it cannot be read as a whole number
    """
    try:
        return int(val)
    except ValueError:
        pass
    # int() rejects float spellings, so give float() a chance before giving up
    try:
        as_float = float(val)
    except ValueError:
        return 0
    # is_integer() is False for nan and inf as well as for fractional values
    if as_float.is_integer():
        return int(as_float)
    return 0
            
def update_w_stats(match, fields, ovr_dict):
    """
    helper function for extract_stats below
    adds the winner's numbers from one match to that player's running totals

    match: one csv row (list of strings)
    fields: the csv header row, used to find each column in match
    ovr_dict: dict of per-player dicts keyed by integer player id, mutated in place
    """

    # position of every column of interest, looked up in the header
    # all ten are resolved, so a header lacking any of them raises ValueError here
    wanted = ("winner_id", "surface",
              "w_ace", "w_svpt", "w_1stWon", "w_2ndWon",
              "l_ace", "l_svpt", "l_1stWon", "l_2ndWon")
    col = {name: fields.index(name) for name in wanted}

    def count(name):
        # missing entries count as 0, see my_int
        return my_int(match[col[name]])

    prefix = match[col["surface"]].lower() # totals are kept per surface
    totals = ovr_dict[int(match[col["winner_id"]])]
    totals[prefix + "_pts_served"] += count("w_svpt")
    totals[prefix + "_service_pts_won"] += count("w_1stWon") + count("w_2ndWon")
    # the winner receives every point the loser serves
    totals[prefix + "_pts_received"] += count("l_svpt")
    # and wins the ones the loser did not win on first or second serve
    totals[prefix + "_receiving_pts_won"] += count("l_svpt") - count("l_1stWon") - count("l_2ndWon")
    totals[prefix + "_aces"] += count("w_ace")

        
def update_l_stats(match, fields, ovr_dict):
    """
    helper function for extract_stats below
    adds the loser's numbers from one match to that player's running totals

    match: one csv row (list of strings)
    fields: the csv header row, used to find each column in match
    ovr_dict: dict of per-player dicts keyed by integer player id, mutated in place
    """

    # position of every column of interest, looked up in the header
    # all ten are resolved, so a header lacking any of them raises ValueError here
    wanted = ("loser_id", "surface",
              "w_ace", "w_svpt", "w_1stWon", "w_2ndWon",
              "l_ace", "l_svpt", "l_1stWon", "l_2ndWon")
    col = {name: fields.index(name) for name in wanted}

    def count(name):
        # missing entries count as 0, see my_int
        return my_int(match[col[name]])

    prefix = match[col["surface"]].lower() # totals are kept per surface
    totals = ovr_dict[int(match[col["loser_id"]])]
    totals[prefix + "_pts_served"] += count("l_svpt")
    totals[prefix + "_service_pts_won"] += count("l_1stWon") + count("l_2ndWon")
    # the loser receives every point the winner serves
    totals[prefix + "_pts_received"] += count("w_svpt")
    # and wins the ones the winner did not win on first or second serve
    totals[prefix + "_receiving_pts_won"] += count("w_svpt") - count("w_1stWon") - count("w_2ndWon")
    totals[prefix + "_aces"] += count("l_ace")

def extract_stats(file, ovr_dict):
    """
    function for getting info from 1 csv file
    every match in the file is added to the totals of both its winner and its loser

    file: string name of csv file
    ovr_dict: the ovr_dict initialized above, mutated in place
    """

    # newline="" is what the csv module asks for when reading, so that newlines
    # inside quoted fields are handled by the reader rather than by open()
    with open(file, 'r', newline="") as f:
        csvreader = csv.reader(f)
        fields = next(csvreader) # get&burn first row which are the fields
        for match in csvreader:
            if not match:
                # csv.reader returns [] for a blank line (e.g. at the end of the
                # file); there is no match on it, so skip it instead of crashing
                continue
            update_w_stats(match, fields, ovr_dict)
            update_l_stats(match, fields, ovr_dict)

def update_stats_co(list_idx, match, fields, p1, pw, method="co"):
    """
    mutates ovr_list_co - adds the data in match to the list_idx'th entry

    list_idx: position of the entry in ovr_list_co to update
    match: one csv row (list of strings)
    fields: the csv header row, used to find each column in match
    p1: truthy to update player 1 (id1 in the list entry), falsy to update player 2
    pw: truthy if the player being updated won the match, falsy if they lost it
    method: "co" files the stats under the match's own surface, "co2" under the
            "co2" columns, anything else under the "co2nsf" columns
    """

    # which group of columns the numbers are added to
    if method == "co":
        group = match[fields.index("surface")].lower()
    else:
        group = "co2" if method == "co2" else "co2nsf"

    # position of every statistic column, looked up in the header
    wanted = ("w_ace", "w_svpt", "w_1stWon", "w_2ndWon",
              "l_ace", "l_svpt", "l_1stWon", "l_2ndWon")
    col = {name: fields.index(name) for name in wanted}

    def count(name):
        # missing entries count as 0, see my_int
        return my_int(match[col[name]])

    # the updated player's own columns start with "w" after a win and "l" after a
    # loss; the opponent's columns start with the other letter
    own, opp = ("w", "l") if pw else ("l", "w")
    suffix = "1" if p1 else "2"
    entry = ovr_list_co[list_idx]

    entry[group + "_pts_served" + suffix] += count(own + "_svpt")
    entry[group + "_service_pts_won" + suffix] += count(own + "_1stWon") + count(own + "_2ndWon")
    entry[group + "_pts_received" + suffix] += count(opp + "_svpt")
    entry[group + "_receiving_pts_won" + suffix] += (count(opp + "_svpt")
                                                     - count(opp + "_1stWon")
                                                     - count(opp + "_2ndWon"))
    entry[group + "_aces" + suffix] += count(own + "_ace")
    # one more match counted for this player in this group
    entry[group + "_sample_size" + suffix] += 1

In [None]:
# CO additional work
# ovr_list_co gets one dict per match in the target csv file: every field in
# co_fields starts at 0, then the two player ids and the surface are filled in
ovr_list_co = []
with open(target_path, 'r') as t:
    t_csvreader = csv.reader(t)
    t_fields = next(t_csvreader) # header row, needed to identify the id and surface columns
    p1_idx = t_fields.index("winner_id")
    p2_idx = t_fields.index("loser_id")
    s_idx = t_fields.index("surface")

    for t_match in t_csvreader:
        p1, p2, s = t_match[p1_idx], t_match[p2_idx], t_match[s_idx]
        t_dict_co = dict.fromkeys(co_fields, 0)
        t_dict_co.update(id1=p1, id2=p2, surface=s)
        ovr_list_co.append(t_dict_co)

# need to first get a list of common opponents for each matchup
# this is done by getting 2 lists of played opponents and taking the intersection for each matchup,
# the 2 lists being stored in separate lists of lists (entry i belongs to ovr_list_co[i])

# every inner list is created separately so that no two entries share a list
# the _co2 versions only hold opponents met on the surface of the target match
n_targets = len(ovr_list_co)
ovr_p1_oppo = [[] for _ in range(n_targets)]
ovr_p2_oppo = [[] for _ in range(n_targets)]
ovr_p1_oppo_co2 = [[] for _ in range(n_targets)]
ovr_p2_oppo_co2 = [[] for _ in range(n_targets)]

# go through matches and add opponents
f_path = matches_path+"\\"+f_name
# for each player of a matchup: (id field, all-surface lists, same-surface lists)
trackers = (("id1", ovr_p1_oppo, ovr_p1_oppo_co2),
            ("id2", ovr_p2_oppo, ovr_p2_oppo_co2))
with open(f_path,'r') as f:
    csvreader = csv.reader(f)
    fields = next(csvreader)
    winner_idx = fields.index("winner_id")
    loser_idx = fields.index("loser_id")
    s_idx = fields.index("surface")
    for match in csvreader:
        for i, target in enumerate(ovr_list_co):
            for id_key, oppo_any, oppo_same in trackers:
                # the opponent is whoever is on the other side of the match
                if match[winner_idx] == target[id_key]:
                    rival = match[loser_idx]
                elif match[loser_idx] == target[id_key]:
                    rival = match[winner_idx]
                else:
                    continue # this player did not take part in the match
                oppo_any[i].append(rival)
                if match[s_idx] == target["surface"]:
                    oppo_same[i].append(rival)

# take intersection of each pair of opponent lists for final list of lists
ovr_co_oppo = [list(set(p1_list).intersection(p2_list))
               for p1_list, p2_list in zip(ovr_p1_oppo, ovr_p2_oppo)]
ovr_co2_oppo = [list(set(p1_list).intersection(p2_list))
                for p1_list, p2_list in zip(ovr_p1_oppo_co2, ovr_p2_oppo_co2)]

In [None]:
# second pass over the matches file: add every match against a common opponent
# to the co / co2nsf / co2 columns of the matchups it is relevant for
f_path = matches_path+"\\"+f_name
with open(f_path,'r') as f:
    csvreader = csv.reader(f)
    fields = next(csvreader)
    winner_idx = fields.index("winner_id")
    loser_idx = fields.index("loser_id")
    surface_idx = fields.index("surface")
    # order of the for loops below cannot(!!!) be reversed - csvreader is not a normal list
    for match in csvreader:
        for i in range(len(ovr_list_co)):
            p1 = ovr_list_co[i]["id1"]
            p2 = ovr_list_co[i]["id2"]
            s = ovr_list_co[i]["surface"]
            winner, loser = match[winner_idx], match[loser_idx]

            # work out which player of the matchup (if any) met a common opponent
            # here, whether that player won, and who the opponent was
            if winner == p1 and loser in ovr_co_oppo[i]:
                is_p1, won, rival = True, True, loser
            elif loser == p1 and winner in ovr_co_oppo[i]:
                is_p1, won, rival = True, False, winner
            elif winner == p2 and loser in ovr_co_oppo[i]:
                is_p1, won, rival = False, True, loser
            elif loser == p2 and winner in ovr_co_oppo[i]:
                is_p1, won, rival = False, False, winner
            else:
                continue # match is of no use for this matchup

            # co
            update_stats_co(i, match, fields, p1=is_p1, pw=won)
            if rival in ovr_co2_oppo[i]:
                # co2nsf: opponent is a common opponent on the target surface,
                # the match itself may be on any surface
                update_stats_co(i, match, fields, p1=is_p1, pw=won, method="co2nsf")
                if match[surface_idx] == s:
                    # co2: the match was played on the target surface as well
                    update_stats_co(i, match, fields, p1=is_p1, pw=won, method="co2")

# extract individual stats into ovr_dict
f_path = matches_path+"\\"+f_name
extract_stats(f_path, ovr_dict)

In [None]:
# write the results out as csv files for R analysis
r_path = results_path+"\\"+r_name # individual stats, one row per player in ovr_dict
co_path = results_path+"\\"+co_name # co,co2,co2nsf, one row per entry of ovr_list_co
outputs = ((r_path, f_fields, ovr_dict.values()),
           (co_path, co_fields, ovr_list_co))
for out_path, columns, rows in outputs:
    with open(out_path, 'w', newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)