Extraction of head-to-head (H2H) data targeting the 2022 testing datasets, as well as common-opponent (CO) data. The data is stored in separate files. By construction, for the same matchup, the h2h and co datasets will be disjoint.

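This new version also extracts CO2 and CO2NSF data. CO2 narrows the common opponents to those that both players faced on the surface of the target match, and counts only the matches played on that surface. CO2NSF uses the same narrowed set of opponents but counts the matches against them on any surface.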

### Initialization

In [1]:
# information to specify before running file (just first 2, rest is automated)
tour = "atp"
target_year = 2022
# file containing matches the data is to be collected for
target_name = f"{tour}_testing_{target_year}.csv"
# the three seasons before the target year, most recent first: horizon for h2h, co, co2 data
f_names = [f"{tour}_matches_{target_year - back}.csv" for back in (1, 2, 3)]
# result files share the same "<first year>-<last year>_<tour>" prefix
horizon = f"{target_year - 3}-{target_year - 1}_{tour}"
h2h_name = horizon + "_h2h.csv" # e.g., "2011-2013_wta_h2h.csv"
co_name = horizon + "_co.csv"
# co2 and co2nsf live in the same csv as co; their surface is fixed, so each needs only one set of columns
# "co2" stands for the surface named in the "surface" field
buckets = ("grass", "hard", "clay", "carpet", "co2", "co2nsf")
stats = ("_pts_served", "_service_pts_won", "_pts_received", "_receiving_pts_won", "_aces", "_sample_size")
# fields to extract: the surface, then one id and one full set of stat columns per player
# player "1"("2") is the winner(loser) in the testing data
f_fields = ["surface"]
for player in ("1", "2"):
    f_fields.append("id" + player)
    f_fields.extend(bucket + stat + player for bucket in buckets for stat in stats)

In [2]:
import os
import csv
from copy import deepcopy

In [3]:
# get paths
path = os.getcwd() # "...\\Code" parallel with "New Data"
# "New Data" sits next to the working folder, so step up one level
# os.path picks the right separator for the OS and does not depend on the
# working folder's name being exactly 4 characters long
data_root = os.path.join(os.path.dirname(path), "New Data", tour)
matches_path = os.path.join(data_root, "raw") # folder containing matches to mine
players_path = data_root # folder to store all extracted stats
target_path = data_root # folder containing testing targets
t_path = os.path.join(target_path, target_name)
h2h_path = os.path.join(players_path, h2h_name)
co_path = os.path.join(players_path, co_name)

In [4]:
# each match's player stats are stored in a dict, and the dicts in one overall list
# the list goes by the order of matches in the target file
ovr_list_h2h = []
# deep copies later for co and co2

# initializing ovr_list with matches from the target csv file
# newline="" lets the csv module do its own line handling, as its documentation asks for
with open(t_path, 'r', newline="") as t:

    t_csvreader = csv.reader(t)
    t_fields = next(t_csvreader) # getting fields to identify id columns
    p1_idx = t_fields.index("winner_id")
    p2_idx = t_fields.index("loser_id")
    s_idx = t_fields.index("surface")

    for t_match in t_csvreader:
        # every stat column starts at 0; ids and surface are then filled in from the row
        t_dict_h2h = dict.fromkeys(f_fields, 0)
        t_dict_h2h["id1"] = t_match[p1_idx]
        t_dict_h2h["id2"] = t_match[p2_idx]
        t_dict_h2h["surface"] = t_match[s_idx]
        ovr_list_h2h.append(t_dict_h2h)
# independent copy so that h2h and co totals never share a dict
ovr_list_co = deepcopy(ovr_list_h2h)

### Functions for Extraction

In [5]:
def my_int(val):
    """
    int(val), except that anything int() rejects with a ValueError counts as 0
    (some of the entries are missing and show up as empty strings)
    """
    try:
        number = int(val)
    except ValueError:
        number = 0
    return number
        
def update_stats_h2h(list_idx, match, fields, p1w):
    """
    mutates ovr_list_h2h - adds one head-to-head match to the totals of entry number list_idx
    the totals go into the columns of the match's own surface (lower-cased), for both players
    p1w is truthy when id1 of that entry won the match and falsy when id2 won it
    """
    # column positions of everything that is read from the match row
    needed = ("surface", "w_ace", "w_svpt", "w_1stWon", "w_2ndWon",
              "l_ace", "l_svpt", "l_1stWon", "l_2ndWon")
    col = {name: fields.index(name) for name in needed}
    surface = match[col["surface"]].lower()
    entry = ovr_list_h2h[list_idx]

    # column suffix of the match winner and of the match loser
    winner_sfx, loser_sfx = ("1", "2") if p1w else ("2", "1")
    # each player serves from their own "w_"/"l_" columns and receives against the other's
    for sfx, own, opp in ((winner_sfx, "w", "l"), (loser_sfx, "l", "w")):
        served = my_int(match[col[own+"_svpt"]])
        serve_won = my_int(match[col[own+"_1stWon"]]) + my_int(match[col[own+"_2ndWon"]])
        received = my_int(match[col[opp+"_svpt"]])
        # points the opponent served minus the ones the opponent won on serve
        receive_won = received - my_int(match[col[opp+"_1stWon"]]) - my_int(match[col[opp+"_2ndWon"]])

        entry[surface+"_pts_served"+sfx] += served
        entry[surface+"_service_pts_won"+sfx] += serve_won
        entry[surface+"_pts_received"+sfx] += received
        entry[surface+"_receiving_pts_won"+sfx] += receive_won
        entry[surface+"_aces"+sfx] += my_int(match[col[own+"_ace"]])
        entry[surface+"_sample_size"+sfx] += 1
    
# now we can do something similar to the H2H data extraction above
# but this time the data of the common opponents isn't extracted
# so the update_stats function will be modified
def update_stats_co(list_idx, match, fields, p1, pw, method="co"):
    """
    mutates ovr_list_co - adds one match to the totals of entry number list_idx, for one player only
    (nothing is recorded for the other player in the match)
    p1: truthy to credit id1 of the entry, falsy to credit id2
    pw: truthy if the credited player won this match, falsy if they lost it
    method: "co" files the numbers under the match's own surface (lower-cased),
            "co2" under the "co2" columns, anything else under the "co2nsf" columns
    """
    # which group of columns the numbers go to
    if method == "co":
        bucket = match[fields.index("surface")].lower()
    else:
        bucket = "co2" if method == "co2" else "co2nsf"

    # column positions of everything that is read from the match row
    needed = ("w_ace", "w_svpt", "w_1stWon", "w_2ndWon",
              "l_ace", "l_svpt", "l_1stWon", "l_2ndWon")
    col = {name: fields.index(name) for name in needed}

    # the credited player's numbers sit in the "w_" columns after a win, in the "l_" columns after a loss
    own, opp = ("w", "l") if pw else ("l", "w")
    sfx = "1" if p1 else "2"
    entry = ovr_list_co[list_idx]

    served = my_int(match[col[own+"_svpt"]])
    serve_won = my_int(match[col[own+"_1stWon"]]) + my_int(match[col[own+"_2ndWon"]])
    received = my_int(match[col[opp+"_svpt"]])
    # points the opponent served minus the ones the opponent won on serve
    receive_won = received - my_int(match[col[opp+"_1stWon"]]) - my_int(match[col[opp+"_2ndWon"]])

    entry[bucket+"_pts_served"+sfx] += served
    entry[bucket+"_service_pts_won"+sfx] += serve_won
    entry[bucket+"_pts_received"+sfx] += received
    entry[bucket+"_receiving_pts_won"+sfx] += receive_won
    entry[bucket+"_aces"+sfx] += my_int(match[col[own+"_ace"]])
    entry[bucket+"_sample_size"+sfx] += 1

### Common Opponent Lists

In [6]:
"""
need to first get a list of common opponents for each matchup 
this will be done by getting 2 lists of played opponents and taking the intersection for each matchup, 
corresponding 2 lists to be stored in separate lists 
""" 
# one pass over the match files indexes every player's opponents, so the opponent
# lists of a matchup become dictionary lookups instead of a scan over all matchups
# for every single match
faced = {} # player id -> opponents, in the order the matches are read
faced_on = {} # (player id, surface) -> opponents met on that surface, same order
for file in f_names:
    f_path = os.path.join(matches_path, file)
    # newline="" lets the csv module do its own line handling, as its documentation asks for
    with open(f_path, 'r', newline="") as f:
        csvreader = csv.reader(f)
        fields = next(csvreader)
        winner_idx = fields.index("winner_id")
        loser_idx = fields.index("loser_id")
        s_idx = fields.index("surface")
        for match in csvreader:
            winner, loser, match_surface = match[winner_idx], match[loser_idx], match[s_idx]
            # each player of the match gains the other one as an opponent
            for player, opponent in ((winner, loser), (loser, winner)):
                faced.setdefault(player, []).append(opponent)
                faced_on.setdefault((player, match_surface), []).append(opponent)

# opponent lists per matchup, in the order of ovr_list_co
# every entry is a fresh list, so no two matchups share one
ovr_p1_oppo = [list(faced.get(t["id1"], [])) for t in ovr_list_co]
ovr_p2_oppo = [list(faced.get(t["id2"], [])) for t in ovr_list_co]
# co2: only opponents met on the surface of the target match (exact match of the surface text)
ovr_p1_oppo_co2 = [list(faced_on.get((t["id1"], t["surface"]), [])) for t in ovr_list_co]
ovr_p2_oppo_co2 = [list(faced_on.get((t["id2"], t["surface"]), [])) for t in ovr_list_co]

# take intersection of each pair of opponent lists for final list of lists
ovr_co_oppo = [list(set(a).intersection(b)) for a, b in zip(ovr_p1_oppo, ovr_p2_oppo)]
ovr_co2_oppo = [list(set(a).intersection(b)) for a, b in zip(ovr_p1_oppo_co2, ovr_p2_oppo_co2)]

In [7]:
# spot check: show the surface-restricted (co2) common opponents found for one matchup
k = 2
sample = ovr_list_co[k]
print("Common opponents for {} and {}:".format(sample["id1"], sample["id2"]))
# left as the last expression so the notebook displays the list
ovr_co2_oppo[k]

Common opponents for 106214 and 202358:


[]

### Extraction

In [8]:
# membership is tested for every (match, matchup) pair, so look opponents up in sets
co_sets = [set(oppo) for oppo in ovr_co_oppo]
co2_sets = [set(oppo) for oppo in ovr_co2_oppo]
# player id -> indexes of the target matchups that player is part of
# a match can only touch the matchups of its own two players, so only those are visited
targets_of = {}
for i, target in enumerate(ovr_list_h2h):
    for pid in (target["id1"], target["id2"]):
        targets_of.setdefault(pid, []).append(i)
# (is it p1?, did that player win?) - the common-opponent cases, in the order they are tried
roles = ((True, True), (True, False), (False, True), (False, False))

for file in f_names:
    f_path = os.path.join(matches_path, file)
    # newline="" lets the csv module do its own line handling, as its documentation asks for
    with open(f_path, 'r', newline="") as f:
        csvreader = csv.reader(f)
        fields = next(csvreader)
        winner_idx = fields.index("winner_id")
        loser_idx = fields.index("loser_id")
        surface_idx = fields.index("surface")
        # the match loop has to stay the outer one(!!!) - csvreader can only be read through once
        for match in csvreader:
            winner, loser = match[winner_idx], match[loser_idx]
            involved = set(targets_of.get(winner, [])).union(targets_of.get(loser, []))
            for i in involved:
                p1 = ovr_list_h2h[i]["id1"]
                p2 = ovr_list_h2h[i]["id2"]
                s = ovr_list_h2h[i]["surface"]
                # h2h
                if winner==p1 and loser==p2:
                    update_stats_h2h(i,match,fields,p1w=True)
                    continue
                if loser==p1 and winner==p2:
                    update_stats_h2h(i,match,fields,p1w=False)
                    continue
                # co: the first case that fits is the only one applied
                for is_p1, won in roles:
                    pid = p1 if is_p1 else p2
                    player, opponent = (winner, loser) if won else (loser, winner)
                    if player!=pid or opponent not in co_sets[i]:
                        continue
                    update_stats_co(i,match,fields,p1=is_p1,pw=won)
                    # co2nsf: opponent is common on the target surface, match on any surface
                    if opponent in co2_sets[i]:
                        update_stats_co(i,match,fields,p1=is_p1,pw=won,method="co2nsf")
                        # co2: additionally the match itself was on the target surface
                        if match[surface_idx]==s:
                            update_stats_co(i,match,fields,p1=is_p1,pw=won,method="co2")
                    break

### Write Results

In [9]:
# h2h: header line, then one row per target match, in target-file order
with open(h2h_path, 'w', newline="") as h2h_file:
    h2h_writer = csv.DictWriter(h2h_file, fieldnames=f_fields)
    h2h_writer.writeheader()
    for h2h_row in ovr_list_h2h:
        h2h_writer.writerow(h2h_row)

In [10]:
# co,co2,co2nsf: all three share one file, header line first, then one row per target match
with open(co_path, 'w', newline="") as co_file:
    co_writer = csv.DictWriter(co_file, fieldnames=f_fields)
    co_writer.writeheader()
    for co_row in ovr_list_co:
        co_writer.writerow(co_row)