In the previous common opponent (co) extraction file, the order of operations was to first obtain a list of common opponents for a match-up between A and B, and then extract surface-specific stats for matches between A/B and their common opponents. This file instead follows more closely the approach originally proposed by Knottenbelt et al. in their 2012 paper. Here the surface of the target match-up is introduced earlier, while the common opponent list is being compiled, and when stats are extracted, entries are only used if they were played on that surface. The data is therefore restricted both during list compilation and during extraction, so the aggregated data is based purely on the target surface, and we should expect this file to produce a smaller volume of data.

### Initialization

In [1]:
# ---- settings to adjust before running the file ----

# csv file containing the matches the data is to be collected for
target_name = "2014_wta_testing.csv"

# match files to mine; list several so data from multiple years is combined
# (file names look like "atp_matches_2019.csv"), e.g. also
# "wta_matches_2012.csv" and "wta_matches_2011.csv"
f_names = [
    "wta_matches_2013.csv",
]

# name of the csv file the extracted co data is stored in
co_name = "2013_wta_co2.csv"

# fields to extract: the same set of columns once per player (suffix 1 / 2)
# !NEW! as surface is distinguished from the beginning, each matchup only
# needs stats for its relevant surface
_per_player = ("name", "pts_served", "service_pts_won", "pts_received",
               "receiving_pts_won", "aces")
f_fields = (["surface"]
            + [col + "1" for col in _per_player]
            + [col + "2" for col in _per_player]
            + ["sample_size"])

In [2]:
import os
import csv

In [3]:
# get paths
# the file is expected to run from the "Code" folder, which sits next to
# "Data Lake". The parent folder is taken with os.path.dirname (instead of
# cutting off the last 4 characters) and the parts are joined with os.path.join
# (instead of a hard-coded "\\"), so the paths also resolve outside Windows.
path = os.getcwd() # "...\\Code" parallel with "Data Lake"
data_lake_path = os.path.join(os.path.dirname(path), "Data Lake")
matches_path = os.path.join(data_lake_path, "Matches") # folder containing matches to mine
players_path = os.path.join(data_lake_path, "Players") # folder to store all extracted stats
target_path = os.path.join(data_lake_path, "Experiments") # folder containing testing targets
t_path = os.path.join(target_path, target_name) # target csv to read
co_path = os.path.join(players_path, co_name) # co csv to write

In [4]:
# each match's player stats are stored in a dict, and the dicts in an overall list
# the list follows the order of the matches in the target file
# !NEW! note this time we have to accommodate for surface
ovr_list_co = []

# initializing ovr_list_co with matches from the target csv file
# newline="" leaves newline handling to the csv module, as its documentation
# asks for (otherwise newlines inside quoted fields can be misread)
with open(t_path, 'r', newline="") as t:

    t_csvreader = csv.reader(t)
    t_fields = next(t_csvreader) # getting fields to identify name columns
    p1_idx = t_fields.index("winner_name")
    p2_idx = t_fields.index("loser_name")
    s_idx = t_fields.index("surface")

    for t_match in t_csvreader:
        p1 = t_match[p1_idx]
        p2 = t_match[p2_idx]
        surface = t_match[s_idx]

        # every field starts at 0, then the identifying fields are filled in;
        # name1 is the target match's winner, name2 its loser
        t_dict = dict.fromkeys(f_fields, 0)
        t_dict["name1"] = p1
        t_dict["name2"] = p2
        t_dict["surface"] = surface

        ovr_list_co.append(t_dict)

In [5]:
# helper functions
def my_int(val):
    """
    convert a csv entry to an int, treating missing entries as 0

    some of these entries will be missing, so an empty string, None or any
    other text that is not a number gives 0
    a count written in float form (e.g. "12.0") is converted to its integer
    part instead of being silently turned into 0
    """
    try:
        return int(val)
    except (TypeError, ValueError):
        pass
    try:
        # second attempt for float-formatted counts such as "12.0"
        return int(float(val))
    except (TypeError, ValueError, OverflowError):
        # not a number at all (or nan / inf): count it as missing
        return 0
        
def update_stats_co(list_idx, match, fields, p1, pw):
    """
    mutates ovr_list_co - adds the stats of one match row to its list_idx'th entry

    list_idx - position of the matchup entry in ovr_list_co
    match    - one row of a matches csv
    fields   - header row of that csv, used to locate the stat columns
    p1       - truthy: the "...1" stats (name1 of the entry) are updated,
               otherwise the "...2" stats
    pw       - truthy: the player being updated won the match (their numbers are
               read from the w_* columns), otherwise they lost (l_* columns)

    the entry's sample_size goes up by one on every call
    """

    # locate every needed column first, so a missing column raises before
    # anything has been modified
    col = {name: fields.index(name) for name in (
        "w_ace", "w_svpt", "w_1stWon", "w_2ndWon",
        "l_ace", "l_svpt", "l_1stWon", "l_2ndWon")}

    def count(column):
        # value of one stat column of this match (my_int handles missing entries)
        return my_int(match[col[column]])

    # own = columns of the player being updated, opp = columns of their opponent
    own, opp = ("w_", "l_") if pw else ("l_", "w_")
    # suffix of the entry's fields that belong to the player being updated
    n = "1" if p1 else "2"
    entry = ovr_list_co[list_idx]

    entry["pts_served" + n] += count(own + "svpt")
    entry["service_pts_won" + n] += count(own + "1stWon") + count(own + "2ndWon")
    entry["pts_received" + n] += count(opp + "svpt")
    # receiving points won = points the opponent served minus the ones the opponent won
    entry["receiving_pts_won" + n] += (count(opp + "svpt")
                                       - count(opp + "1stWon")
                                       - count(opp + "2ndWon"))
    entry["aces" + n] += count(own + "ace")
    entry["sample_size"] += 1

In [6]:
# need to first get a list of common opponents for each matchup
# this is done by collecting, per matchup, the opponents each of the two players
# faced on the matchup's surface, and taking the intersection of the two lists
# ovr_p1_oppo[i] / ovr_p2_oppo[i] hold the opponent lists for the i'th matchup
ovr_p1_oppo = [[] for _ in ovr_list_co]
ovr_p2_oppo = [[] for _ in ovr_list_co]

# go through matches and add opponents
for file in f_names:
    f_path = os.path.join(matches_path, file)
    # newline="" leaves newline handling to the csv module, as its docs ask for
    with open(f_path, 'r', newline="") as f:
        csvreader = csv.reader(f)
        fields = next(csvreader)
        winner_idx = fields.index("winner_name")
        loser_idx = fields.index("loser_name")
        s_idx = fields.index("surface")
        for match in csvreader:
            # read the row once instead of once per matchup
            match_surface = match[s_idx]
            winner = match[winner_idx]
            loser = match[loser_idx]
            for i, matchup in enumerate(ovr_list_co):
                # !NEW! only matches on the matchup's surface are considered
                if match_surface != matchup["surface"]:
                    continue
                if winner == matchup["name1"]:
                    ovr_p1_oppo[i].append(loser)
                elif loser == matchup["name1"]:
                    ovr_p1_oppo[i].append(winner)
                if winner == matchup["name2"]:
                    ovr_p2_oppo[i].append(loser)
                elif loser == matchup["name2"]:
                    ovr_p2_oppo[i].append(winner)

# take intersection of each pair of opponent lists for final list of lists
# (sorted so the order of the names is the same on every run)
ovr_co_oppo = [sorted(set(p1_oppo).intersection(p2_oppo))
               for p1_oppo, p2_oppo in zip(ovr_p1_oppo, ovr_p2_oppo)]

In [7]:
# peek at the list of common opponents produced for one matchup,
# chosen by its position k in ovr_list_co
k = 96
matchup = ovr_list_co[k]
print("Common opponents for "+matchup["name1"]+" and "+matchup["name2"]+":")
ovr_co_oppo[k]

Common opponents for Sara Errani and Petra Cetkovska:


['Roberta Vinci']

In [8]:
# second pass over the match files: add up the stats of every match that was
# played on a matchup's surface between one of its players and one of their
# common opponents
for file in f_names:
    f_path = os.path.join(matches_path, file)
    # newline="" leaves newline handling to the csv module, as its docs ask for
    with open(f_path, 'r', newline="") as f:
        csvreader = csv.reader(f)
        fields = next(csvreader)
        winner_idx = fields.index("winner_name")
        loser_idx = fields.index("loser_name")
        s_idx = fields.index("surface")

        for match in csvreader:
            # read the row once instead of once per matchup
            match_surface = match[s_idx]
            winner = match[winner_idx]
            loser = match[loser_idx]
            for i, matchup in enumerate(ovr_list_co):
                if match_surface != matchup["surface"]:
                    continue
                p1 = matchup["name1"]
                p2 = matchup["name2"]
                co_oppo = ovr_co_oppo[i]
                # p1 / pw tell update_stats_co which player of the matchup is
                # updated and whether that player won this match
                if winner == p1 and loser in co_oppo:
                    update_stats_co(i, match, fields, p1=True, pw=True)
                elif loser == p1 and winner in co_oppo:
                    update_stats_co(i, match, fields, p1=True, pw=False)
                elif winner == p2 and loser in co_oppo:
                    update_stats_co(i, match, fields, p1=False, pw=True)
                elif loser == p2 and winner in co_oppo:
                    update_stats_co(i, match, fields, p1=False, pw=False)

In [9]:
# dump ovr_list_co into a single csv for R analysis:
# a header row with f_fields, then one row per matchup in list order
with open(co_path, 'w', newline="") as out_file:
    out_writer = csv.DictWriter(out_file, fieldnames=f_fields)
    out_writer.writeheader()
    for co_row in ovr_list_co:
        out_writer.writerow(co_row)