This notebook merges the player information in the matches files with the point-by-point (pbp) data in the points files. Only the fields needed by the extraction notebook are written to the resulting csv. For now that means the two player names, the point server, the point winner, the score of each player after the point, and the set number. More can be added in the future if needed (add them to the end of the f_fields list!).

Note that we use names rather than ids because the ids do not seem to be consistent between the pbp datasets and the earlier match aggregate sets. Additionally, not all id fields are populated throughout the pbp datasets, while names are, so we can't even use the ids for pbp consolidation.

In [1]:
# settings to edit before each run
# every year/tournament set is available except 2020 wimbledon (cancelled)
year = "2011"

# the three files share one naming scheme and differ only in the trailing kind
_name_template = "{year}-frenchopen-{kind}.csv"
points_file = _name_template.format(year=year, kind="points")    # input: one row per point
matches_file = _name_template.format(year=year, kind="matches")  # input: one row per match
return_file = _name_template.format(year=year, kind="pbp")       # output: merged result
# surface = "hard" # valid options: "grass","hard","clay"

# columns kept in the output; match_id is the key joining points to matches
f_fields = [
    "P1",
    "P2",
    "match_id",
    "PointServer",
    "PointWinner",
    "P1Score",
    "P2Score",
    "SetNo",
]

In [2]:
import os
import csv

In [3]:
# All involved files live in "New Data/pbp/" next to the working directory,
# which is expected to be the "Code" folder ("...\\Code").
# os.path.dirname/os.path.join are used instead of slicing 4 characters off the
# cwd and hard-coding "\\" so the paths no longer depend on the folder name
# length and also resolve on non-Windows systems.
# The trailing "" makes join() end the folder path with a separator, as before.
path = os.path.join(os.path.dirname(os.getcwd()), "New Data", "pbp", "")
points_path = os.path.join(path, points_file)
matches_path = os.path.join(path, matches_file)
return_path = os.path.join(path, return_file)
# return_path = os.path.join(path, "set5", return_file)

In [4]:
points = [] # each item to be a dict containing info on a single point

# go through points data and keep specified fields
# newline="" is what the csv module asks for so that line breaks inside
# quoted fields are parsed correctly
with open(points_path,'r',newline="") as f:

    csvreader = csv.reader(f)
    fields = next(csvreader) # header row

    # column position of every kept field, looked up once from the header
    # instead of being searched again for every field of every row
    # f_fields[:2] ("P1","P2") are skipped here; they start empty and are
    # filled in later using match_id
    columns = [(field, fields.index(field)) for field in f_fields[2:]]

    for row in csvreader:
        # this block inserted later to extract 5th set specific points
        # will be commented out after use
        # NOTE(review): as written this skips rows whose SetNo is '5' - confirm
        # whether != was intended before re-enabling
        # if row[fields.index("SetNo")]=='5':
        #     continue

        # initialize point dict with empty player names, then copy kept fields
        point = {"P1": "", "P2": ""}
        for field, col in columns:
            point[field] = row[col]
        points.append(point)

In [5]:
# now using match_id fill in "P1","P2"
# The player names are first collected per match_id and then looked up for each
# point. This does not rely on the points and matches files listing matches in
# the same order: a missing or out-of-order match only affects its own points
# instead of leaving every later point without names.
with open(matches_path,'r',newline="") as f:

    csvreader = csv.reader(f)
    fields = next(csvreader) # header row
    id_col = fields.index("match_id")
    p1_col = fields.index("player1")
    p2_col = fields.index("player2")

    players = {} # match_id -> (player1 name, player2 name)
    for row in csvreader:
        try:
            # setdefault keeps the first row seen for a match_id
            players.setdefault(row[id_col], (row[p1_col], row[p2_col]))
        except IndexError:
            # row is too short to hold all three columns
            print("Indexing Error")

unmatched = 0
for point in points:
    names = players.get(point["match_id"])
    if names is None:
        # leave "P1"/"P2" empty, but make the gap visible below
        unmatched += 1
        continue
    point["P1"], point["P2"] = names

if unmatched:
    print(unmatched, "points had a match_id not found in", matches_path)

In [6]:
# no need for match_id anymore
# both removals are guarded so that running this cell a second time does not
# raise once match_id has already been dropped
if "match_id" in f_fields:
    f_fields.remove("match_id")
for point in points:
    point.pop("match_id", None)

# writing points list
# newline="" stops the csv module's own line endings from being translated again
with open(return_path,'w',newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=f_fields)
    writer.writeheader()
    writer.writerows(points)