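Taking the point-by-point (pbp) CSVs produced by the 'merge and distill' script, extract the data underlying the break-point q's (per-player tallies of points received and points won as receiver) for the break-point states 10, 13, 15 and 18.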

In [1]:
# --- Run configuration: adjust these values for each use ---
# Label for the span of seasons covered; it prefixes the output file name.
year = "2019-2021"
result_name = f"{year}_bpq.csv"

# Point-by-point input files, processed in exactly this order.
# Note that no 2020 Wimbledon file is listed.
pbps = [
    "2019-wimbledon-pbp.csv",
    "2019-usopen-pbp.csv",
    "2019-ausopen-pbp.csv",
    "2019-frenchopen-pbp.csv",
    "2021-wimbledon-pbp.csv",
    "2020-usopen-pbp.csv",
    "2020-ausopen-pbp.csv",
    "2020-frenchopen-pbp.csv",
    "2021-usopen-pbp.csv",
    "2021-ausopen-pbp.csv",
    "2021-frenchopen-pbp.csv",
]
# Alternative when every tournament file is named after `year` itself:
# pbps = [year+"-wimbledon-pbp.csv",year+"-usopen-pbp.csv",year+"-ausopen-pbp.csv",year+"-frenchopen-pbp.csv"]

In [2]:
import os
import csv

In [3]:
# The notebook is expected to run from the "Code" directory, which sits next to
# "Data Lake" under a common parent directory.
path = os.getcwd() # "...\\Code" parallel with "Data Lake"
# Take the parent directory explicitly instead of slicing four characters off
# the path, so the lookup does not depend on the length of the directory name.
project_root = os.path.dirname(path)
# The trailing "" makes join() end the prefix with a path separator, because
# the input file names are appended to it by plain string concatenation.
source_path_pre = os.path.join(project_root, "Data Lake", "Point-By-Point", "")
result_path = os.path.join(project_root, "Data Lake", "Players", result_name)
# Alternative output location used previously:
# result_path = os.path.join(path, "prediction_notebook_6", result_name)

In [4]:
# Score-state labels: "t" marks a tiebreak, followed by the numbered states "1".."18".
states = ["t", *(str(number) for number in range(1, 19))]
# The subset of states for which tallies are actually collected.
bp_states = ['10', '13', '15', '18']
surfaces = ["grass", "hard", "clay"]
tallies = ["receive_total", "receive_won"]

# One output column per (state, surface, tally) combination, with the state
# varying slowest and the tally fastest, e.g. "10_clay_receive_won".
f_fields = [
    f"{state}_{surface}_{tally}"
    for state in states
    for surface in surfaces
    for tally in tallies
]

def init_player_dict(player,ovr_dict):
    """
    Store a fresh record for `player` in `ovr_dict`, keyed by the player's name.

    The record holds the name under "name" and a zero for every column listed
    in `f_fields`. Any record already stored for that player is replaced.
    `ovr_dict` is modified in place; nothing is returned.
    """
    record = {"name": player}
    record.update(dict.fromkeys(f_fields, 0))
    ovr_dict[player] = record

In [5]:
# State a game was in just before its final point, keyed by the
# (server score, receiver score) that stood before that point.
_STATE_BEFORE_GAME_END = {
    ('40', '0'): '7',
    ('40', '15'): '11',
    ('40', '30'): '14',
    ('AD', '40'): '17',
    ('0', '40'): '10',
    ('15', '40'): '13',
    ('30', '40'): '15',
    ('40', 'AD'): '18',
}

# State before a point the server won, keyed by the score after the point.
# Deuce ('40', '40') is absent on purpose: it needs the previous score.
_STATE_BEFORE_SERVER_POINT = {
    ('15', '0'): '1',
    ('30', '0'): '2',
    ('15', '15'): '3',
    ('40', '0'): '4',
    ('30', '15'): '5',
    ('15', '30'): '6',
    ('40', '15'): '8',
    ('30', '30'): '9',
    ('15', '40'): '10',
    ('40', '30'): '12',
    ('30', '40'): '13',
    ('AD', '40'): '16',
}

# State before a point the receiver won, keyed by the score after the point.
# Deuce ('40', '40') is absent on purpose: it needs the previous score.
_STATE_BEFORE_RECEIVER_POINT = {
    ('0', '15'): '1',
    ('15', '15'): '2',
    ('0', '30'): '3',
    ('30', '15'): '4',
    ('15', '30'): '5',
    ('0', '40'): '6',
    ('40', '15'): '7',
    ('30', '30'): '8',
    ('15', '40'): '9',
    ('40', '30'): '11',
    ('30', '40'): '12',
    ('40', 'AD'): '16',
}


def determine_previous_state(s_score, r_score, s_won, prev_s_score, prev_r_score):
    """
    Return the state label of the score that stood before the point just played.

    s_score / r_score are the server's and receiver's scores after the point,
    prev_s_score / prev_r_score the scores before it, and s_won tells whether
    the server won the point. The result is one of the labels '1'..'18', or
    't' when the scores match no regular game score (treated as a tiebreak).
    For the state-score correspondence, see the point to game matrix.
    """
    after_point = (s_score, r_score)

    # A game has just ended when the score is back at 0-0 or either side shows
    # 'GAME' (the 'GAME' marker was added later for the frenchopen data).
    # Only the score before the point identifies the state in that case.
    game_finished = (
        s_score == 'GAME'
        or r_score == 'GAME'
        or after_point == ('0', '0')
    )
    if game_finished:
        # Anything that is not a regular game point means a tiebreak was won.
        return _STATE_BEFORE_GAME_END.get((prev_s_score, prev_r_score), 't')

    if s_won:
        if after_point == ('40', '40'):
            # Deuce is special: the server got here either from 30-40 or by
            # cancelling the receiver's advantage.
            return '15' if prev_s_score == '30' else '18'
        return _STATE_BEFORE_SERVER_POINT.get(after_point, 't')

    if after_point == ('40', '40'):
        # The receiver reached deuce either from 40-30 or by cancelling the
        # server's advantage.
        return '14' if prev_s_score == '40' else '17'
    return _STATE_BEFORE_RECEIVER_POINT.get(after_point, 't')

In [6]:
# Accumulate, for every player, how many points they received and how many of
# those they won in each break-point state, split by surface.
# ovr_dict maps a player name to that player's record (see init_player_dict).
ovr_dict = {}
for pbp in pbps:
    source_path = source_path_pre+pbp
    # newline="" is what the csv module requires of files handed to csv.reader,
    # so that it can interpret line endings inside the file itself.
    with open(source_path,'r',newline="") as f:

        csvreader = csv.reader(f)
        # Locate the needed columns by header name.
        fields = next(csvreader)
        p1_idx = fields.index("P1")
        p2_idx = fields.index("P2")
        surface_idx = fields.index("Surface")
        server_idx = fields.index("PointServer")
        winner_idx = fields.index("PointWinner")
        p1_score_idx = fields.index("P1Score")
        p2_score_idx = fields.index("P2Score")

        # The first data row only seeds the surface and the "previous" scores;
        # it is not tallied itself.
        start = next(csvreader, None)
        if start is None:
            # Header-only file: nothing to tally, move on to the next file.
            continue
        surface = start[surface_idx] # surface same per file
        prev_p1_score = start[p1_score_idx]
        prev_p2_score = start[p2_score_idx]
        prev_p1 = "X Name"
        prev_p2 = "X Name" # change if actually is a player called X Name

        for row in csvreader:
            server = row[server_idx]
            p1 = row[p1_idx]
            p2 = row[p2_idx]
            # due to server=='0' not being able to pick up french open new matches,
            # both players also checked for each row to see if a new match is played
            # server=='0' condition still kept as it helps skip empty rows for other tournaments
            players_changed = prev_p1!=p1 or prev_p2!=p2
            if server=='0' or players_changed:
                for player in (p1, p2):
                    if player not in ovr_dict:
                        init_player_dict(player,ovr_dict)
            if server=='0': # new match
                # The previous scores are deliberately left untouched here.
                prev_p1 = p1
                prev_p2 = p2
                continue

            winner = row[winner_idx]
            p1_score = row[p1_score_idx]
            p2_score = row[p2_score_idx]
            if server in ('1','2') and winner in ('1','2'):
                # Re-express the scores from the server's point of view and
                # pick out the player who received this point.
                if server=='1':
                    receiver = p2
                    s_score, r_score = p1_score, p2_score
                    prev_s_score, prev_r_score = prev_p1_score, prev_p2_score
                else:
                    receiver = p1
                    s_score, r_score = p2_score, p1_score
                    prev_s_score, prev_r_score = prev_p2_score, prev_p1_score
                server_won = winner==server
                state = determine_previous_state(s_score,r_score,server_won,prev_s_score,prev_r_score)
                if state in bp_states:
                    receiver_record = ovr_dict[receiver]
                    field_prefix = state+"_"+surface+"_"
                    receiver_record[field_prefix+"receive_total"] += 1
                    if not server_won:
                        receiver_record[field_prefix+"receive_won"] += 1
            # Remember this row so the next one can be compared against it.
            prev_p1_score = p1_score
            prev_p2_score = p2_score
            prev_p1 = p1
            prev_p2 = p2

In [7]:
# Write one row per player: the name followed by every tally column.
output_columns = ["name"]+f_fields
with open(result_path,'w',newline="") as out_file:
    result_writer = csv.DictWriter(out_file, fieldnames=output_columns)
    result_writer.writeheader()
    # Rows come out in the order the players were first added to ovr_dict.
    result_writer.writerows(ovr_dict.values())