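This notebook processes raw match CSV files (e.g. `wta_matches_2013.csv`) downloaded from https://github.com/JeffSackmann and aggregates per-player serve, return, and ace totals by surface into a single CSV.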

### Initialization

In [1]:
# --- settings to edit before running the notebook ---

# csv files to read; several years can be listed and their totals are combined
# (ex. of file name: "atp_matches_2019.csv")
f_name = [
    "wta_matches_2013.csv",
    "wta_matches_2012.csv",
    "wta_matches_2011.csv",
]

# name of the csv file the aggregated results are written to
r_name = "2011-2013_wta_individual.csv"

# players to aggregate stats for (list obtained from quick r code in scrap)
players = [
    "Serena Williams", "Samantha Stosur", "Roberta Vinci",
    "Svetlana Kuznetsova", "Maria Sharapova", "Ana Ivanovic",
    "Simona Halep", "Petra Kvitova", "Caroline Garcia",
    "Agnieszka Radwanska", "Na Li", "Dominika Cibulkova",
    "Flavia Pennetta", "Sloane Stephens", "Jelena Jankovic",
    "Angelique Kerber", "Caroline Wozniacki", "Carla Suarez Navarro",
    "Venus Williams", "Victoria Azarenka", "Coco Vandeweghe",
    "Kate Makarova", "Alize Cornet", "Eugenie Bouchard",
    "Timea Bacsinszky", "Elina Svitolina", "Shuai Zhang",
    "Sara Errani", "Petra Cetkovska", "Yanina Wickmayer",
    "Garbine Muguruza", "Andrea Petkovic", "Sabine Lisicki",
    "Lucie Safarova", "Barbora Strycova", "Belinda Bencic",
    "Shuai Peng", "Anastasia Pavlyuchenkova", "Aleksandra Wozniak",
    "Camila Giorgi", "Alisa Kleybanova", "Casey Dellacqua",
    "Kirsten Flipkens", "Varvara Lepchenko", "Shelby Rogers",
    "Heather Watson", "Alison Riske Amritraj", "Karolina Pliskova",
    "Christina Mchale", "Francesca Schiavone", "Klara Koukalova",
    "Annika Beck", "Monica Niculescu", "Jana Cepelova",
    "Pauline Parmentier", "Ajla Tomljanovic", "Kiki Bertens",
    "Zarina Diyas", "Yaroslava Shvedova", "Tereza Smitkova",
    "Kaia Kanepi", "Aleksandra Krunic", "Mirjana Lucic",
]

# fields to extract: "name" followed by the same five counters for each surface,
# in the order grass, hard, clay
f_fields = ["name"] + [
    surface + "_" + counter
    for surface in ("grass", "hard", "clay")
    for counter in (
        "pts_served",
        "service_pts_won",
        "pts_received",
        "receiving_pts_won",
        "aces",
    )
]

In [2]:
# build a dictionary of dictionaries, one inner dictionary per player
# outer keys: player name
# inner keys: every entry of f_fields (name, grass_pts_served, grass_service_pts_won,
#   grass_pts_received, grass_receiving_pts_won, grass_aces, then the same for hard and clay)
# every counter starts at 0; "name" is then set to the player's own name
ovr_dict = {
    player: dict(dict.fromkeys(f_fields, 0), name=player)
    for player in players
}

### Extraction

In [3]:
import os
import csv

In [4]:
# locate the input and output folders relative to the current working directory
path = os.getcwd()
# drop the last 4 characters of the working directory to get the prefix both
# folders share
# NOTE(review): presumably this strips a 4-character folder name at the end of
# the working directory — confirm against the project layout
base_dir = path[:-4]
players_path = base_dir + "Data Lake\\Players"
matches_path = base_dir + "Data Lake\\Matches"

In [5]:
# Maps the value found in the csv "surface" column to the prefix used by the
# per-player stat keys. Matches with any other surface value are not tallied.
_SURFACE_PREFIXES = {"Grass": "grass", "Hard": "hard", "Clay": "clay"}


def _to_int(val):
    """
    convert one raw csv entry to an int
    some of these entries will be missing, so if an empty string (or None) is
    encountered, want it just to be 0
    """
    try:
        return int(val)
    except (TypeError, ValueError):
        return 0


def _add_match_stats(match, fields, name, ovr_dict, own, opp):
    """
    shared implementation behind update_w_stats and update_l_stats
    adds the serve / return / ace counts of one match to ovr_dict[name]

    match: list of raw values for one csv row
    fields: list of column names (the header row), aligned with match
    name: key in ovr_dict of the player being updated
    ovr_dict: dict of per-player stat dicts, updated in place
    own: column prefix of the player being updated ("w" or "l")
    opp: column prefix of that player's opponent ("l" or "w")

    raises ValueError (from list.index) if a needed column is not in fields
    """

    # indexes of fields needed for statistics
    surface = fields.index("surface")
    own_ace = fields.index(own + "_ace")
    own_svpt = fields.index(own + "_svpt")
    own_1stWon = fields.index(own + "_1stWon")
    own_2ndWon = fields.index(own + "_2ndWon")
    opp_svpt = fields.index(opp + "_svpt")
    opp_1stWon = fields.index(opp + "_1stWon")
    opp_2ndWon = fields.index(opp + "_2ndWon")

    # only grass, hard and clay matches are tallied
    prefix = _SURFACE_PREFIXES.get(match[surface])
    if prefix is None:
        return

    # work out every increment before touching ovr_dict so that a malformed
    # row cannot leave a player's totals half-updated
    served = _to_int(match[own_svpt])
    service_won = _to_int(match[own_1stWon]) + _to_int(match[own_2ndWon])
    received = _to_int(match[opp_svpt])
    # points won on return = opponent's service points minus the ones the
    # opponent won on 1st and 2nd serve
    receiving_won = received - _to_int(match[opp_1stWon]) - _to_int(match[opp_2ndWon])
    aces = _to_int(match[own_ace])

    totals = ovr_dict[name]
    totals[prefix + "_pts_served"] += served
    totals[prefix + "_service_pts_won"] += service_won
    totals[prefix + "_pts_received"] += received
    totals[prefix + "_receiving_pts_won"] += receiving_won
    totals[prefix + "_aces"] += aces


def update_w_stats(match, fields, name, ovr_dict):
    """
    helper function for extract_stats below
    extract stats when name is the winner of the match

    the player's own serve numbers come from the w_* columns and the
    opponent's from the l_* columns; ovr_dict[name] is updated in place
    """
    _add_match_stats(match, fields, name, ovr_dict, "w", "l")


def update_l_stats(match, fields, name, ovr_dict):
    """
    helper function for extract_stats below
    extract stats when name is the loser of the match

    the player's own serve numbers come from the l_* columns and the
    opponent's from the w_* columns; ovr_dict[name] is updated in place
    """
    _add_match_stats(match, fields, name, ovr_dict, "l", "w")

In [6]:
def extract_stats(file, ovr_dict):
    """
    function for getting info from 1 csv file
    file: string name (path) of csv file; its first row must be the header
    ovr_dict: the ovr_dict initialized above; updated in place, nothing is returned

    every row whose winner and/or loser is a key of ovr_dict is passed to
    update_w_stats / update_l_stats respectively
    raises ValueError if the header has no "winner_name" or "loser_name" column
    """

    # newline="" lets the csv module do its own newline handling, as its
    # documentation asks for, so quoted fields containing line breaks parse correctly
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used — confirm that matches the downloaded files
    with open(file, 'r', newline="") as f:
        csvreader = csv.reader(f)
        fields = next(csvreader) # get&burn first row which are the fields
        # for now only need names
        winner_name = fields.index("winner_name")
        loser_name = fields.index("loser_name")

        for match in csvreader:
            # determine player(s) the match pertains to; both can be tracked,
            # so these are two independent checks rather than if/elif
            if match[winner_name] in ovr_dict:
                update_w_stats(match, fields, match[winner_name], ovr_dict)
            if match[loser_name] in ovr_dict:
                update_l_stats(match, fields, match[loser_name], ovr_dict)

In [7]:
# accumulate the stats of every configured csv file into ovr_dict
for csv_name in f_name:
    extract_stats(matches_path + "\\" + csv_name, ovr_dict)

# dump ovr_dict (one row per player, columns in f_fields order) into a single
# csv for R analysis
r_path = players_path + "\\" + r_name
with open(r_path, 'w', newline="") as out_file:
    writer = csv.DictWriter(out_file, fieldnames=f_fields)
    writer.writeheader()
    writer.writerows(ovr_dict.values())