## Creating Individual Player CSV Files From Savant

### Importing required libraries

In [1]:
import pandas as pd
import numpy as np
import os
import time

### Define the path where the dated box office data is stored, as well as the path the individual player files are written to
<b><u> Note: </u></b> The path structure must be as follows. Also note that the 201X_Savant_Individual_Players directories are empty directories, created so that the individual player files can be written to them by year:
- File_path_to_data_directory
    - Box_Office
        - 2015_Savant
        - 2015_Savant_Individual_Players
        - 2016_Savant
        - 2016_Savant_Individual_Players
        - 2017_Savant
        - 2017_Savant_Individual_Players
        - 2018_Savant
        - 2018_Savant_Individual_Players
        - 2019_Savant
        - 2019_Savant_Individual_Players

In [None]:
# Season to process; it selects both the input and the output directory below.
year = 2016
# Folder that contains the per-season directories.
savant_root = '/Users/carlylagrotta/Dropbox/Columbia/Fall_2020_Semester/Data_Science/Data_Science_For_Mechanical_Systems/Savant'
# Directory the season's game files are read from.
file_path_to_data_directory = '{}/{}_Savant'.format(savant_root, year)
# Directory the individual player files are written to.
path_to_write = '{}/{}_Savant_Individual_Players'.format(savant_root, year)

### Function to get date from file name


In [2]:
def get_date_fom_file_string(file_name):
    """Extract the calendar date encoded in a game file name.

    The date must be the second underscore-separated token of the name,
    written as ``YYYY-MM-DD``, e.g. ``Game_2018-04-01_Box.csv``. A file
    extension directly after the day (``Game_2018-04-01.csv``) is tolerated.

    Parameters
    ----------
    file_name : str
        Name of the game file.

    Returns
    -------
    datetime.date
        The date encoded in the file name.

    Raises
    ------
    IndexError
        If the name has no second ``_`` token, or that token has fewer than
        three ``-`` separated parts.
    ValueError
        If a date part is not an integer or the parts do not form a valid
        date.
    """
    date_token = file_name.split('_')[1]
    parts = date_token.split('-')

    year = int(parts[0])
    month = int(parts[1])
    # When the date is the last token of the name, the day still carries the
    # file extension ("01.csv"); keep only what precedes the first dot.
    day = int(parts[2].split('.')[0])

    # Build the timestamp directly instead of assembling it through a
    # one-row DataFrame.
    return pd.Timestamp(year=year, month=month, day=day).date()

### Function to create a list of batter names for a given season with no repeated names


In [3]:
def get_list_of_player_names(path):
    """Return the unique batter names found in a season's game files.

    Every immediate sub-directory of ``path`` is scanned for ``.csv`` files
    and the ``Batter`` column of each file is collected. Files located
    directly in ``path`` and non-CSV files are ignored.

    Parameters
    ----------
    path : str
        Directory whose sub-directories contain the game CSV files.

    Returns
    -------
    list
        Each distinct, non-missing value of the ``Batter`` column exactly
        once, in sorted order.

    Raises
    ------
    KeyError
        If a CSV file has no ``Batter`` column.
    """
    unique_names = set()
    for directory in os.listdir(path):
        directory_path = os.path.join(path, directory)
        if not os.path.isdir(directory_path):
            continue
        for game in os.listdir(directory_path):
            if not game.endswith('.csv'):
                continue
            df_temp = pd.read_csv(os.path.join(directory_path, game))
            # Empty cells are read as NaN. NaN never compares equal to
            # itself, so it would survive de-duplication (possibly several
            # times) and is not a usable player name: drop it here.
            unique_names.update(df_temp['Batter'].dropna().to_list())

    # Set iteration order varies between runs; sort so the result is
    # reproducible. key=str keeps the sort safe if a value is not a string.
    return sorted(unique_names, key=str)

### Function to loop over csv files for individual game dates in a season and create Pandas data frames that are instead specific to individual players (while still storing the date for a specific performance by the player in the file)

In [4]:
def get_list_of_player_df(path,individual_names):
    """Build one data frame per player from a season's per-game CSV files.

    Every immediate sub-directory of ``path`` is scanned for ``.csv`` files.
    Each file is read once, its ``Unnamed: 0`` column (if present) is
    dropped, and a ``Date`` column holding the date parsed from the file
    name is added. The rows are then grouped by the ``Batter`` column.

    Parameters
    ----------
    path : str
        Directory whose sub-directories contain the game CSV files.
    individual_names : iterable
        Player names to build data frames for. A name is matched against the
        ``Batter`` column by exact equality.

    Returns
    -------
    list of pandas.DataFrame
        One data frame per name in ``individual_names`` that has at least
        one row, in the order of ``individual_names``. Rows keep the order
        in which the files were read and the index is renumbered from 0.
        Names without any row are omitted.

    Raises
    ------
    KeyError
        If a CSV file has no ``Batter`` column.
    """
    # Read every game file exactly once and bucket its rows by batter,
    # instead of re-reading the whole season from disk for each player.
    rows_by_player = {}
    for directory in os.listdir(path):
        directory_path = os.path.join(path, directory)
        if not os.path.isdir(directory_path):
            continue
        for game in os.listdir(directory_path):
            if not game.endswith('.csv'):
                continue
            df_temp = pd.read_csv(os.path.join(directory_path, game))
            # The saved index column carries no information; tolerate files
            # that were written without it.
            df_temp = df_temp.drop(columns=['Unnamed: 0'], errors='ignore')
            df_temp['Date'] = get_date_fom_file_string(game)
            # Grouping matches names exactly. A substring/regex match would
            # also pick up other players whose name merely contains this one
            # and would treat characters such as "." as wildcards.
            for batter, df_batter in df_temp.groupby('Batter', sort=False):
                rows_by_player.setdefault(batter, []).append(df_batter)

    list_of_data_frames = []
    for player_name in individual_names:
        player_df = rows_by_player.get(player_name)
        if not player_df:
            # pd.concat raises on an empty list; a player without rows has
            # nothing to report.
            continue
        list_of_data_frames.append(pd.concat(player_df, ignore_index=True))
    return list_of_data_frames

### Function to write the player specific data frames to csv files

In [5]:
def write_list_of_player_df(path_to_write,list_of_data_frames):
    """Write each player's data frame to its own CSV file.

    The file name is the value of the ``Batter`` column in the first row,
    with spaces replaced by underscores, followed by ``.csv``. An existing
    file with the same name is overwritten.

    Parameters
    ----------
    path_to_write : str
        Existing directory the CSV files are written to.
    list_of_data_frames : iterable of pandas.DataFrame
        Data frames to write; each must have a ``Batter`` column. Empty data
        frames are skipped because they carry no player name.

    Raises
    ------
    KeyError
        If a data frame has no ``Batter`` column.
    """
    for df in list_of_data_frames:
        if df.empty:
            continue
        # Positional access: the first row is used whatever the index labels
        # of the data frame are.
        name = df['Batter'].iloc[0]
        name = name.replace(" ", "_")
        df.to_csv(os.path.join(path_to_write, name + '.csv'))

### Calling all functions and running

In [None]:
# Step 1: collect the unique batter names of the selected season.
individual_names = get_list_of_player_names(path=file_path_to_data_directory)
# Step 2: build one data frame per batter from the season's game files.
list_of_data_frames = get_list_of_player_df(
    path=file_path_to_data_directory,
    individual_names=individual_names,
)
# Step 3: write each batter's data frame to its own CSV file.
write_list_of_player_df(
    path_to_write=path_to_write,
    list_of_data_frames=list_of_data_frames,
)

### Get list of player names from all seasons with no repeats to be used in birthday list

In [11]:
def get_list_of_all_player_names(path):
    """Return the unique batter and pitcher names found in a season's files.

    Every immediate sub-directory of ``path`` is scanned for ``.csv`` files
    and the ``Batter`` and ``Pitcher`` columns of each file are collected.
    Files located directly in ``path`` and non-CSV files are ignored.

    Parameters
    ----------
    path : str
        Directory whose sub-directories contain the game CSV files.

    Returns
    -------
    list
        Each distinct, non-missing value of the ``Batter`` and ``Pitcher``
        columns exactly once, in sorted order.

    Raises
    ------
    KeyError
        If a CSV file has no ``Batter`` or no ``Pitcher`` column.
    """
    unique_names = set()
    for directory in os.listdir(path):
        directory_path = os.path.join(path, directory)
        if not os.path.isdir(directory_path):
            continue
        for game in os.listdir(directory_path):
            if not game.endswith('.csv'):
                continue
            df_temp = pd.read_csv(os.path.join(directory_path, game))
            # Empty cells are read as NaN. NaN never compares equal to
            # itself, so it would survive de-duplication (possibly several
            # times) and is not a usable player name: drop it here.
            for column in ('Batter', 'Pitcher'):
                unique_names.update(df_temp[column].dropna().to_list())

    # Set iteration order varies between runs; sort so the result is
    # reproducible. key=str keeps the sort safe if a value is not a string.
    return sorted(unique_names, key=str)

In [16]:
# Folder that contains one "<year>_Savant" directory per season.
savant_root = '/Users/carlylagrotta/Dropbox/Columbia/Fall_2020_Semester/Data_Science/Data_Science_For_Mechanical_Systems/Savant'

# Collect the batter and pitcher names of every season, 2015 through 2019.
names_by_season = {
    season: get_list_of_all_player_names(savant_root + '/' + str(season) + '_Savant')
    for season in range(2015, 2020)
}
individual_names_2015 = names_by_season[2015]
individual_names_2016 = names_by_season[2016]
individual_names_2017 = names_by_season[2017]
individual_names_2018 = names_by_season[2018]
individual_names_2019 = names_by_season[2019]

# master_list holds a single inner list: all seasons concatenated.
master_list = [
    individual_names_2015
    + individual_names_2016
    + individual_names_2017
    + individual_names_2018
    + individual_names_2019
]
flattened = []
for season_names in master_list:
    flattened.extend(season_names)

# A player who appears in several seasons is kept only once.
individual_names = list(set(flattened))



### Write out list for later use

In [22]:
# Store the combined player names as a one-column table so a later step can
# load them again.
birthday_csv_path = '/Users/carlylagrotta/Dropbox/Columbia/Fall_2020_Semester/Data_Science/Data_Science_For_Mechanical_Systems/Savant/Player_Birthday_CSV.csv'
df_names = pd.DataFrame(data=individual_names)
df_names.to_csv(birthday_csv_path)