## Creating Individual Player CSV Files From Box Office Data Set 

### Importing required libraries

In [1]:
import pandas as pd
import numpy as np
import os
import time

### Define the path where the dated box office data is stored, as well as the path to write the individual player files to
<b><u> Note: </u></b> The path structure must be as follows. Also note that the 201X individual player directories are empty directories, created so that the individual player files can be written to them by year:
- File_path_to_data_directory
    - Box_Office
        - 2015_Box_Office
        - 2015_Box_Office_Individual_Players
        - 2016_Box_Office
        - 2016_Box_Office_Individual_Players
        - 2017_Box_Office
        - 2017_Box_Office_Individual_Players
        - 2018_Box_Office
        - 2018_Box_Office_Individual_Players
        - 2019_Box_Office
        - 2019_Box_Office_Individual_Players

### Define the following information and run all cells

In [3]:
# Season to process; it selects both the input and the output directory below.
year = 2016
# Root directory that holds every <year>_Box_Office* folder. Edit this single path for your machine.
box_office_root = '/Users/carlylagrotta/Dropbox/Columbia/Fall_2020_Semester/Data_Science/Data_Science_For_Mechanical_Systems/Box_Office/'
# Directory containing the dated box office csv files for `year`.
file_path_to_data_directory = box_office_root+str(year)+'_Box_Office'
# Directory the individual player csv files for `year` are written to.
path_to_write = box_office_root+str(year)+'_Box_Office_Individual_Players'

### Function to extract the date from a file name and turn it into a `datetime.date` object

In [4]:
def get_date_fom_file_string(file_name,year):
    """Extract the game date encoded in a box office file name.

    The file name is split on commas and two layouts are handled:

    * the third field (up to its first ``'.'``) equals ``year``: the month
      name and day are read from the second field;
    * otherwise: the month name and day are read from the third field itself.

    Extra, leading, or trailing whitespace around the month and day is ignored.

    Parameters
    ----------
    file_name : str
        File name with at least three comma-separated fields.
    year : int or str
        Season being processed; it is used as the year of the returned date.

    Returns
    -------
    datetime.date
        The calendar date built from ``year``, the month name, and the day.

    Raises
    ------
    IndexError
        If the name has fewer than three comma-separated fields, or nothing
        follows the month name.
    KeyError
        If no English month name is found in the date field.
    ValueError
        If the day is not an integer or the date does not exist.
    """
    month_dict = {'January':1,
                 'February':2,
                 'March':3,
                 'April':4,
                 'May':5,
                 'June':6,
                 'July':7,
                 'August':8,
                 'September':9,
                 'October':10,
                 'November':11,
                 'December':12}

    fields = file_name.split(',')
    third_field = fields[2].split('.')[0]

    # When the third field is the year, the "<Month> <day>" text is the field before it;
    # otherwise the third field itself carries the month and day.
    if third_field.strip() == str(year).strip():
        date_text = fields[1]
    else:
        date_text = third_field

    # split() with no argument collapses any run of whitespace, so single, double,
    # or missing leading spaces are all parsed the same way.
    tokens = date_text.split()
    month_index = next((i for i, token in enumerate(tokens) if token in month_dict), None)
    if month_index is None:
        raise KeyError('no month name found in file name: '+file_name)

    month = month_dict[tokens[month_index]]
    day = int(tokens[month_index + 1])

    # A single Timestamp is enough to validate the date and obtain a datetime.date.
    return pd.Timestamp(year=int(year), month=month, day=day).date()

### Function to create a list of batters' names for a given season with no repeated names

In [5]:
def get_list_of_player_names(path):
    """Collect every distinct value of the 'Batting' column across a directory of csv files.

    Parameters
    ----------
    path : str
        Directory to scan. Only entries whose name ends in ``.csv`` are read.

    Returns
    -------
    list of str
        The unique 'Batting' values, sorted. Non-string entries (such as the
        NaN floats pandas produces for empty cells) are left out.

    Raises
    ------
    KeyError
        If a csv file has no 'Batting' column.
    """
    unique_names = set()
    for game in os.listdir(path):
        game = str(game)
        if not game.endswith('.csv'):
            continue
        batting_values = pd.read_csv(os.path.join(path, game))['Batting'].to_list()
        # Keep strings only; this drops the float NaN placeholders for empty cells.
        unique_names.update(name for name in batting_values if isinstance(name, str))

    # Set iteration order can differ between interpreter runs; sorting makes the result reproducible.
    return sorted(unique_names)

### Function to loop over csv files for individual game dates in a season and create Pandas data frames that are instead specific to individual players (while still storing the date for a specific performance by the player in the file)

In [6]:
def get_list_of_player_df(path,individual_names,year=2016):
    """Build one data frame per player from the csv files in a directory.

    Every ``.csv`` file in ``path`` is read once. For each file the
    'Unnamed: 0' column is dropped, a 'Date' column holding the date parsed
    from the file name is added, and missing values are replaced with the
    string ``'False'``. For each player, the rows whose 'Batting' value
    contains the player's name (literal substring match, not a regular
    expression) are gathered from all files into a single frame.

    Parameters
    ----------
    path : str
        Directory containing the csv files.
    individual_names : iterable
        Player names to look for; each one is converted with ``str``.
    year : int, optional
        Year passed to ``get_date_fom_file_string`` when parsing file names.

    Returns
    -------
    list of pandas.DataFrame
        One frame (with a fresh 0..n-1 index) per name that matched at least
        one row, in the order of ``individual_names``. Names without any
        matching row are skipped.

    Raises
    ------
    KeyError
        If a csv file lacks the 'Unnamed: 0' or 'Batting' column.
    """
    # Load and prepare every game once up front so files are not re-read for each player.
    game_frames = []
    for game in os.listdir(path):
        game = str(game)
        if not game.endswith('.csv'):
            continue
        game_df = pd.read_csv(os.path.join(path, game))
        game_df = game_df.drop(['Unnamed: 0'], axis=1)
        game_df['Date'] = [get_date_fom_file_string(game,year)]*game_df.shape[0]
        # Filling NaN also guarantees 'Batting' holds strings for the .str match below.
        game_df = game_df.fillna(value='False')
        game_frames.append(game_df)

    list_of_data_frames=[]
    for player_name in individual_names:
        player_name = str(player_name)
        player_rows = []
        for game_df in game_frames:
            # regex=False: characters such as '.', '(' or '+' in a name are matched literally.
            matches = game_df[game_df['Batting'].str.contains(player_name, regex=False)]
            if not matches.empty:
                player_rows.append(matches)

        # pd.concat raises on an empty list, so names that never appear are skipped.
        if player_rows:
            list_of_data_frames.append(pd.concat(player_rows,ignore_index=True))
    return list_of_data_frames

### Function to write the player specific data frames to csv files

In [45]:
def write_list_of_player_df(path_to_write,list_of_data_frames):
    """Write each data frame to its own csv file inside ``path_to_write``.

    The file name is the first value of the frame's 'Batting' column with
    spaces replaced by underscores, followed by ``.csv``. Frames that share
    the same first 'Batting' value overwrite one another.

    Parameters
    ----------
    path_to_write : str
        Output directory; it is created if it does not exist yet.
    list_of_data_frames : iterable of pandas.DataFrame
        Non-empty frames, each with a 'Batting' column of strings.

    Raises
    ------
    IndexError
        If a frame has no rows.
    KeyError
        If a frame has no 'Batting' column.
    """
    os.makedirs(path_to_write, exist_ok=True)
    for df in list_of_data_frames:
        # Positional access: label 0 only exists when the frame has a default index.
        name = df['Batting'].iloc[0]
        name = name.replace(" ", "_")
        df.to_csv(os.path.join(path_to_write, name+'.csv'))

### Calling all functions and running

In [None]:
# Step 1: gather the unique 'Batting' names found in the season's csv files.
individual_names = get_list_of_player_names(path=file_path_to_data_directory)

# Step 2: build one data frame per name.
list_of_data_frames = get_list_of_player_df(
    path=file_path_to_data_directory,
    individual_names=individual_names,
    year=year,
)

# Step 3: write one csv file per data frame into the output directory.
write_list_of_player_df(
    path_to_write=path_to_write,
    list_of_data_frames=list_of_data_frames,
)