## Creating A Master Data Frame For One Season's Worth Of Data 

### Importing required libraries


In [133]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import datetime as dt
from sklearn import datasets
from sklearn.model_selection import train_test_split  
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import copy
from sklearn.linear_model import LogisticRegression
import os
import difflib
import math
import re
import os 

### Define the path where the data is stored
<b><u> Note: </u></b> The path structure must be as follows:
- File_path_to_data_directory
    - MatchUp
        - 2015
        - 2016
        - 2017
        - 2018
        - 2019
    - Box_Office
        - 2015_Box_Office_Individual_Players
        - 2016_Box_Office_Individual_Players
        - 2017_Box_Office_Individual_Players
        - 2018_Box_Office_Individual_Players
        - 2019_Box_Office_Individual_Players
    - Birthday
        - master_birthday_list.csv
    - Master_Data

### Define the following information and run all cells 

In [134]:
# Season to process; it selects the date range scanned below and is embedded
# in the name of the exported CSV.
year = 2019

# Root directory that contains the MatchUp, Box_Office and Birthday folders.
file_path_to_data_directory = os.path.join(
    os.sep,
    'Users',
    'carlylagrotta',
    'Dropbox',
    'Columbia',
    'Fall_2020_Semester',
    'Data_Science',
    'Data_Science_For_Mechanical_Systems',
)

# Full path of the CSV file the combined season table is written to.
file_path_to_write_master_csv_to = os.path.join(
    os.sep,
    'Users',
    'carlylagrotta',
    'Desktop',
    'master_df_{}.csv'.format(year),
)

### Function to check if a game day matchup file exists for a specific day

In [135]:
def check_if_game_day_matchup_file_exists(gameDate,file_path=file_path_to_data_directory):
    """Report whether the matchup CSV for ``gameDate`` is present on disk.

    The file is expected at
    ``<file_path>/MatchUp/<gameDate.year>/PitcherBatterMatchUp_<gameDate.date()>.csv``.

    Parameters
    ----------
    gameDate : object exposing ``.year`` and ``.date()``
        The notebook passes the timestamps produced by ``pd.date_range``.
    file_path : str
        Root data directory.

    Returns
    -------
    bool
        ``True`` when the file exists, ``False`` otherwise.
    """
    csv_name = 'PitcherBatterMatchUp_' + str(gameDate.date()) + '.csv'
    season_folder = os.path.join(file_path, 'MatchUp', str(gameDate.year))
    return os.path.exists(os.path.join(season_folder, csv_name))

### Function to parse a game day matchup file (if it exists) and return a Pandas DataFrame

In [136]:
def game_day_expected_matchup(gameDate,file_path=file_path_to_data_directory):
    """Load the pitcher/batter matchup table for ``gameDate``.

    Reads ``<file_path>/MatchUp/<gameDate.year>/PitcherBatterMatchUp_<gameDate.date()>.csv``
    and removes the ``'Unnamed: 0'`` and ``'TeamMatchUp'`` columns.

    Parameters
    ----------
    gameDate : object exposing ``.year`` and ``.date()``
    file_path : str
        Root data directory.

    Returns
    -------
    pandas.DataFrame
        The matchup rows without the two dropped columns.

    Raises
    ------
    FileNotFoundError
        If the CSV does not exist (call ``check_if_game_day_matchup_file_exists`` first).
    KeyError
        If either dropped column is missing from the file.
    """
    season_folder = os.path.join(file_path, 'MatchUp', str(gameDate.year))
    csv_path = os.path.join(season_folder, 'PitcherBatterMatchUp_' + str(gameDate.date()) + '.csv')
    matchups = pd.read_csv(csv_path)
    return matchups.drop(columns=['Unnamed: 0', 'TeamMatchUp'])

### Function that, given a player name and a directory of per-player CSV files, returns the file-derived names most similar to that player name

In [137]:
def get_closest_name_match(batter_name,path):
    """Return the player names in ``path`` that most resemble ``batter_name``.

    Each ``*.csv`` file in the directory is turned into a candidate name by
    removing the extension and replacing underscores with spaces. Entries that
    are not ``.csv`` files (hidden files, sub-directories, backups) are ignored
    so they can no longer be mangled into bogus candidates.

    Parameters
    ----------
    batter_name : str
        Name to look for.
    path : str
        Directory holding one CSV per player.

    Returns
    -------
    list of str
        The matches from ``difflib.get_close_matches`` (best first, possibly empty).

    Raises
    ------
    OSError
        If ``path`` cannot be listed.
    """
    candidate_names = [
        os.path.splitext(entry)[0].replace('_', ' ')
        for entry in os.listdir(path)
        if entry.endswith('.csv')
    ]
    return difflib.get_close_matches(batter_name, candidate_names)
    

### Function that helps determine whether the two names it was passed refer to the same player but were simply written differently in two different datasets

In [138]:

# Formal first name / short form pairs. Kept as pairs (not a flat dict literal)
# so that one name can have several alternates, e.g. Steven -> Steve AND Stevie;
# in a dict literal the repeated 'Steven' key silently kept only the last value.
_NICKNAME_PAIRS = (
    ('Christopher', 'Chris'),
    ('Nicholas', 'Nick'),
    ('Matthew', 'Matt'),
    ('Giovanny', 'Gio'),
    ('Norichika', 'Nori'),
    ('Jakob', 'Jake'),
    ('Phillip', 'Phil'),
    ('Rafael', 'Raffy'),
    ('Jonathon', 'Jon'),
    ('Steven', 'Steve'),
    ('Steven', 'Stevie'),
    ('Thomas', 'Tommy'),
    ('Daniel', 'Danny'),
    ('Reymond', 'Rey'),
    ('Michael', 'Mike'),
    ('Nathan', 'Nate'),
    ('ByungHo', 'Byung Ho'),
    ('Yulieski', 'Yuli'),
    ('David', 'Dave'),
    ('Peter', 'Pete'),
)


def _build_nickname_lookup(pairs):
    """Return a symmetric ``{first_name: [alternate, ...]}`` lookup built from ``pairs``."""
    lookup = {}
    for formal, short in pairs:
        lookup.setdefault(formal, []).append(short)
        lookup.setdefault(short, []).append(formal)
    return lookup


_NICKNAME_LOOKUP = _build_nickname_lookup(_NICKNAME_PAIRS)


def _name_tokens(raw_name):
    """Normalise a name and split it into words, two ways.

    Dots and a standalone ``Jr`` are removed and hyphens become spaces. The
    first list treats an apostrophe as a word break, the second simply drops it.
    """
    cleaned = raw_name.replace('.', '')
    cleaned = re.sub(r'\bJr\b', '', cleaned)
    cleaned = cleaned.replace('-', ' ')
    apostrophe_as_space = cleaned.replace("'", ' ').strip().split(' ')
    apostrophe_removed = cleaned.replace("'", '').strip().split(' ')
    return apostrophe_as_space, apostrophe_removed


def _same_first_and_last(tokens_a, tokens_b):
    """True when both token lists share the same first and last word."""
    return (tokens_a[-1].strip() == tokens_b[-1].strip()
            and tokens_a[0].strip() == tokens_b[0].strip())


def _same_last_and_initial(tokens_a, tokens_b):
    """True when both token lists share the last word and the first letter of the first word."""
    return (tokens_a[-1].strip() == tokens_b[-1].strip()
            and tokens_a[0].strip()[:1] == tokens_b[0].strip()[:1])


def cleaning_name(closest_name,name):
    """Decide whether ``closest_name`` and ``name`` denote the same player.

    Both names are normalised (dots, ``Jr`` suffix, hyphens, apostrophes) and
    compared on first and last word. If only the last word and the first
    initial agree, the first word of ``name`` is looked up in the nickname
    table and every known alternate is tried.

    Parameters
    ----------
    closest_name : str
        Candidate spelling taken from the other dataset.
    name : str
        Spelling that is being looked up.

    Returns
    -------
    tuple or bool
        ``(True, closest_name)`` when the names are judged equal; the second
        item is always the candidate exactly as it was passed in, so it can be
        used to locate that dataset's file or row. ``False`` otherwise.
    """
    closest_name_original = closest_name

    closest_arr, closest_arr_without_space = _name_tokens(closest_name)
    name_arr, name_arr_without_space = _name_tokens(name)

    # Same first and last word under either apostrophe convention.
    if _same_first_and_last(closest_arr, name_arr):
        return (True, closest_name_original)
    if _same_first_and_last(closest_arr_without_space, name_arr_without_space):
        return (True, closest_name_original)

    # Same surname and initial: accept only if a known nickname bridges the first names.
    if _same_last_and_initial(closest_arr, name_arr):
        target = " ".join(closest_arr)
        tail = " ".join(closest_arr[1:])
        alternates = _NICKNAME_LOOKUP.get(name_arr[0], ())
        if any(alternate + ' ' + tail == target for alternate in alternates):
            return (True, closest_name_original)
        return False

    if _same_last_and_initial(closest_arr_without_space, name_arr_without_space):
        target = " ".join(closest_arr_without_space)
        tail = " ".join(name_arr_without_space[1:])
        alternates = _NICKNAME_LOOKUP.get(name_arr_without_space[0], ())
        if any(alternate + ' ' + tail == target for alternate in alternates):
            return (True, closest_name_original)
        return False

    return False
    
         



### Function to check if a box office file exists for a specific player in the season of a given date

In [139]:
def check_if_box_office_file_exists(batter_name,gameDate,file_path=file_path_to_data_directory):
    """Check whether a box-office CSV exists for ``batter_name`` in the season of ``gameDate``.

    The file is expected at
    ``<file_path>/Box_Office/<year>_Box_Office_Individual_Players/<First_Last>.csv``.
    When it is missing, the most similar file name in that season directory is
    compared with ``batter_name`` through ``cleaning_name``.

    Parameters
    ----------
    batter_name : str
        Player name with spaces between words.
    gameDate : object exposing ``.year``
    file_path : str
        Root data directory.

    Returns
    -------
    bool or tuple
        ``True`` if the exact file exists; ``False`` if the season directory
        is absent or holds no similar name; otherwise the result of
        ``cleaning_name`` for the best guess, i.e. ``(True, name_in_files)``
        on a match or a falsy value.
    """
    season_dir = os.path.join(file_path, 'Box_Office', str(gameDate.year) + '_Box_Office_Individual_Players')
    exact_file = os.path.join(season_dir, batter_name.replace(' ', '_') + '.csv')
    if os.path.exists(exact_file):
        return True

    # The directory is passed as-is. Appending os.sep through os.path.join
    # would discard every earlier component (an absolute segment resets the
    # join), so the search would run against the filesystem root instead.
    if not os.path.isdir(season_dir):
        return False

    closest_guess = get_closest_name_match(batter_name, season_dir)
    if not closest_guess:
        return False
    return cleaning_name(closest_guess[0], batter_name)

### Function to parse a box office file for a specific player (if it exists) and return a Pandas DataFrame for a specific date

In [140]:
def get_csv_file_for_box_office_on_a_date(batter_name,gameDate,file_path=file_path_to_data_directory):
    """Return one player's box-office line for the day of ``gameDate``.

    Reads ``<file_path>/Box_Office/<year>_Box_Office_Individual_Players/<First_Last>.csv``,
    keeps the rows whose ``'Date'`` text contains ``str(gameDate.date())`` and
    removes the columns the notebook does not use.

    Parameters
    ----------
    batter_name : str
        Player name with spaces between words (spaces become underscores in the file name).
    gameDate : object exposing ``.year`` and ``.date()``
    file_path : str
        Root data directory.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        The first matching row as a Series, or an empty DataFrame when the
        player has no row for that date. Callers can test ``.empty`` on either.
    """
    unused_columns = ['Unnamed: 0', 'Batting','R','RBI', 'BB', 'SO',
                      'OBP', 'SLG', 'OPS', 'Pit', 'Str', 'WPA', 'aLI', 'WPA+', 'WPA-', 'RE24',
                      'PO', 'A', 'Details', 'Team', 'Date']

    player_csv = os.path.join(
        file_path,
        'Box_Office',
        str(gameDate.year)+'_Box_Office_Individual_Players',
        batter_name.replace(' ','_')+'.csv',
    )
    season_log = pd.read_csv(player_csv)

    played_that_day = season_log['Date'].str.contains(str(gameDate.date()))
    games = season_log[played_that_day].drop(columns=unused_columns)

    # Only the first row for the date is kept when there are several.
    return games if games.empty else games.iloc[0]
    

### Function that looks up a player's row in the birthday list (birth year and years of experience)

In [141]:
def get_age_and_birthday(batter_name,file_path = file_path_to_data_directory):
    """Look up a player in ``<file_path>/Birthday/master_birthday_list.csv``.

    Rows whose ``'Name'`` contains ``batter_name`` are returned. If there are
    none, the closest name in the list is checked with ``cleaning_name`` and,
    when accepted, its rows are returned instead.

    Names are matched as literal text (not as regular expressions), so dots or
    parentheses in a name cannot change or break the search, and rows with a
    missing name are skipped.

    Parameters
    ----------
    batter_name : str
        Player name to look up.
    file_path : str
        Root data directory.

    Returns
    -------
    pandas.DataFrame or bool
        The matching rows (the notebook reads their ``'Born'`` and ``'Yrs'``
        columns), or ``False`` when the player cannot be found.
    """
    df_birthday = pd.read_csv(os.path.join(file_path, 'Birthday', 'master_birthday_list.csv'))
    listed_names = df_birthday['Name']

    def rows_named(player):
        # regex=False: treat the name literally; na=False: a missing name never matches.
        return df_birthday[listed_names.str.contains(str(player), regex=False, na=False)]

    direct_hit = rows_named(batter_name)
    if not direct_hit.empty:
        return direct_hit

    close = difflib.get_close_matches(batter_name, listed_names.dropna().to_list())
    if close:
        cleaned_result = cleaning_name(close[0], batter_name)
        if isinstance(cleaned_result, tuple) and cleaned_result[0] == True:
            fuzzy_hit = rows_named(cleaned_result[1])
            if not fuzzy_hit.empty:
                return fuzzy_hit

    # Every miss now yields the same explicit value instead of a mix of None and False.
    return False


### Function to determine the length of the longest list inside a nested list 

In [142]:
def length_of_longest_list(lists):
    """Return the number of items in the largest sub-list of ``lists``.

    Raises ``ValueError`` when ``lists`` itself is empty.
    """
    return max(len(sub_list) for sub_list in lists)

### Function to make all the sub lists inside a nested list the same length

In [143]:
def make_all_lists_same_length(longest_list,master_list):
    """Extend every short sub-list of ``master_list`` to exactly ``longest_list`` items.

    A short sub-list is filled by repeating its own items in order and cutting
    the result at the target length, so a one-item list becomes that item
    repeated (the case the notebook relies on). An empty sub-list has nothing
    to repeat and is filled with ``None``. Sub-lists that already have at
    least ``longest_list`` items are left untouched.

    Parameters
    ----------
    longest_list : int
        Target length.
    master_list : list of list
        Modified in place.

    Returns
    -------
    list of list
        The same ``master_list`` object.
    """
    for i, sublist in enumerate(master_list):
        if len(sublist) >= longest_list:
            continue
        if not sublist:
            master_list[i] = [None] * longest_list
            continue
        # Ceiling division: enough whole repetitions to reach the target, then trim.
        repetitions = -(-longest_list // len(sublist))
        master_list[i] = (sublist * repetitions)[:longest_list]
    return master_list

### Function to check the length of all sub lists inside a master list

In [144]:
def checking_len_of_master_list(master_list):
    """Return the largest sub-list length found in ``master_list``.

    Raises ``ValueError`` when ``master_list`` is empty.
    """
    return max([len(entry) for entry in master_list])

### Function to build a temporary data frame for a specific day and player

In [145]:
def bulild_temp_df(master_list_same_length):
    """Turn a list of equally long value lists into a labelled DataFrame.

    The i-th list becomes the i-th column below. Lists beyond the last column
    name, or column names beyond the last list, are ignored.

    Parameters
    ----------
    master_list_same_length : list of list
        One list of values per column, in the column order defined here.

    Returns
    -------
    pandas.DataFrame
    """
    column_names = (
        'Batter',
        'Batter_Handedness',
        'Pitcher_Handedness',
        'Stadium',
        'Batter_Home',
        'Date',
        'Position_In_Lineup',
        'Pitcher',
        'Batting_Average',
        'Number_Of_Plate_Appearances',
        'Batter_Age',
        'Batter_Exp',
        'Pitcher_Age',
        'Pitcher_Exp',
        'Number_Of_Hits',
    )
    columns = {
        label: values
        for label, values in zip(column_names, master_list_same_length)
    }
    return pd.DataFrame(columns)

### Make a list of dates based on the specific season of play

In [146]:
# First and last calendar day scanned for each supported season.
_season_bounds = {
    2019: ("2019-04-13", "2019-10-20"),
    2018: ("2018-03-30", "2018-10-28"),
    2017: ("2017-04-02", "2017-11-01"),
    2016: ("2016-04-03", "2016-11-02"),
    2015: ("2015-04-04", "2015-11-01"),
}

if year in _season_bounds:
    _first_day, _last_day = _season_bounds[year]
    dates = pd.date_range(start=_first_day, end=_last_day)
else:
    # Unsupported season: nothing to iterate over.
    dates = []

### Build a data frame for one season's worth of data 

In [147]:
# Values used when a player is not found in the birthday list
# (the notebook's stand-in for an average player).
DEFAULT_PLAYER_AGE = 30
DEFAULT_PLAYER_EXP = 3


def _box_office_stats(player, game_date):
    """Return ``([BA], [PA], [H])`` for ``player`` on ``game_date``; NaN entries when unavailable."""
    missing = ([np.nan], [np.nan], [np.nan])

    lookup = check_if_box_office_file_exists(player, game_date)
    if isinstance(lookup, tuple):
        # (True, name): the file exists under a differently spelled name.
        if lookup[0] != True:
            return missing
        name_in_files = lookup[1]
    elif lookup == True:
        name_in_files = player
    else:
        return missing

    box_row = get_csv_file_for_box_office_on_a_date(name_in_files, game_date)
    if box_row.empty:
        return missing
    return [box_row['BA']], [box_row['PA']], [box_row['H']]


def _age_and_experience(player_name, season_year):
    """Return ``(age, years_of_experience)`` for ``player_name``.

    Age is ``season_year`` minus the birth year, so it no longer depends on
    the day the notebook happens to be executed. When several birthday rows
    match, the first one is used. Falls back to the default values when the
    player is not listed.
    """
    record = get_age_and_birthday(player_name)
    if not isinstance(record, pd.DataFrame):
        return DEFAULT_PLAYER_AGE, DEFAULT_PLAYER_EXP
    # .iloc[0] instead of int(Series): the latter is deprecated for one row
    # and raises for more than one.
    birth_year = int(record['Born'].iloc[0])
    return season_year - birth_year, record['Yrs'].iloc[0]


# One single-row DataFrame per batter and game day; concatenated in a later cell.
master_df = []
for d in dates:
    print(d)

    if not check_if_game_day_matchup_file_exists(d):
        # No matchup file for this date, so there is nothing to add.
        continue

    for index, row in game_day_expected_matchup(d).iterrows():
        player = row['Batter'].strip()

        # Box-office numbers for the batter on this date.
        Batting_Average, Number_Of_Plate_Appearances, Number_Of_Hits = _box_office_stats(player, d)

        # Age and experience of the batter and of the opposing pitcher.
        batter_age, batter_exp = _age_and_experience(player, d.year)

        pitcher_name = row['PitcherFromTheOtherTeam']
        if isinstance(pitcher_name, str):
            pitcher_age, pitcher_exp = _age_and_experience(pitcher_name, d.year)
        else:
            # The pitcher cell is not text (e.g. missing), so nothing can be looked up.
            pitcher_age, pitcher_exp = np.nan, np.nan

        # Order must match the column order used by bulild_temp_df.
        master_list = [[row['Batter']],
                       [row['BatterHand']],
                       [row['PitcherHand']],
                       [row['Standium']],
                       [row['Home/Away']],
                       [row['Date']],
                       [row['LineUpPosition']],
                       [pitcher_name],
                       Batting_Average,
                       Number_Of_Plate_Appearances,
                       [batter_age],
                       [batter_exp],
                       [pitcher_age],
                       [pitcher_exp],
                       Number_Of_Hits]

        longest = length_of_longest_list(master_list)
        master_list_equal_lengths = make_all_lists_same_length(longest, master_list)
        master_df.append(bulild_temp_df(master_list_equal_lengths))


2019-04-13 00:00:00
2019-04-14 00:00:00
2019-04-15 00:00:00
2019-04-16 00:00:00
2019-04-17 00:00:00
2019-04-18 00:00:00
2019-04-19 00:00:00
2019-04-20 00:00:00
2019-04-21 00:00:00
2019-04-22 00:00:00
2019-04-23 00:00:00
2019-04-24 00:00:00
2019-04-25 00:00:00
2019-04-26 00:00:00
2019-04-27 00:00:00
2019-04-28 00:00:00
2019-04-29 00:00:00
2019-04-30 00:00:00
2019-05-01 00:00:00
2019-05-02 00:00:00
2019-05-03 00:00:00
2019-05-04 00:00:00
2019-05-05 00:00:00
2019-05-06 00:00:00
2019-05-07 00:00:00
2019-05-08 00:00:00
2019-05-09 00:00:00
2019-05-10 00:00:00
2019-05-11 00:00:00
2019-05-12 00:00:00
2019-05-13 00:00:00
2019-05-14 00:00:00
2019-05-15 00:00:00
2019-05-16 00:00:00
2019-05-17 00:00:00
2019-05-18 00:00:00
2019-05-19 00:00:00
2019-05-20 00:00:00
2019-05-21 00:00:00
2019-05-22 00:00:00
2019-05-23 00:00:00
2019-05-24 00:00:00
2019-05-25 00:00:00
2019-05-26 00:00:00
2019-05-27 00:00:00
2019-05-28 00:00:00
2019-05-29 00:00:00
2019-05-30 00:00:00
2019-05-31 00:00:00
2019-06-01 00:00:00


### Concatenate temporary data frames into one big data frame, clean it and export it to a csv file

In [148]:
# Stack the per-batter frames into one table with a fresh 0..n-1 index.
df = pd.concat(master_df, ignore_index=True)

# Work on an independent copy, discard exact duplicate rows and save the result.
df_cleaned = df.copy(deep=True).drop_duplicates()
df_cleaned.to_csv(file_path_to_write_master_csv_to)