In [2]:
import csv
import datetime
import io
import os
import os.path
import re
import urllib.request

In [3]:
def tomorrow(date):
    """Return the day after `date`, formatted as a 'YYYY-MM-DD' string.

    `date` is a string that parses with the '%Y-%m-%d' format; a ValueError
    from `strptime` propagates when it does not.
    """
    day_format = '%Y-%m-%d'
    next_day = datetime.datetime.strptime(date, day_format) + datetime.timedelta(days=1)
    return next_day.strftime(day_format)

In [7]:
# Team name as it appears on the lineup page -> abbreviation used in the output rows.
_TEAM_ABBREVIATIONS = {
    'Blue Jays': 'TOR', 'Yankees': 'NYY', 'Twins': 'MIN', 'Tigers': 'DET', 'Rockies': "COL",
    "Brewers": 'MIL', "Red Sox": "BOS", "Phillies": "PHI", "Rays": "TB", "Mets": "NYM",
    "Nationals": "WSH", "Braves": "ATL", 'Marlins': "MIA", "White Sox": "CWS", "Royals": 'KC',
    "Angels": "LAA", "Mariners": "SEA", "Pirates": "PIT", "Astros": "HOU", "Reds": "CIN",
    "Padres": "SD", "Dodgers": "LAD", "Indians": "CLE", "Rangers": "TEX", "Athletics": "OAK",
    "Giants": "SF", "Diamondbacks": "ARI", "Cardinals": "STL", "Cubs": "CHC", "Orioles": "BAL",
}

# Number of hitter rows merged under each team.
_LINEUP_SIZE = 9


def _fetch_page_lines(day):
    """Download the lineup page for `day` and return it as a list of text lines."""
    url = 'http://www.baseballpress.com/lineups/' + day
    with urllib.request.urlopen(url) as response:
        web_content = response.read()
    # Decode in memory with the platform default encoding and universal
    # newlines -- the same decoding a text-mode open() of a saved copy would
    # apply -- so no scratch file is created (or left behind on a parse error).
    with io.TextIOWrapper(io.BytesIO(web_content)) as text:
        return text.readlines()


def _mentions_ppd(lines, index):
    """Return True when `lines[index]` exists and contains the 'PPD' marker.

    Out-of-range indices (including negative ones) count as "no marker", so a
    look-around near the start of the page does not wrap to the end of the
    list and one near the end does not raise IndexError.
    """
    return 0 <= index < len(lines) and "PPD" in lines[index]


def _lineup_released(lines, index):
    """Return the original 'No Lineup Released' guard for the lines at `index`."""
    # NOTE(review): this is a list-membership test, so it only matches a line
    # whose entire text is exactly 'No Lineup Released'. Lines read from the
    # page keep their trailing newline, so the check looks like it never
    # fires. A substring test was probably intended; it is left unchanged
    # because switching would change which teams/players are skipped --
    # confirm against the live page before changing it.
    return "No Lineup Released" not in lines[index:index + 4]


def _parse_teams(lines, day):
    """Return one `[game_id, day, team, 'Away'|'Home', opponent]` list per team."""
    teams = []
    for i in range(len(lines) - 1):
        if ("</svg>" in lines[i] and _lineup_released(lines, i)
                and not _mentions_ppd(lines, i + 5) and not _mentions_ppd(lines, i - 5)):
            # The team name sits between <div> and </div> on the next line.
            # Strip it once so every lookup uses the same key.
            team_name = re.split("<div>|</div>", lines[i + 1])[1].strip()
            # Teams alternate on the page: first of each pair is away, second is home.
            side = 'Away' if len(teams) % 2 == 0 else 'Home'
            teams.append([str(day).strip(), _TEAM_ABBREVIATIONS[team_name], side])

    # Add the opposing team and a game id (day + home team + away team) shared
    # by both teams of a game.
    for x, entry in enumerate(teams):
        if entry[-1] == 'Away':
            away_team = entry[1]
            home_team = teams[x + 1][1]  # raises IndexError when the home team is missing
            opponent = home_team
        else:
            home_team = entry[1]
            # The preceding (away) entry already has its id at index 0, so its
            # team abbreviation is at index 2.
            away_team = teams[x - 1][2]
            opponent = away_team
        entry.append(opponent)
        entry.insert(0, day + home_team + away_team)
    return teams


def _parse_pitchers(lines):
    """Return one `['', name, handedness, 'Pitcher', id]` list per pitcher line."""
    pitchers = []
    for i in range(len(lines) - 1):
        if ("col col--min player" in lines[i] and _lineup_released(lines, i)
                and not _mentions_ppd(lines, i - 11) and not _mentions_ppd(lines, i - 12)):
            parts = re.split('<|>|=', lines[i])
            # Fields are read from the end of the split line: the quoted
            # attribute value before the name, the name, and the text after it.
            pitchers.append(["", parts[-5].strip(), parts[-3].strip(), "Pitcher", parts[-6].strip('"')])
    return pitchers


def _parse_hitters(lines):
    """Return one `[order, name, handedness, position, id]` list per hitter."""
    hitters = []
    for i in range(len(lines) - 1):
        if ("col col--min" in lines[i] and _lineup_released(lines, i)
                and not _mentions_ppd(lines, i - 17) and not _mentions_ppd(lines, i - 19)):
            # All hitters of a lineup are on the line after the marker; each
            # player chunk is followed by a chunk holding handedness and position.
            chunks = re.split('</div>|</a>', lines[i + 1])
            for x, chunk in enumerate(chunks):
                if 'desktop-name' in chunk:
                    fields = re.split('</span>|">', chunk)
                    trailing = chunks[x + 1].split(" ")
                    hitters.append([fields[1][0], fields[3], trailing[1], trailing[2], fields[1][-6:]])
                elif '<div class="player">' in chunk:
                    fields = re.split('">', chunk)
                    trailing = chunks[x + 1].split(" ")
                    hitters.append([fields[-2][0], fields[-1], trailing[1], trailing[2], fields[-2][-6:]])
    return hitters


def aggregate_data(day):
    """Return the lineup rows for every game listed on the lineup page for `day`.

    Args:
        day: date string appended verbatim to the lineup URL (the caller uses
            'YYYY-MM-DD').

    Returns:
        A list of rows (lists of strings). For each team there is one pitcher
        row followed by nine hitter rows; every row is
        `[game_id, day, team, 'Away'|'Home', opponent, order, name,
        handedness, position, id]`, where `order` is '' and `position` is
        'Pitcher' on pitcher rows.

    Raises:
        KeyError: a team name on the page is not in the abbreviation table.
        IndexError: the page layout does not match what the parser expects
            (e.g. a team without a pitcher or without nine hitters).
        urllib.error.URLError: the page could not be downloaded.
    """
    file_lines = _fetch_page_lines(day)

    teams_list = _parse_teams(file_lines, day)
    pitcher_list = _parse_pitchers(file_lines)
    hitter_list = _parse_hitters(file_lines)

    data = []
    for x, team in enumerate(teams_list):
        data.append([*team, *pitcher_list[x]])
        # Index explicitly (rather than slice) so a short hitter list raises
        # instead of silently producing an incomplete lineup.
        for y in range(x * _LINEUP_SIZE, (x + 1) * _LINEUP_SIZE):
            data.append([*team, *hitter_list[y]])

    return data


In [8]:
def main(start_date, end_date):
    """Scrape the lineups for every day from `start_date` to `end_date` and save them as CSV.

    Both dates are inclusive 'YYYY-MM-DD' strings. Use zero-padded months and
    days (e.g. 2015-03-04): the start date is used verbatim in the first
    request and both dates are used verbatim in the output file name.

    The rows are written to ./Data/<start_date>_<end_date>_lineups.csv (the
    directory is created when missing). A day whose page cannot be fetched or
    parsed is skipped after printing the error; an error while writing the
    file is printed as well. When `start_date` is after `end_date` no page is
    requested and the file contains only the header row.

    Raises:
        ValueError: a date does not parse with the '%Y-%m-%d' format.
    """
    date_format = '%Y-%m-%d'
    save_path = "./Data"

    # Compare parsed dates rather than strings so a start date after the end
    # date ends the loop immediately instead of never reaching the stop value.
    last_day = datetime.datetime.strptime(str(end_date), date_format)
    cur_day = str(start_date)

    # dictionary of lineup data with the date being the key
    all_data = {}

    # run the function for every day in between the given dates
    while datetime.datetime.strptime(cur_day, date_format) <= last_day:
        try:
            all_data[cur_day] = aggregate_data(cur_day)
        except Exception as e:
            # Best effort: report the failure and carry on with the next day.
            print(str(e))  # for error checking
        cur_day = tomorrow(cur_day)

    try:  # write the information that was stored in the data array
        os.makedirs(save_path, exist_ok=True)
        complete_name = os.path.join(save_path, start_date + '_' + end_date + '_lineups.csv')
        # newline='' lets the csv writer control line endings itself.
        with open(complete_name, 'wt', newline='') as out_file:
            writer = csv.writer(out_file, delimiter=',', lineterminator="\n")
            # Header row
            writer.writerow(["id", "Date", "Team", "Home/Away", "Opposing_Team", "Order", "Name",
                             'Handedness', 'Position', 'mlb_id'])
            for day_rows in all_data.values():
                writer.writerows(day_rows)
    except Exception as e:
        print(str(e))


In [9]:
# Scrape the lineups for 2018-03-29 only: main() treats both dates as inclusive,
# so passing the same day twice requests exactly one page.
main('2018-03-29','2018-03-29')