In [54]:
# imports

import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
import warnings
pd.options.mode.chained_assignment = None
warnings.simplefilter(action='ignore', category=FutureWarning)

# Reading in the Data

Before we can do any data analysis, we need to be able to actually find and read in data. After doing some research, we found a website that basically had an archive for data for multiple sports (including baseball). Because we couldn't just download the datasets and because we didn't want to manually type out the datapoints, we decided to utilize web scraping to read in the data for us.

The first step was to come up with all the links that we were planning to scrape. Because the first page of the search result contained hyperlinks to all the other pages of the search result, we scraped the first page for all the HTMLs and then created a list of URLs to scrape.

In [29]:
# base link used for each page 
base_link = 'https://www.prosportstransactions.com/baseball/Search/' 
# link to the first page of the search result 
link = 'https://www.prosportstransactions.com/baseball/Search/SearchResults.php?Player=&Team=&BeginDate=2000-01-01&EndDate=2019-11-19&DLChkBx=yes&submit=Search'



In [30]:
r = requests.get(link)
urlText = r.text

In [31]:
soup = BeautifulSoup(urlText, 'html.parser') # reads everything into something more readable than HTML 

In [32]:
all_links  = soup.find_all('a') # finds all links within the page 

In [33]:
# removes first and last 4 links, which are extraneous links
important_links = [x.get('href') for x in all_links[4:-4]] 

# adds base URL to each link to make each link a proper URL link for each page 
final_links = [link]
final_links = final_links+[base_link+x for x in important_links]

In [163]:
len(final_links)

855

final_links contains the final list of all the URLs we're planning to scrape. We can then create a method that will loop through the links and properly scrape the data and concatenate it onto a dataframe. This method is timed and only used once: delays are set so that the website isn’t overloaded with traffic, and once the method runs through once the dataset gets saved as a csv to ensure that the website isn’t abused.

In [34]:
def number_cycles(links, n=3,sleep_time=2):
    '''
    This method takes in a list of links and scrapes the data, neatly organizing it into a dataframe. 
    Parameters: 
        Links: the list of URL links that you want to scrape 
        n: the number of links you want to read in from the list, default 3 
        sleep_time: the time the loop waits before reading in the website, default 2 seconds, TIME SHOULD BE INCREASED
            IF LENGTH OF LIST OF URLS IS LARGE
    Returns: 
        returns the dataframe containing the data scraped from all the links from the inputted list 
    '''
    final_df = pd.DataFrame()
    for i in range(n):
        print('working on page '+str(i))
        sleep(sleep_time)
        test = pd.read_html(links[i],header = 0)[0]
        final_df = pd.concat([final_df, test],ignore_index = True)
    return final_df

In [164]:
df = number_cycles(final_links,len(final_links),sleep_time=0)

working on page 0
working on page 1
working on page 2
working on page 3
working on page 4
working on page 5
working on page 6
working on page 7
working on page 8
working on page 9
working on page 10
working on page 11
working on page 12
working on page 13
working on page 14
working on page 15
working on page 16
working on page 17
working on page 18
working on page 19
working on page 20
working on page 21
working on page 22
working on page 23
working on page 24
working on page 25
working on page 26
working on page 27
working on page 28
working on page 29
working on page 30
working on page 31
working on page 32
working on page 33
working on page 34
working on page 35
working on page 36
working on page 37
working on page 38
working on page 39
working on page 40
working on page 41
working on page 42
working on page 43
working on page 44
working on page 45
working on page 46
working on page 47
working on page 48
working on page 49
working on page 50
working on page 51
working on page 52
wor

working on page 416
working on page 417
working on page 418
working on page 419
working on page 420
working on page 421
working on page 422
working on page 423
working on page 424
working on page 425
working on page 426
working on page 427
working on page 428
working on page 429
working on page 430
working on page 431
working on page 432
working on page 433
working on page 434
working on page 435
working on page 436
working on page 437
working on page 438
working on page 439
working on page 440
working on page 441
working on page 442
working on page 443
working on page 444
working on page 445
working on page 446
working on page 447
working on page 448
working on page 449
working on page 450
working on page 451
working on page 452
working on page 453
working on page 454
working on page 455
working on page 456
working on page 457
working on page 458
working on page 459
working on page 460
working on page 461
working on page 462
working on page 463
working on page 464
working on page 465


working on page 826
working on page 827
working on page 828
working on page 829
working on page 830
working on page 831
working on page 832
working on page 833
working on page 834
working on page 835
working on page 836
working on page 837
working on page 838
working on page 839
working on page 840
working on page 841
working on page 842
working on page 843
working on page 844
working on page 845
working on page 846
working on page 847
working on page 848
working on page 849
working on page 850
working on page 851
working on page 852
working on page 853
working on page 854


In [165]:
df.to_csv('baseball_datasets/webscraped_dataset.csv')

In [217]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Team,Acquired,Relinquished,Notes
0,0,2000-03-23,Rockies,,hn Thomson,placed on 60-day DL recovering from surgery on...
1,1,2000-04-01,Devil Rays,,lson Alvarez,placed on DL
2,2,2000-04-01,Dodgers,,tonio Osuna,placed on 15-day DL
3,3,2000-04-08,Mets,,rryl Hamilton,placed on 15-day DL with sprained left foot
4,4,2000-04-08,Orioles,,ke Timlin,placed on 15-day DL with strained abdominal


Now we have a dataset that we can use! We can begin to analyze and clean it. 

# Understanding and Cleaning the Dataset 

In [245]:
# reading in CSV 
df = pd.read_csv('baseball_datasets/webscraped_dataset.csv')

In [246]:
def dot_remover(x):
    if not (x == x):
        return x 
    else: 
        return x[2:]
    
def dot_cleaner(df):
    df['Acquired'] = df['Acquired'].apply(dot_remover)
    df['Relinquished'] = df['Relinquished'].apply(dot_remover)
    return df

In [247]:
no_dots_df = dot_cleaner(df)

In [248]:
no_dots_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Team,Acquired,Relinquished,Notes
0,0,2000-03-23,Rockies,,John Thomson,placed on 60-day DL recovering from surgery on...
1,1,2000-04-01,Devil Rays,,Wilson Alvarez,placed on DL
2,2,2000-04-01,Dodgers,,Antonio Osuna,placed on 15-day DL
3,3,2000-04-08,Mets,,Darryl Hamilton,placed on 15-day DL with sprained left foot
4,4,2000-04-08,Orioles,,Mike Timlin,placed on 15-day DL with strained abdominal


In [249]:
no_dots_df['Date'] = pd.to_datetime(no_dots_df['Date'])

In [250]:
no_dots_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Team,Acquired,Relinquished,Notes
0,0,2000-03-23,Rockies,,John Thomson,placed on 60-day DL recovering from surgery on...
1,1,2000-04-01,Devil Rays,,Wilson Alvarez,placed on DL
2,2,2000-04-01,Dodgers,,Antonio Osuna,placed on 15-day DL
3,3,2000-04-08,Mets,,Darryl Hamilton,placed on 15-day DL with sprained left foot
4,4,2000-04-08,Orioles,,Mike Timlin,placed on 15-day DL with strained abdominal


In [251]:
no_dots_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Team,Acquired,Relinquished,Notes
0,0,2000-03-23,Rockies,,John Thomson,placed on 60-day DL recovering from surgery on...
1,1,2000-04-01,Devil Rays,,Wilson Alvarez,placed on DL
2,2,2000-04-01,Dodgers,,Antonio Osuna,placed on 15-day DL
3,3,2000-04-08,Mets,,Darryl Hamilton,placed on 15-day DL with sprained left foot
4,4,2000-04-08,Orioles,,Mike Timlin,placed on 15-day DL with strained abdominal


In [252]:
no_dots_df_copy = no_dots_df.copy()

In [253]:
no_dots_df_copy.head()

Unnamed: 0.1,Unnamed: 0,Date,Team,Acquired,Relinquished,Notes
0,0,2000-03-23,Rockies,,John Thomson,placed on 60-day DL recovering from surgery on...
1,1,2000-04-01,Devil Rays,,Wilson Alvarez,placed on DL
2,2,2000-04-01,Dodgers,,Antonio Osuna,placed on 15-day DL
3,3,2000-04-08,Mets,,Darryl Hamilton,placed on 15-day DL with sprained left foot
4,4,2000-04-08,Orioles,,Mike Timlin,placed on 15-day DL with strained abdominal


In [254]:
# you need to drop columns with NaN in acquired

In [255]:
import numpy as np
abcd =  no_dots_df_copy.replace(np.nan, '', regex=True)
no_dots_df_copy['person'] = abcd['Acquired']+abcd['Relinquished']

In [256]:
no_dots_df_copy.head()

Unnamed: 0.1,Unnamed: 0,Date,Team,Acquired,Relinquished,Notes,person
0,0,2000-03-23,Rockies,,John Thomson,placed on 60-day DL recovering from surgery on...,John Thomson
1,1,2000-04-01,Devil Rays,,Wilson Alvarez,placed on DL,Wilson Alvarez
2,2,2000-04-01,Dodgers,,Antonio Osuna,placed on 15-day DL,Antonio Osuna
3,3,2000-04-08,Mets,,Darryl Hamilton,placed on 15-day DL with sprained left foot,Darryl Hamilton
4,4,2000-04-08,Orioles,,Mike Timlin,placed on 15-day DL with strained abdominal,Mike Timlin


In [257]:
test = no_dots_df_copy.sort_values(['person','Date'])
test.head(10)

Unnamed: 0.1,Unnamed: 0,Date,Team,Acquired,Relinquished,Notes,person
3537,3537,2005-05-10,Astros,,,placed on 15-day DL with right shoulder injury,
5583,5583,2007-07-04,Mets,,,transferred to 60-day DL with fractured right ...,
5978,5978,2007-09-21,Royals,,,transferred to 60-day DL,
15776,15776,2016-06-30,Mariners,,,placed on 15-day DL with fractured right hand,
17216,17216,2017-06-23,Padres,,,activated from 10-day DL,
17144,17144,2017-06-15,Red Sox,,(Christopher) Brian Johnson,placed on 10-day DL with left shoulder impinge...,(Christopher) Brian Johnson
17335,17335,2017-07-06,Red Sox,(Christopher) Brian Johnson,,activated from 10-day DL,(Christopher) Brian Johnson
18974,18974,2018-07-08,Red Sox,,(Christopher) Brian Johnson,placed on 10-day DL with left hip inflammation,(Christopher) Brian Johnson
19047,19047,2018-07-15,Red Sox,(Christopher) Brian Johnson,,activated from 10-day DL,(Christopher) Brian Johnson
19892,19892,2019-04-06,Red Sox,,(Christopher) Brian Johnson,placed on 10-day IL with left elbow inflammation,(Christopher) Brian Johnson


In [258]:
# test['Notes'].unique()

In [259]:
is_prev_relinquished = test['Relinquished'].shift(1)==test['Acquired']
test['previous_is_relinquished'] = is_prev_relinquished

In [260]:
def get_day_num(x):
    return x.days
test['known_injury_duration'] = test['Date'].diff().fillna(0).apply(get_day_num)*test['previous_is_relinquished']

In [261]:
def date_negator(x):
    if x == 0:
        return np.nan
    return x
test['known_injury_duration'] = test['known_injury_duration'].apply(date_negator)
test.head()

Unnamed: 0.1,Unnamed: 0,Date,Team,Acquired,Relinquished,Notes,person,previous_is_relinquished,known_injury_duration
3537,3537,2005-05-10,Astros,,,placed on 15-day DL with right shoulder injury,,False,
5583,5583,2007-07-04,Mets,,,transferred to 60-day DL with fractured right ...,,False,
5978,5978,2007-09-21,Royals,,,transferred to 60-day DL,,False,
15776,15776,2016-06-30,Mariners,,,placed on 15-day DL with fractured right hand,,False,
17216,17216,2017-06-23,Padres,,,activated from 10-day DL,,False,


In [262]:
test['Date'].max()

Timestamp('2019-11-04 00:00:00')

In [263]:
def contains_scott(x):
    if 'giambi' in x.lower():
        return True
    return False

In [264]:
test[test['person'].apply(contains_scott)]

Unnamed: 0.1,Unnamed: 0,Date,Team,Acquired,Relinquished,Notes,person,previous_is_relinquished,known_injury_duration
2681,2681,2004-05-22,Yankees,,Jason Giambi,placed on 15-day DL with sprained right ankle,Jason Giambi,False,
2755,2755,2004-06-06,Yankees,Jason Giambi,,activated from 15-day DL,Jason Giambi,True,15.0
3026,3026,2004-07-30,Yankees,,Jason Giambi,placed on 15-day DL with unspecified benign tumor,Jason Giambi,False,
3237,3237,2004-09-14,Yankees,Jason Giambi,,activated from 15-day DL,Jason Giambi,True,46.0
5402,5402,2007-06-01,Yankees,,Jason Giambi,placed on 15-day DL with foot injury (plantar ...,Jason Giambi,False,
5749,5749,2007-08-07,Yankees,Jason Giambi,,activated from 15-day DL,Jason Giambi,True,67.0
7840,7840,2009-07-20,Athletics,,Jason Giambi,placed on 15-day DL with strained right quadricep,Jason Giambi,False,
7933,7933,2009-08-07,Athletics,Jason Giambi,,activated from 15-day DL,Jason Giambi,True,18.0
10023,10023,2011-07-26,Rockies,,Jason Giambi,placed on 15-day DL with strained left quadriceps,Jason Giambi,False,
10120,10120,2011-08-12,Rockies,Jason Giambi,,activated from 15-day DL,Jason Giambi,True,17.0


In [266]:
test.head(10)

Unnamed: 0.1,Unnamed: 0,Date,Team,Acquired,Relinquished,Notes,person,previous_is_relinquished,known_injury_duration
3537,3537,2005-05-10,Astros,,,placed on 15-day DL with right shoulder injury,,False,
5583,5583,2007-07-04,Mets,,,transferred to 60-day DL with fractured right ...,,False,
5978,5978,2007-09-21,Royals,,,transferred to 60-day DL,,False,
15776,15776,2016-06-30,Mariners,,,placed on 15-day DL with fractured right hand,,False,
17216,17216,2017-06-23,Padres,,,activated from 10-day DL,,False,
17144,17144,2017-06-15,Red Sox,,(Christopher) Brian Johnson,placed on 10-day DL with left shoulder impinge...,(Christopher) Brian Johnson,False,
17335,17335,2017-07-06,Red Sox,(Christopher) Brian Johnson,,activated from 10-day DL,(Christopher) Brian Johnson,True,21.0
18974,18974,2018-07-08,Red Sox,,(Christopher) Brian Johnson,placed on 10-day DL with left hip inflammation,(Christopher) Brian Johnson,False,
19047,19047,2018-07-15,Red Sox,(Christopher) Brian Johnson,,activated from 10-day DL,(Christopher) Brian Johnson,True,7.0
19892,19892,2019-04-06,Red Sox,,(Christopher) Brian Johnson,placed on 10-day IL with left elbow inflammation,(Christopher) Brian Johnson,False,


In [269]:
tbm = pd.read_csv('baseball_datasets/Player DataFrame')

In [279]:
test['known_injury_duration']

3537       NaN
5583       NaN
5978       NaN
15776      NaN
17216      NaN
17144      NaN
17335     21.0
18974      NaN
19047      7.0
19892      NaN
20405     69.0
20526      NaN
20799     35.0
13361      NaN
13456     15.0
14489      NaN
14677      NaN
14940      NaN
15152     63.0
14767      NaN
14979     33.0
17153      NaN
17435     36.0
18883      NaN
19045     15.0
19804      NaN
19960     21.0
20197      NaN
20384      NaN
21045     80.0
         ...  
15414      NaN
16421    191.0
17190      NaN
17292     10.0
17445      NaN
17919      NaN
18016     41.0
20623      NaN
20721     11.0
12320      NaN
12646     51.0
17217      NaN
17178      NaN
19442      NaN
18879      NaN
21006      NaN
21092      NaN
21055      NaN
21189      NaN
21212      NaN
21056      NaN
21093      NaN
21067      NaN
21099      NaN
21068      NaN
4147       NaN
21050      NaN
16909      NaN
16906      NaN
21070      NaN
Name: known_injury_duration, Length: 21367, dtype: float64

In [275]:
pd.merge(test,tbm,how='inner',left_on='person',right_on='nameGiven')

Unnamed: 0,Unnamed: 0_x,Date,Team,Acquired,Relinquished,Notes,person,previous_is_relinquished,known_injury_duration,Unnamed: 0_y,...,R,H,2B,3B,HR,RBI,BB,SO,BA,nameGiven
0,16854,2017-05-09,Royals,,Scott Alexander,placed on 10-day DL with strained right hamstring,Scott Alexander,False,,2655,...,39,84,15,6,13,29.0,21,88.0,0.247788,Scott Alexander
1,16854,2017-05-09,Royals,,Scott Alexander,placed on 10-day DL with strained right hamstring,Scott Alexander,False,,2656,...,42,81,18,3,17,31.0,28,84.0,0.248466,Scott Alexander
2,16854,2017-05-09,Royals,,Scott Alexander,placed on 10-day DL with strained right hamstring,Scott Alexander,False,,2657,...,34,62,10,0,10,36.0,31,69.0,0.210169,Scott Alexander
3,16854,2017-05-09,Royals,,Scott Alexander,placed on 10-day DL with strained right hamstring,Scott Alexander,False,,2658,...,52,99,25,3,20,57.0,19,83.0,0.262599,Scott Alexander
4,17086,2017-06-07,Royals,Scott Alexander,,activated from 10-day DL,Scott Alexander,True,29.0,2655,...,39,84,15,6,13,29.0,21,88.0,0.247788,Scott Alexander
5,17086,2017-06-07,Royals,Scott Alexander,,activated from 10-day DL,Scott Alexander,True,29.0,2656,...,42,81,18,3,17,31.0,28,84.0,0.248466,Scott Alexander
6,17086,2017-06-07,Royals,Scott Alexander,,activated from 10-day DL,Scott Alexander,True,29.0,2657,...,34,62,10,0,10,36.0,31,69.0,0.210169,Scott Alexander
7,17086,2017-06-07,Royals,Scott Alexander,,activated from 10-day DL,Scott Alexander,True,29.0,2658,...,52,99,25,3,20,57.0,19,83.0,0.262599,Scott Alexander
8,20375,2019-06-11,Dodgers,,Scott Alexander,placed on 10-day IL with left forearm inflamma...,Scott Alexander,False,,2655,...,39,84,15,6,13,29.0,21,88.0,0.247788,Scott Alexander
9,20375,2019-06-11,Dodgers,,Scott Alexander,placed on 10-day IL with left forearm inflamma...,Scott Alexander,False,,2656,...,42,81,18,3,17,31.0,28,84.0,0.248466,Scott Alexander
