In [1]:
import pandas as pd
import numpy as np
import glob
import json

In [2]:
mlb = pd.read_csv('for_feature_engineering.csv.gz', compression = 'gzip', index_col = [0])

In [3]:
#function to compute haversine distance for two pairs of coordinates. Takes latitude and longitude of two locations
#in degree format as an input and returns a distance in Kilometers 
def haversine_distance(latitude_1, longitude_1, latitude_2, longitude_2):
    R = 6378.137
    h = np.arcsin( np.sqrt(np.sin( (np.radians(latitude_2) - np.radians(latitude_1))/2)**2 \
                           + np.cos(np.radians(latitude_1))*np.cos(np.radians(latitude_2))*\
                          np.sin( (np.radians(longitude_2) - np.radians(longitude_1))/2)**2))
    return(2 * R * h)

In [4]:
mlb_elo = pd.read_csv('data/mlb_elo.csv')

In [5]:
mlb_elo[mlb_elo.team1 == 'PIT']['season'].unique()

array([2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009,
       2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999, 1998,
       1997, 1996, 1995, 1994, 1993, 1992, 1991, 1990, 1989, 1988, 1987,
       1986, 1985, 1984, 1983, 1982, 1981, 1980, 1979, 1978, 1977, 1976,
       1975, 1974, 1973, 1972, 1971, 1970, 1969, 1968, 1967, 1966, 1965,
       1964, 1963, 1962, 1961, 1960, 1959, 1958, 1957, 1956, 1955, 1954,
       1953, 1952, 1951, 1950, 1949, 1948, 1947, 1946, 1945, 1944, 1943,
       1942, 1941, 1940, 1939, 1938, 1937, 1936, 1935, 1934, 1933, 1932,
       1931, 1930, 1929, 1928, 1927, 1926, 1925, 1924, 1923, 1922, 1921,
       1920, 1919, 1918, 1917, 1916, 1915, 1914, 1913, 1912, 1911, 1910,
       1909, 1908, 1907, 1906, 1905, 1904, 1903, 1902, 1901, 1900, 1899,
       1898, 1897, 1896, 1895, 1894, 1893, 1892, 1891, 1890, 1889, 1888,
       1887, 1886, 1885, 1884, 1883, 1882])

In [6]:
issue_list = []
for season in mlb.season.unique():
    for team in mlb.team2.unique():
        if len(mlb[(mlb.season == season) & (mlb.team1 == team)]) ==0 and \
        len(mlb[(mlb.season == season) & (mlb.team2 == team)]) != 0:
            print(season, team)
            issue_list.append((season, team))

1900 PIT
1901 PIT
1901 NYY
1902 PIT
1902 NYY
1903 PIT
1904 PIT
1905 PIT
1906 PIT
1907 PIT
1908 PIT
1961 ANA
2001 PIT
2002 PIT
2003 PIT
2004 PIT
2005 PIT
2006 PIT
2007 PIT
2008 PIT
2009 PIT
2010 PIT
2011 PIT
2012 PIT
2013 PIT
2014 PIT
2015 PIT
2016 PIT
2017 PIT
2018 PIT
2019 PIT


In [6]:
stadium_data = pd.read_csv('data/complete_stadiums.csv', index_col = [0])

In [7]:
stadium_data.head()

Unnamed: 0,team_code,team,state_code,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor
0,TBD,Tampa Bay Rays,FL,2019,Tropicana Field,27.768333,82.653333,,,,14552,96,97
1,TBD,Tampa Bay Rays,FL,2018,Tropicana Field,27.768333,82.653333,,,,14259,96,97
2,TBD,Tampa Bay Rays,FL,2017,Tropicana Field,27.768333,82.653333,,,,15477,96,96
3,TBD,Tampa Bay Rays,FL,2016,Tropicana Field,27.768333,82.653333,,,,15879,95,95
4,TBD,Tampa Bay Rays,FL,2015,Tropicana Field,27.768333,82.653333,,,,15322,97,96


In [8]:
stadium_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2532 entries, 0 to 2531
Data columns (total 13 columns):
team_code               2532 non-null object
team                    2532 non-null object
state_code              2532 non-null object
year                    2532 non-null int64
primary_stadium         2532 non-null object
primary_latitude        2532 non-null float64
primary_longitude       2532 non-null float64
secondary_stadium       67 non-null object
secondary_latitude      67 non-null object
secondary_longitude     67 non-null object
attendance/game         2532 non-null object
pitching_park_factor    2532 non-null int64
batting_park_factor     2532 non-null int64
dtypes: float64(2), int64(3), object(8)
memory usage: 276.9+ KB


In [9]:
pit_stadiums = stadium_data[stadium_data.team_code == 'PIT']
nyy_stadiums = stadium_data[stadium_data.team_code == 'NYY']
ana_stadiums = stadium_data[stadium_data.team_code == 'ANA']

pit_probs_1 = pit_stadiums[pit_stadiums.year.between(1900, 1908)]
pit_probs_2 = pit_stadiums[pit_stadiums.year.between(2001, 2019)]
nyy_probs = nyy_stadiums[(nyy_stadiums.year == 1901) | (nyy_stadiums.year == 1902)]
ana_prob = ana_stadiums[ana_stadiums.year == 1961]

In [10]:
pit_probs_final = pd.concat([pit_probs_1, pit_probs_2], axis = 0).sort_values(by = 'year', ascending = True)

In [11]:
pit_probs_final

Unnamed: 0,team_code,team,state_code,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor
882,PIT,Pittsburgh Pirates,PA,1900,Exposition Park III,40.446944,80.010833,,,,3771,98,100
881,PIT,Pittsburgh Pirates,PA,1901,Exposition Park III,40.446944,80.010833,,,,3652,99,104
880,PIT,Pittsburgh Pirates,PA,1902,Exposition Park III,40.446944,80.010833,,,,3434,98,102
879,PIT,Pittsburgh Pirates,PA,1903,Exposition Park III,40.446944,80.010833,,,,4669,100,104
878,PIT,Pittsburgh Pirates,PA,1904,Exposition Park III,40.446944,80.010833,,,,4367,100,102
877,PIT,Pittsburgh Pirates,PA,1905,Exposition Park III,40.446944,80.010833,,,,4732,100,102
876,PIT,Pittsburgh Pirates,PA,1906,Exposition Park III,40.446944,80.010833,,,,5128,101,104
875,PIT,Pittsburgh Pirates,PA,1907,Exposition Park III,40.446944,80.010833,,,,4149,98,101
874,PIT,Pittsburgh Pirates,PA,1908,Exposition Park III,40.446944,80.010833,,,,4967,98,100
781,PIT,Pittsburgh Pirates,PA,2001,PNC Park,40.446944,80.005833,,,,30430,104,103


In [12]:
ana_prob

Unnamed: 0,team_code,team,state_code,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor
2182,ANA,Los Angeles Angels,CA,1961,Wrigley Field,34.01,118.27,,,,7360,111,110


In [13]:
pit_lat = pit_probs_final.primary_latitude.value_counts().idxmax()
pit_lon = pit_probs_final.primary_longitude.value_counts().idxmax()

ana_lat = ana_prob.primary_latitude.value_counts().idxmax()
ana_lon = ana_prob.primary_longitude.value_counts().idxmax()

## NOTE: The 1901 & 1902 Yankees played at Oriole Park (American League Park) and were known as the AL Orioles!! This is big bad! 

In [14]:
unique_coordinates = pd.read_csv('data/unique_coordinates.csv', index_col = [0])

In [15]:
pd.set_option('max.rows', 100)
unique_coordinates

Unnamed: 0,state,latitude,longitude
0,FL,27.768333,82.653333
1,CA,37.751667,122.200556
2,MO,39.086,94.555
3,PA,39.996111,75.165
4,PA,39.981111,75.182778
5,CA,37.778611,122.389167
6,CA,37.713611,122.386111
7,CA,37.766667,122.409167
8,NY,40.830833,73.9375
9,NY,40.798056,73.950278


# 30, 31, 32, 33

In [16]:
loc_30 = pd.read_csv('data/noaa_station_csvs/location_30/all_city_weather.csv', index_col = [0])
loc_31 = pd.read_csv('data/noaa_station_csvs/location_31/all_city_weather.csv', index_col = [0])
loc_32 = pd.read_csv('data/noaa_station_csvs/location_32/all_city_weather.csv', index_col = [0])
loc_33 = pd.read_csv('data/noaa_station_csvs/location_33/all_city_weather.csv', index_col = [0])

In [17]:
pit_locs = [loc_30, loc_31, loc_32, loc_33]
for loc in pit_locs:
    loc['date'] = pd.to_datetime(loc['date'], format = '%Y-%m-%d')
    loc['year'] = pd.DatetimeIndex(loc['date']).year

In [18]:
loc_30.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46977 entries, 0 to 46976
Data columns (total 17 columns):
station_id    46977 non-null object
date          46977 non-null datetime64[ns]
PRCP          46930 non-null float64
SNOW          40586 non-null float64
SNWD          34794 non-null float64
TMAX          37115 non-null float64
TMIN          37115 non-null float64
PSUN          6947 non-null float64
AWND          13088 non-null float64
WSF2          8521 non-null float64
WSF5          8491 non-null float64
ACSH          11503 non-null float64
WSFG          16167 non-null float64
WSF1          11502 non-null float64
latitude      46977 non-null float64
longitude     46977 non-null float64
year          46977 non-null int64
dtypes: datetime64[ns](1), float64(14), int64(1), object(1)
memory usage: 6.5+ MB


In [19]:
loc_30_early = loc_30[loc_30.year.between(1900, 1908)]
loc_31_early = loc_31[loc_31.year.between(1900, 1908)]
loc_32_early = loc_32[loc_32.year.between(1900, 1908)] #early leader in clubhouse
loc_33_early = loc_33[loc_33.year.between(1900, 1908)]

In [20]:
pit_games_missing = mlb_elo[(mlb_elo.season.between(1900, 1908)) & (mlb_elo.team1 == 'PIT')]

In [21]:
pit_games_missing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 671 entries, 190637 to 200686
Data columns (total 26 columns):
date            671 non-null object
season          671 non-null int64
neutral         671 non-null int64
playoff         4 non-null object
team1           671 non-null object
team2           671 non-null object
elo1_pre        671 non-null float64
elo2_pre        671 non-null float64
elo_prob1       671 non-null float64
elo_prob2       671 non-null float64
elo1_post       671 non-null float64
elo2_post       671 non-null float64
rating1_pre     671 non-null float64
rating2_pre     671 non-null float64
pitcher1        671 non-null object
pitcher2        671 non-null object
pitcher1_rgs    4 non-null float64
pitcher2_rgs    4 non-null float64
pitcher1_adj    0 non-null float64
pitcher2_adj    0 non-null float64
rating_prob1    671 non-null float64
rating_prob2    671 non-null float64
rating1_post    671 non-null float64
rating2_post    671 non-null float64
score1          671

In [22]:
pit_games_missing['date'] = pd.to_datetime(pit_games_missing['date'], format = '%Y-%m-%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [23]:
pit_early = pit_games_missing.merge(loc_32_early, how = 'left', left_on = ['date'], right_on = ['date'])

In [24]:
pit_early.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 671 entries, 0 to 670
Data columns (total 42 columns):
date            671 non-null datetime64[ns]
season          671 non-null int64
neutral         671 non-null int64
playoff         4 non-null object
team1           671 non-null object
team2           671 non-null object
elo1_pre        671 non-null float64
elo2_pre        671 non-null float64
elo_prob1       671 non-null float64
elo_prob2       671 non-null float64
elo1_post       671 non-null float64
elo2_post       671 non-null float64
rating1_pre     671 non-null float64
rating2_pre     671 non-null float64
pitcher1        671 non-null object
pitcher2        671 non-null object
pitcher1_rgs    4 non-null float64
pitcher2_rgs    4 non-null float64
pitcher1_adj    0 non-null float64
pitcher2_adj    0 non-null float64
rating_prob1    671 non-null float64
rating_prob2    671 non-null float64
rating1_post    671 non-null float64
rating2_post    671 non-null float64
score1          671

In [25]:
mlb_elo['date'] = pd.to_datetime(mlb_elo['date'], format = '%Y-%m-%d')

In [26]:
pit_late_missing = mlb_elo[(mlb_elo.season.between(2001, 2019)) & (mlb_elo.team1 == 'PIT')]

In [27]:
loc_30_late = loc_30[loc_30.year.between(2001, 2019)]
loc_31_late = loc_31[loc_31.year.between(2001, 2019)]
loc_32_late = loc_32[loc_32.year.between(2001, 2019)] 
loc_33_late = loc_33[loc_33.year.between(2001, 2019)]

In [28]:
pit_late = pit_late_missing.merge(loc_30_late, how = 'left', left_on = ['date'], right_on = ['date'])

In [29]:
all_pit_missing = pd.concat([pit_early, pit_late], axis = 0)

In [30]:
all_pit_missing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2212 entries, 0 to 1540
Data columns (total 42 columns):
date            2212 non-null datetime64[ns]
season          2212 non-null int64
neutral         2212 non-null int64
playoff         9 non-null object
team1           2212 non-null object
team2           2212 non-null object
elo1_pre        2212 non-null float64
elo2_pre        2212 non-null float64
elo_prob1       2212 non-null float64
elo_prob2       2212 non-null float64
elo1_post       2212 non-null float64
elo2_post       2212 non-null float64
rating1_pre     2212 non-null float64
rating2_pre     2212 non-null float64
pitcher1        2212 non-null object
pitcher2        2212 non-null object
pitcher1_rgs    1545 non-null float64
pitcher2_rgs    1545 non-null float64
pitcher1_adj    1541 non-null float64
pitcher2_adj    1541 non-null float64
rating_prob1    2212 non-null float64
rating_prob2    2212 non-null float64
rating1_post    2212 non-null float64
rating2_post    2212 non

In [31]:
stadium_data[(stadium_data.team_code == 'ANA') & (stadium_data.year == 1961)]

Unnamed: 0,team_code,team,state_code,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor
2182,ANA,Los Angeles Angels,CA,1961,Wrigley Field,34.01,118.27,,,,7360,111,110


In [32]:
loc_25 = pd.read_csv('data/noaa_station_csvs/location_25/all_city_weather.csv', index_col = [0])
loc_26 = pd.read_csv('data/noaa_station_csvs/location_26/all_city_weather.csv', index_col = [0])

In [33]:
ana_locs = [loc_25, loc_26]
for loc in ana_locs:
    loc['date'] = pd.to_datetime(loc['date'], format = '%Y-%m-%d')
    loc['year'] = pd.DatetimeIndex(loc['date']).year

In [34]:
ana_1961 = loc_25[loc_25.year == 1961]

In [35]:
mlb_elo_ana_1961 = mlb_elo[(mlb_elo.season == 1961) & (mlb_elo.team1 == 'ANA')]

In [36]:
mlb_ana = mlb_elo_ana_1961.merge(ana_1961, how = 'left', left_on = ['date'], right_on = ['date'])

In [37]:
mlb_ana.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82 entries, 0 to 81
Data columns (total 44 columns):
date            82 non-null datetime64[ns]
season          82 non-null int64
neutral         82 non-null int64
playoff         0 non-null object
team1           82 non-null object
team2           82 non-null object
elo1_pre        82 non-null float64
elo2_pre        82 non-null float64
elo_prob1       82 non-null float64
elo_prob2       82 non-null float64
elo1_post       82 non-null float64
elo2_post       82 non-null float64
rating1_pre     82 non-null float64
rating2_pre     82 non-null float64
pitcher1        82 non-null object
pitcher2        82 non-null object
pitcher1_rgs    82 non-null float64
pitcher2_rgs    82 non-null float64
pitcher1_adj    82 non-null float64
pitcher2_adj    82 non-null float64
rating_prob1    82 non-null float64
rating_prob2    82 non-null float64
rating1_post    82 non-null float64
rating2_post    82 non-null float64
score1          82 non-null int64
sc

In [38]:
mlb_nyy_probs = mlb_elo[(mlb_elo.season.between(1901, 1902)) & (mlb_elo.team1 == 'NYY')]

In [39]:
stadium_data[stadium_data.primary_stadium.str.contains('American League Park')]

Unnamed: 0,team_code,team,state_code,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor
1154,MIN,Washington Senators,D.C.,1910,American League Park II,38.901667,76.986667,,,,3306,99,95
1155,MIN,Washington Senators,D.C.,1909,American League Park II,38.901667,76.986667,,,,2665,99,96
1156,MIN,Washington Senators,D.C.,1908,American League Park II,38.901667,76.986667,,,,3388,95,91
1157,MIN,Washington Senators,D.C.,1907,American League Park II,38.901667,76.986667,,,,2959,95,92
1158,MIN,Washington Senators,D.C.,1906,American League Park II,38.901667,76.986667,,,,1732,98,95
1159,MIN,Washington Senators,D.C.,1905,American League Park II,38.901667,76.986667,,,,3273,99,96
1160,MIN,Washington Senators,D.C.,1904,American League Park II,38.901667,76.986667,,,,1689,103,98
1161,MIN,Washington Senators,D.C.,1903,American League Park I,38.901667,76.986667,,,,1815,105,101
1162,MIN,Washington Senators,D.C.,1902,American League Park I,38.901667,76.986667,,,,2767,103,100
1163,MIN,Washington Senators,D.C.,1901,American League Park I,38.901667,76.986667,,,,2377,100,99


In [40]:
loc_42 = pd.read_csv('data/noaa_station_csvs/location_42/all_city_weather.csv', index_col = [0])
loc_43 = pd.read_csv('data/noaa_station_csvs/location_43/all_city_weather.csv', index_col = [0])

In [41]:
nyy_locs = [loc_42, loc_43]
for loc in nyy_locs:
    loc['date'] = pd.to_datetime(loc['date'], format = '%Y-%m-%d')
    loc['year'] = pd.DatetimeIndex(loc['date']).year

In [42]:
loc_42_inq = loc_42[loc_42.year.between(1901, 1902)]
loc_43_inq = loc_43[loc_43.year.between(1901, 1902)]

In [43]:
mlb_nyy_full = mlb_nyy_probs.merge(loc_42_inq, how = 'left', left_on = ['date'], right_on = ['date'])

In [44]:
mlb_nyy_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130 entries, 0 to 129
Data columns (total 37 columns):
date            130 non-null datetime64[ns]
season          130 non-null int64
neutral         130 non-null int64
playoff         0 non-null object
team1           130 non-null object
team2           130 non-null object
elo1_pre        130 non-null float64
elo2_pre        130 non-null float64
elo_prob1       130 non-null float64
elo_prob2       130 non-null float64
elo1_post       130 non-null float64
elo2_post       130 non-null float64
rating1_pre     130 non-null float64
rating2_pre     130 non-null float64
pitcher1        128 non-null object
pitcher2        128 non-null object
pitcher1_rgs    0 non-null float64
pitcher2_rgs    0 non-null float64
pitcher1_adj    0 non-null float64
pitcher2_adj    0 non-null float64
rating_prob1    130 non-null float64
rating_prob2    130 non-null float64
rating1_post    130 non-null float64
rating2_post    130 non-null float64
score1          130

In [45]:
all_missing_vals = pd.concat([mlb_nyy_full, mlb_ana, all_pit_missing], axis = 0, sort = False)

In [46]:
all_missing_vals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2424 entries, 0 to 1540
Data columns (total 44 columns):
date            2424 non-null datetime64[ns]
season          2424 non-null int64
neutral         2424 non-null int64
playoff         9 non-null object
team1           2424 non-null object
team2           2424 non-null object
elo1_pre        2424 non-null float64
elo2_pre        2424 non-null float64
elo_prob1       2424 non-null float64
elo_prob2       2424 non-null float64
elo1_post       2424 non-null float64
elo2_post       2424 non-null float64
rating1_pre     2424 non-null float64
rating2_pre     2424 non-null float64
pitcher1        2422 non-null object
pitcher2        2422 non-null object
pitcher1_rgs    1627 non-null float64
pitcher2_rgs    1627 non-null float64
pitcher1_adj    1623 non-null float64
pitcher2_adj    1623 non-null float64
rating_prob1    2424 non-null float64
rating_prob2    2424 non-null float64
rating1_post    2424 non-null float64
rating2_post    2424 non

In [47]:
all_missing_vals = all_missing_vals.reset_index()

In [48]:
stadium_data[stadium_data.primary_stadium == 'American League Park I']

Unnamed: 0,team_code,team,state_code,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor
1161,MIN,Washington Senators,D.C.,1903,American League Park I,38.901667,76.986667,,,,1815,105,101
1162,MIN,Washington Senators,D.C.,1902,American League Park I,38.901667,76.986667,,,,2767,103,100
1163,MIN,Washington Senators,D.C.,1901,American League Park I,38.901667,76.986667,,,,2377,100,99


In [49]:
year_list = [1901, 1902]
nyy_list = []
for year in year_list:
    val_dict = {'team_code' : 'NYY',
               'team' : 'Baltimore Orioles',
               'state_code' : 'D.C.',
               'year' : year,
               'primary_stadium' : 'American League Park I',
               'primary_latitude' : 38.901667,
               'primary_longitude' : 76.986667,
               'secondary_stadium' : np.nan,
               'secondary_latitude': np.nan,
               'secondary_longitude' : np.nan,
               'attendance/game' : '1,815',
               'pitching_park_factor' : 100,
               'batting_park_factor' : 99}
    nyy_list.append(val_dict)


In [50]:
nyy_update = pd.DataFrame(nyy_list)

In [51]:
nyy_update

Unnamed: 0,team_code,team,state_code,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor
0,NYY,Baltimore Orioles,D.C.,1901,American League Park I,38.901667,76.986667,,,,1815,100,99
1,NYY,Baltimore Orioles,D.C.,1902,American League Park I,38.901667,76.986667,,,,1815,100,99


In [52]:
stadium_data = pd.concat([stadium_data, nyy_update], axis = 0)

In [53]:
stadium_data = stadium_data.reset_index()

In [54]:
all_missing_vals = all_missing_vals.merge(stadium_data, how = 'left', left_on = ['year', 'team1'], 
                                         right_on = ['year', 'team_code'])

In [55]:
all_missing_vals = all_missing_vals.drop(columns = ['index_x', 'index_y'])

In [56]:
all_missing_vals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2424 entries, 0 to 2423
Data columns (total 56 columns):
date                    2424 non-null datetime64[ns]
season                  2424 non-null int64
neutral                 2424 non-null int64
playoff                 9 non-null object
team1                   2424 non-null object
team2                   2424 non-null object
elo1_pre                2424 non-null float64
elo2_pre                2424 non-null float64
elo_prob1               2424 non-null float64
elo_prob2               2424 non-null float64
elo1_post               2424 non-null float64
elo2_post               2424 non-null float64
rating1_pre             2424 non-null float64
rating2_pre             2424 non-null float64
pitcher1                2422 non-null object
pitcher2                2422 non-null object
pitcher1_rgs            1627 non-null float64
pitcher2_rgs            1627 non-null float64
pitcher1_adj            1623 non-null float64
pitcher2_adj            

In [57]:
mlb_update = pd.read_csv('data/mlb_final.csv', index_col = [0], low_memory = False)

In [58]:
mlb_update.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 198286 entries, 0 to 198285
Data columns (total 57 columns):
date                    198286 non-null object
season                  198286 non-null int64
neutral                 198286 non-null int64
playoff                 1608 non-null object
team1                   198286 non-null object
team2                   198286 non-null object
elo1_pre                198286 non-null float64
elo2_pre                198286 non-null float64
elo_prob1               198286 non-null float64
elo_prob2               198286 non-null float64
elo1_post               198286 non-null float64
elo2_post               198286 non-null float64
rating1_pre             198286 non-null float64
rating2_pre             198286 non-null float64
pitcher1                198278 non-null object
pitcher2                198278 non-null object
pitcher1_rgs            184642 non-null float64
pitcher2_rgs            184642 non-null float64
pitcher1_adj            183965 non-nu

In [59]:
new_cols = list(all_missing_vals.columns)
old_cols = list(mlb_update.columns)

In [60]:
len(new_cols), len(old_cols)

(56, 57)

In [61]:
for col in old_cols:
    if col not in new_cols:
        print(col)

ACSC


In [62]:
all_missing_vals = all_missing_vals.assign(ACSC = None)

In [63]:
all_missing_vals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2424 entries, 0 to 2423
Data columns (total 57 columns):
date                    2424 non-null datetime64[ns]
season                  2424 non-null int64
neutral                 2424 non-null int64
playoff                 9 non-null object
team1                   2424 non-null object
team2                   2424 non-null object
elo1_pre                2424 non-null float64
elo2_pre                2424 non-null float64
elo_prob1               2424 non-null float64
elo_prob2               2424 non-null float64
elo1_post               2424 non-null float64
elo2_post               2424 non-null float64
rating1_pre             2424 non-null float64
rating2_pre             2424 non-null float64
pitcher1                2422 non-null object
pitcher2                2422 non-null object
pitcher1_rgs            1627 non-null float64
pitcher2_rgs            1627 non-null float64
pitcher1_adj            1623 non-null float64
pitcher2_adj            

In [64]:
rs_gl = pd.read_csv('data/retrosheet_gamelog.csv', index_col = [0])

In [65]:
rs_gl['date'] = pd.to_datetime(rs_gl['date'], format = '%Y-%m-%d')

In [66]:
rs_gl['year'] = pd.DatetimeIndex(rs_gl['date']).year

In [67]:
nyy_rg_ = rs_gl[rs_gl.year.between(1901, 1902)]

In [68]:
nyy_rg_.home_team.unique()

array(['PHI', 'BSN', 'BRO', 'SLN', 'CIN', 'CHA', 'DET', 'PHA', 'BLA',
       'NY1', 'CHN', 'PIT', 'WS1', 'CLE', 'MLA', 'BOS', 'SLA'],
      dtype=object)

In [69]:
rs_gl.head()

Unnamed: 0,date,away_team,home_team,game_time,park_id,attendance,is_double_header,is_triple_header,year
0,1900-04-19,PHI,BSN,,BOS05,10000.0,0,0,1900
1,1900-04-19,CHN,CIN,,CIN05,12000.0,0,0,1900
2,1900-04-19,BRO,NY1,,NYC10,15000.0,0,0,1900
3,1900-04-19,PIT,SLN,,STL05,12000.0,0,0,1900
4,1900-04-20,BRO,NY1,,NYC10,,0,0,1900


## BLA is the NYY code in question

In [70]:
rs_gl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 197742 entries, 0 to 197741
Data columns (total 9 columns):
date                197742 non-null datetime64[ns]
away_team           197742 non-null object
home_team           197742 non-null object
game_time           192583 non-null object
park_id             197742 non-null object
attendance          166920 non-null float64
is_double_header    197742 non-null int64
is_triple_header    197742 non-null int64
year                197742 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(3), object(4)
memory usage: 15.1+ MB


In [71]:
all_missing_vals.team.unique()

array(['Baltimore Orioles', 'Los Angeles Angels', 'Pittsburgh Pirates'],
      dtype=object)

In [72]:
rg_code_dict = {'Baltimore Orioles' : 'BLA',
               'Los Angeles Angels' : 'LAA',
               'Pittsburgh Pirates' : 'PIT'}

In [73]:
all_missing_vals['rg_code'] = all_missing_vals['team'].map(rg_code_dict)

In [74]:
all_missing_vals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2424 entries, 0 to 2423
Data columns (total 58 columns):
date                    2424 non-null datetime64[ns]
season                  2424 non-null int64
neutral                 2424 non-null int64
playoff                 9 non-null object
team1                   2424 non-null object
team2                   2424 non-null object
elo1_pre                2424 non-null float64
elo2_pre                2424 non-null float64
elo_prob1               2424 non-null float64
elo_prob2               2424 non-null float64
elo1_post               2424 non-null float64
elo2_post               2424 non-null float64
rating1_pre             2424 non-null float64
rating2_pre             2424 non-null float64
pitcher1                2422 non-null object
pitcher2                2422 non-null object
pitcher1_rgs            1627 non-null float64
pitcher2_rgs            1627 non-null float64
pitcher1_adj            1623 non-null float64
pitcher2_adj            

In [75]:
all_missing_vals = all_missing_vals.assign(is_double_header =0)
all_missing_vals = all_missing_vals.assign(is_triple_header = 0)

#group by date and count home teams
double_headers_elo = all_missing_vals.groupby('date').team1.value_counts()
double_headers_elo = double_headers_elo[double_headers_elo > 1]
triple_headers_elo = double_headers_elo[double_headers_elo > 2]

#generate list of dates and home teams
all_double_headers_home_elo = []
for j in range(len(double_headers_elo)):
    home_team = [double_headers_elo.index[j][0], double_headers_elo.index[j][1]]
    all_double_headers_home_elo.append(home_team)

In [76]:
len(all_double_headers_home_elo)

115

In [77]:
#function to change value of new features for second (or third) games on one day. Takes the gamelog dataframe and 
#the list of dates and home team names as an argument, and changes the value of the respective dummy variables
#at each index of a double header or triple header
def assign_multigame_values(df, team_list):
    for entry in team_list:
        games = df[(df.date == entry[0]) & (df.team1 == entry[1])]
        indices = []
        for j in range(len(games)):
            indices.append(games.index[j])
        if len(indices) == 2:
            df.at[indices[1], 'is_double_header'] = 1
        elif len(indices) == 3:
            df.at[indices[1], 'is_double_header'] = 1
            df.at[indices[2], 'is_triple_header'] = 1
        else:
            print(games)

In [78]:
#function to alter values in dataframe
assign_multigame_values(all_missing_vals, all_double_headers_home_elo)

In [79]:
all_missing_vals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2424 entries, 0 to 2423
Data columns (total 60 columns):
date                    2424 non-null datetime64[ns]
season                  2424 non-null int64
neutral                 2424 non-null int64
playoff                 9 non-null object
team1                   2424 non-null object
team2                   2424 non-null object
elo1_pre                2424 non-null float64
elo2_pre                2424 non-null float64
elo_prob1               2424 non-null float64
elo_prob2               2424 non-null float64
elo1_post               2424 non-null float64
elo2_post               2424 non-null float64
rating1_pre             2424 non-null float64
rating2_pre             2424 non-null float64
pitcher1                2422 non-null object
pitcher2                2422 non-null object
pitcher1_rgs            1627 non-null float64
pitcher2_rgs            1627 non-null float64
pitcher1_adj            1623 non-null float64
pitcher2_adj            

In [80]:
all_missing_vals['is_double_header'].value_counts()

0    2309
1     115
Name: is_double_header, dtype: int64

In [81]:
missing_updated = all_missing_vals.merge(rs_gl, how = 'left', left_on =['date', 'rg_code', 'is_double_header'],
                                        right_on = ['date', 'home_team', 'is_double_header'])

In [82]:
missing_updated.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2424 entries, 0 to 2423
Data columns (total 67 columns):
date                    2424 non-null datetime64[ns]
season                  2424 non-null int64
neutral                 2424 non-null int64
playoff                 9 non-null object
team1                   2424 non-null object
team2                   2424 non-null object
elo1_pre                2424 non-null float64
elo2_pre                2424 non-null float64
elo_prob1               2424 non-null float64
elo_prob2               2424 non-null float64
elo1_post               2424 non-null float64
elo2_post               2424 non-null float64
rating1_pre             2424 non-null float64
rating2_pre             2424 non-null float64
pitcher1                2422 non-null object
pitcher2                2422 non-null object
pitcher1_rgs            1627 non-null float64
pitcher2_rgs            1627 non-null float64
pitcher1_adj            1623 non-null float64
pitcher2_adj            

In [83]:
#read in aggregate dataset from all data collection sources
mlb = pd.read_csv('data/mlb_final_pre_eda.csv', low_memory = False, index_col = [0])

In [84]:
mlb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 197685 entries, 0 to 198285
Data columns (total 67 columns):
date                    197685 non-null object
season                  197685 non-null int64
neutral                 197685 non-null int64
playoff                 1608 non-null object
team1                   197685 non-null object
team2                   197685 non-null object
elo1_pre                197685 non-null float64
elo2_pre                197685 non-null float64
elo_prob1               197685 non-null float64
elo_prob2               197685 non-null float64
elo1_post               197685 non-null float64
elo2_post               197685 non-null float64
rating1_pre             197685 non-null float64
rating2_pre             197685 non-null float64
pitcher1                197677 non-null object
pitcher2                197677 non-null object
pitcher1_rgs            184221 non-null float64
pitcher2_rgs            184221 non-null float64
pitcher1_adj            183544 non-nu

In [85]:
new_cols = list(missing_updated.columns)
needed_cols = list(mlb.columns)

In [86]:
len(new_cols)

67

In [87]:
len(needed_cols)

67

In [88]:
for col in needed_cols:
    if col not in new_cols:
        print(col)

year
is_triple_header
team_name
park_id_x
park_id_y


In [89]:
for col in new_cols:
    if col not in needed_cols:
        print(col)

year_x
is_triple_header_x
park_id
is_triple_header_y
year_y


In [90]:
missing_updated = missing_updated.drop(columns = ['is_triple_header_y', 'year_y'])

In [96]:
missing_updated = missing_updated.rename({'year_x' : 'year', 'is_triple_header_x' : 'is_triple_header'}, 
                                        axis = 'columns')

In [97]:
missing_updated = missing_updated.assign(park_id_y = None)
missing_updated = missing_updated.rename({'park_id' : 'park_id_x'}, axis = 'columns')

In [93]:
missing_updated = missing_updated.assign(team_name = None)

In [98]:
new_cols = list(missing_updated.columns)
needed_cols = list(mlb.columns)

for col in needed_cols:
    if col not in new_cols:
        print(col)

In [100]:
for col in new_cols:
    if col not in needed_cols:
        print(col)

In [99]:
missing_updated.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2424 entries, 0 to 2423
Data columns (total 67 columns):
date                    2424 non-null datetime64[ns]
season                  2424 non-null int64
neutral                 2424 non-null int64
playoff                 9 non-null object
team1                   2424 non-null object
team2                   2424 non-null object
elo1_pre                2424 non-null float64
elo2_pre                2424 non-null float64
elo_prob1               2424 non-null float64
elo_prob2               2424 non-null float64
elo1_post               2424 non-null float64
elo2_post               2424 non-null float64
rating1_pre             2424 non-null float64
rating2_pre             2424 non-null float64
pitcher1                2422 non-null object
pitcher2                2422 non-null object
pitcher1_rgs            1627 non-null float64
pitcher2_rgs            1627 non-null float64
pitcher1_adj            1623 non-null float64
pitcher2_adj            

In [102]:
mlb_full_final = pd.concat([mlb, missing_updated], axis = 0, sort = False)

In [103]:
mlb_full_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200109 entries, 0 to 2423
Data columns (total 67 columns):
date                    200109 non-null object
season                  200109 non-null int64
neutral                 200109 non-null int64
playoff                 1617 non-null object
team1                   200109 non-null object
team2                   200109 non-null object
elo1_pre                200109 non-null float64
elo2_pre                200109 non-null float64
elo_prob1               200109 non-null float64
elo_prob2               200109 non-null float64
elo1_post               200109 non-null float64
elo2_post               200109 non-null float64
rating1_pre             200109 non-null float64
rating2_pre             200109 non-null float64
pitcher1                200099 non-null object
pitcher2                200099 non-null object
pitcher1_rgs            185848 non-null float64
pitcher2_rgs            185848 non-null float64
pitcher1_adj            185167 non-null

In [104]:
mlb_full_final.to_csv('data/mlb_final_pre_eda.csv')