# Advanced Pandas - Joining Multiple Data Tables
**Abid Ali**

Skype: Abd.Soft

Email: [abdsoftfsd@gmail.com](mailto:abdsoftfsd@gmail.com)


"The information used herein was obtained free of charge from and is
copyrighted by the Hockey Databank project.  For more information about the
Hockey Databank project please visit
[http://sports.groups.yahoo.com/group/hockey-databank](http://sports.groups.yahoo.com/group/hockey-databank)"

In [1]:
import pandas as pd
import os


In [2]:
# Load the four pickled tables stored under data/modified, in a fixed order.
# Pickle files should only be read from a trusted source.
master, scoring, teams, team_splits = (
    pd.read_pickle(os.path.join('data', 'modified', pickle_name))
    for pickle_name in (
        'master.pickle',
        'scoring.pickle',
        'teams.pickle',
        'team_splits.pickle',
    )
)


In [3]:
# Preview the player table; its rows are indexed by playerID.
master.head(n=2)


Unnamed: 0_level_0,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
aaltoan01,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta
abdelju01,Justin,Abdelkader,L,1987.0,2.0,25.0,USA,MI,Muskegon


In [4]:
# Preview the scoring table; here playerID is an ordinary column.
scoring.head(n=2)


Unnamed: 0,playerID,year,tmID,GP,G,A,Pts,SOG
0,aaltoan01,1997,ANA,3.0,0.0,0.0,0.0,1.0
1,aaltoan01,1998,ANA,73.0,3.0,5.0,8.0,61.0


In [5]:
# Preview the team table (year, tmID, name).
teams.head(n=2)


Unnamed: 0,year,tmID,name
727,1980,BOS,Boston Bruins
728,1980,BUF,Buffalo Sabres


In [6]:
# Preview the per-month split columns (SepW ... AprOL) alongside year and tmID.
team_splits.head(n=2)


Unnamed: 0,year,tmID,SepW,SepL,SepT,SepOL,OctW,OctL,OctT,OctOL,...,FebT,FebOL,MarW,MarL,MarT,MarOL,AprW,AprL,AprT,AprOL
727,1980,BOS,,,,,2.0,7.0,1.0,,...,2.0,,8.0,4.0,3.0,,1.0,2.0,0.0,
728,1980,BUF,,,,,5.0,2.0,2.0,,...,3.0,,8.0,6.0,1.0,,1.0,1.0,1.0,


In [7]:
# Join the index of `master` against the playerID column of `scoring`.
master.merge(scoring, left_index=True, right_on='playerID').head()


Unnamed: 0,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity,playerID,year,tmID,GP,G,A,Pts,SOG
0,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1997,ANA,3.0,0.0,0.0,0.0,1.0
1,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1998,ANA,73.0,3.0,5.0,8.0,61.0
2,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1999,ANA,63.0,7.0,11.0,18.0,102.0
3,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,2000,ANA,12.0,1.0,1.0,2.0,18.0
4,Justin,Abdelkader,L,1987.0,2.0,25.0,USA,MI,Muskegon,abdelju01,2007,DET,2.0,0.0,0.0,0.0,6.0


In [8]:
# Where does the index of the merged dataframe come from?
# - Answer: from the right table (`scoring`), whose index is displayed here.
scoring.index


RangeIndex(start=0, stop=28616, step=1)

In [9]:
# Preview the shifted index; nothing is assigned, so `scoring` is unchanged here.
scoring.index + 3


RangeIndex(start=3, stop=28619, step=1)

In [10]:
# Give `scoring` an index that starts at 3 so its origin is easy to spot in
# the merged result. The index is built from the row count rather than from
# the current index, so re-running this cell does not shift it a second time.
scoring.index = pd.RangeIndex(start=3, stop=len(scoring) + 3)


In [11]:
# Confirm that the row labels of `scoring` now start at 3.
scoring.head(n=5)


Unnamed: 0,playerID,year,tmID,GP,G,A,Pts,SOG
3,aaltoan01,1997,ANA,3.0,0.0,0.0,0.0,1.0
4,aaltoan01,1998,ANA,73.0,3.0,5.0,8.0,61.0
5,aaltoan01,1999,ANA,63.0,7.0,11.0,18.0,102.0
6,aaltoan01,2000,ANA,12.0,1.0,1.0,2.0,18.0
7,abdelju01,2007,DET,2.0,0.0,0.0,0.0,6.0


In [13]:
# Repeat the merge to see which table's index survives in the result.
master.merge(scoring, left_index=True, right_on="playerID").head()

Unnamed: 0,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity,playerID,year,tmID,GP,G,A,Pts,SOG
3,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1997,ANA,3.0,0.0,0.0,0.0,1.0
4,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1998,ANA,73.0,3.0,5.0,8.0,61.0
5,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1999,ANA,63.0,7.0,11.0,18.0,102.0
6,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,2000,ANA,12.0,1.0,1.0,2.0,18.0
7,Justin,Abdelkader,L,1987.0,2.0,25.0,USA,MI,Muskegon,abdelju01,2007,DET,2.0,0.0,0.0,0.0,6.0


The index starts at 3, so it comes from the right table: we shifted the right table's index to start at 3.
The left table's index is simply dropped.
The result still has a playerID column, but it comes from the right table.

In [14]:
# Index-on-index merge: move playerID into the index of `scoring` first.
master.merge(
    scoring.set_index('playerID'),
    left_index=True,
    right_index=True,
).head()


Unnamed: 0_level_0,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity,year,tmID,GP,G,A,Pts,SOG
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
aaltoan01,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,1997,ANA,3.0,0.0,0.0,0.0,1.0
aaltoan01,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,1998,ANA,73.0,3.0,5.0,8.0,61.0
aaltoan01,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,1999,ANA,63.0,7.0,11.0,18.0,102.0
aaltoan01,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,2000,ANA,12.0,1.0,1.0,2.0,18.0
abdelju01,Justin,Abdelkader,L,1987.0,2.0,25.0,USA,MI,Muskegon,2007,DET,2.0,0.0,0.0,0.0,6.0


In [15]:
# Restore the default 0-based index on `scoring`, then repeat the merge.
scoring = scoring.reset_index(drop=True)
master.merge(scoring, left_index=True, right_on="playerID").head()


Unnamed: 0,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity,playerID,year,tmID,GP,G,A,Pts,SOG
0,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1997,ANA,3.0,0.0,0.0,0.0,1.0
1,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1998,ANA,73.0,3.0,5.0,8.0,61.0
2,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1999,ANA,63.0,7.0,11.0,18.0,102.0
3,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,2000,ANA,12.0,1.0,1.0,2.0,18.0
4,Justin,Abdelkader,L,1987.0,2.0,25.0,USA,MI,Muskegon,abdelju01,2007,DET,2.0,0.0,0.0,0.0,6.0


In [16]:
# Compare the shape of the default (inner) join with that of the right join.
print(*(
    pd.merge(master, scoring, left_index=True, right_on="playerID", how=join_type).shape
    for join_type in ("inner", "right")
))


(28616, 17) (28616, 17)


In [34]:
# Re-read the player table and report the shapes of both join inputs.
master = pd.read_pickle(os.path.join('data', 'modified', 'master.pickle'))
print(*(frame.shape for frame in (master, scoring)))

(4627, 9) (28616, 8)


In [35]:
# Drop five randomly chosen players so the inner and right joins differ.
# The sample is seeded so that Restart & Run All removes the same players
# every time and the printed shapes are reproducible from run to run.
master2 = master.drop(master.sample(5, random_state=0).index)

print(
    'Inner Join: ',
    pd.merge(master2, scoring, left_index=True, right_on='playerID').shape,
    '\nRight Join: ',
    pd.merge(master2, scoring, left_index=True, right_on='playerID', how="right").shape
)


Inner Join:  (28595, 17) 
Right Join:  (28616, 17)


In [36]:
# indicator=True adds a `_merge` column recording which side each row came from.
merged = master2.merge(
    scoring,
    left_index=True,
    right_on="playerID",
    how="right",
    indicator=True,
)
merged.head()


Unnamed: 0,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity,playerID,year,tmID,GP,G,A,Pts,SOG,_merge
0,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1997,ANA,3.0,0.0,0.0,0.0,1.0,both
1,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1998,ANA,73.0,3.0,5.0,8.0,61.0,both
2,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1999,ANA,63.0,7.0,11.0,18.0,102.0,both
3,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,2000,ANA,12.0,1.0,1.0,2.0,18.0,both
4,Justin,Abdelkader,L,1987.0,2.0,25.0,USA,MI,Muskegon,abdelju01,2007,DET,2.0,0.0,0.0,0.0,6.0,both


In [37]:
# Shape of the right join, now including the extra `_merge` indicator column.
merged.shape


(28616, 18)

In [38]:
# Count how many rows matched on both sides versus on one side only.
merged['_merge'].value_counts()


both          28595
right_only       21
left_only         0
Name: _merge, dtype: int64

In [39]:
# Scoring rows whose playerID has no counterpart in `master2`.
merged.loc[merged["_merge"] == "right_only"].head()

Unnamed: 0,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity,playerID,year,tmID,GP,G,A,Pts,SOG,_merge
5971,,,,,,,,,,donatcl01,1989,MNS,25.0,3.0,3.0,6.0,25.0,right_only
5972,,,,,,,,,,donatcl01,1991,BOS,10.0,0.0,1.0,1.0,7.0,right_only
12848,,,,,,,,,,kopitan01,2006,LAK,72.0,20.0,41.0,61.0,193.0,right_only
12849,,,,,,,,,,kopitan01,2007,LAK,82.0,32.0,45.0,77.0,201.0,right_only
12850,,,,,,,,,,kopitan01,2008,LAK,82.0,27.0,39.0,66.0,234.0,right_only


In [40]:
# Also remove 100 random scoring rows so that the outer join can contain
# unmatched rows from either side. The sample is seeded so the same rows are
# dropped on every run.
scoring2 = scoring.drop(scoring.sample(100, random_state=0).index)
merged = pd.merge(master2, scoring2, left_index=True, right_on="playerID", how="outer", indicator=True)


In [42]:
# Show a random handful of rows that matched on one side only.
merged[merged['_merge'].isin(['left_only', 'right_only'])].sample(n=10)


Unnamed: 0,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity,playerID,year,tmID,GP,G,A,Pts,SOG,_merge
5972.0,,,,,,,,,,donatcl01,1991.0,BOS,10.0,0.0,1.0,1.0,7.0,right_only
,Jeremy,Yablonski,R,1980.0,3.0,21.0,Canada,SK,Meadow Lake,yabloje01,,,,,,,,left_only
,Russ,Adam,C,1961.0,5.0,5.0,Canada,ON,Windsor,adamru01,,,,,,,,left_only
12848.0,,,,,,,,,,kopitan01,2006.0,LAK,72.0,20.0,41.0,61.0,193.0,right_only
12852.0,,,,,,,,,,kopitan01,2010.0,LAK,75.0,25.0,48.0,73.0,233.0,right_only
23135.0,,,,,,,,,,seiliri01,1981.0,BUF,57.0,22.0,25.0,47.0,112.0,right_only
23138.0,,,,,,,,,,seiliri01,1984.0,BUF,73.0,16.0,15.0,31.0,118.0,right_only
22046.0,,,,,,,,,,rominda01,1999.0,TBL,3.0,0.0,1.0,1.0,0.0,right_only
,Ryan,Barnes,L,1980.0,1.0,30.0,Canada,ON,Dunnville,barnery01,,,,,,,,left_only
23139.0,,,,,,,,,,seiliri01,1985.0,BUF,69.0,12.0,13.0,25.0,85.0,right_only


In [43]:
# Same selection, this time via the string accessor on the indicator labels.
merged.loc[merged['_merge'].str.endswith('only')].sample(n=10)


Unnamed: 0,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity,playerID,year,tmID,GP,G,A,Pts,SOG,_merge
14329.0,,,,,,,,,,lefebgu01,2005.0,PIT,9.0,0.0,0.0,0.0,3.0,right_only
22046.0,,,,,,,,,,rominda01,1999.0,TBL,3.0,0.0,1.0,1.0,0.0,right_only
14328.0,,,,,,,,,,lefebgu01,2002.0,PIT,12.0,2.0,4.0,6.0,14.0,right_only
23139.0,,,,,,,,,,seiliri01,1985.0,BUF,69.0,12.0,13.0,25.0,85.0,right_only
23135.0,,,,,,,,,,seiliri01,1981.0,BUF,57.0,22.0,25.0,47.0,112.0,right_only
,Russ,Adam,C,1961.0,5.0,5.0,Canada,ON,Windsor,adamru01,,,,,,,,left_only
23140.0,,,,,,,,,,seiliri01,1986.0,DET,74.0,3.0,8.0,11.0,35.0,right_only
23136.0,,,,,,,,,,seiliri01,1982.0,BUF,75.0,19.0,22.0,41.0,127.0,right_only
5971.0,,,,,,,,,,donatcl01,1989.0,MNS,25.0,3.0,3.0,6.0,25.0,right_only
23138.0,,,,,,,,,,seiliri01,1984.0,BUF,73.0,16.0,15.0,31.0,118.0,right_only


In [44]:
# validate="1:m" checks that the merge keys are unique in the left table.
master.merge(scoring, left_index=True, right_on="playerID", validate="1:m").head()


Unnamed: 0,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity,playerID,year,tmID,GP,G,A,Pts,SOG
0,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1997,ANA,3.0,0.0,0.0,0.0,1.0
1,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1998,ANA,73.0,3.0,5.0,8.0,61.0
2,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1999,ANA,63.0,7.0,11.0,18.0,102.0
3,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,2000,ANA,12.0,1.0,1.0,2.0,18.0
4,Justin,Abdelkader,L,1987.0,2.0,25.0,USA,MI,Muskegon,abdelju01,2007,DET,2.0,0.0,0.0,0.0,6.0


In [45]:
# Demonstrate a failed validation: playerID repeats in `scoring`, so a
# one-to-one merge is rejected. Only the merge-validation error is caught;
# any unrelated failure (e.g. a missing column) still surfaces normally.
try:
    pd.merge(master, scoring, left_index=True, right_on='playerID',
             validate="1:1")
except pd.errors.MergeError as e:
    print(e)


Merge keys are not unique in right dataset; not a one-to-one merge


In [46]:
# Rebuild the merged table from the full `master` (default inner join).
merged = master.merge(scoring, left_index=True, right_on="playerID")
merged.head()


Unnamed: 0,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity,playerID,year,tmID,GP,G,A,Pts,SOG
0,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1997,ANA,3.0,0.0,0.0,0.0,1.0
1,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1998,ANA,73.0,3.0,5.0,8.0,61.0
2,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,1999,ANA,63.0,7.0,11.0,18.0,102.0
3,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta,aaltoan01,2000,ANA,12.0,1.0,1.0,2.0,18.0
4,Justin,Abdelkader,L,1987.0,2.0,25.0,USA,MI,Muskegon,abdelju01,2007,DET,2.0,0.0,0.0,0.0,6.0


In [47]:
# List the column labels of the merged table.
merged.columns


Index(['firstName', 'lastName', 'pos', 'birthYear', 'birthMon', 'birthDay',
       'birthCountry', 'birthState', 'birthCity', 'playerID', 'year', 'tmID',
       'GP', 'G', 'A', 'Pts', 'SOG'],
      dtype='object')

In [50]:
# Negative lookahead: keep every column whose name does not start with "birth".
merged = merged.filter(regex="^(?!(birth)).*", axis="columns")
merged.head()

Unnamed: 0,firstName,lastName,pos,playerID,year,tmID,GP,G,A,Pts,SOG
0,Antti,Aalto,C,aaltoan01,1997,ANA,3.0,0.0,0.0,0.0,1.0
1,Antti,Aalto,C,aaltoan01,1998,ANA,73.0,3.0,5.0,8.0,61.0
2,Antti,Aalto,C,aaltoan01,1999,ANA,63.0,7.0,11.0,18.0,102.0
3,Antti,Aalto,C,aaltoan01,2000,ANA,12.0,1.0,1.0,2.0,18.0
4,Justin,Abdelkader,L,abdelju01,2007,DET,2.0,0.0,0.0,0.0,6.0


In [51]:
# Persist the merged scoring table under data/modified, as pickle and as CSV.
merged.to_pickle(path=os.path.join('data', 'modified', 'scoring_merged.pickle'))
merged.to_csv(path_or_buf=os.path.join('data', 'modified', 'scoring_merged.csv'))


In [52]:
# Look at two random rows of the team table.
teams.sample(n=2)


Unnamed: 0,year,tmID,name
988,1992,MNS,Minnesota North Stars
849,1985,TOR,Toronto Maple Leafs


In [53]:
# Look at two random rows of the team splits table.
team_splits.sample(n=2)


Unnamed: 0,year,tmID,SepW,SepL,SepT,SepOL,OctW,OctL,OctT,OctOL,...,FebT,FebOL,MarW,MarL,MarT,MarOL,AprW,AprL,AprT,AprOL
1020,1993,PHI,,,,,9.0,3.0,0.0,,...,1.0,,4.0,7.0,3.0,,2.0,1.0,3.0,
1382,2007,LAK,1.0,1.0,,0.0,5.0,6.0,,0.0,...,,1.0,5.0,6.0,,3.0,1.0,2.0,,0.0


In [54]:
# Reduce `teams` to its distinct (tmID, name) pairs.
teams2 = teams[['tmID', 'name']].drop_duplicates()


In [55]:
# A count above 1 means a single tmID is paired with more than one name.
teams2['tmID'].value_counts().head(n=5)


CHI    2
ANA    1
QUE    1
NJD    1
NYI    1
Name: tmID, dtype: int64

In [56]:
# Inspect the names recorded for tmID "CHI".
teams2[teams2['tmID'] == "CHI"]


Unnamed: 0,tmID,name
730,CHI,Chicago Black Hawks
856,CHI,Chicago Blackhawks


In [57]:
# Narrow both tables to tmID 'CHI' and keep just two rows of splits. The
# sample is seeded so the same two rows are picked on every run, which keeps
# the merge in the following cell reproducible.
teams2 = teams2[teams2['tmID'] == 'CHI']
team_splits2 = team_splits[team_splits['tmID'] == 'CHI'].sample(2, random_state=0)
team_splits2


Unnamed: 0,year,tmID,SepW,SepL,SepT,SepOL,OctW,OctL,OctT,OctOL,...,FebT,FebOL,MarW,MarL,MarT,MarOL,AprW,AprL,AprT,AprOL
793,1983,CHI,,,,,8.0,4.0,0.0,,...,2.0,,4.0,9.0,0.0,,1.0,0.0,0.0,
919,1989,CHI,,,,,9.0,5.0,1.0,,...,0.0,,5.0,8.0,2.0,,1.0,0.0,0.0,


In [59]:
# Joining on tmID alone pairs every row of `teams2` with every row of
# `team_splits2` that shares that tmID.
teams2.merge(team_splits2, on='tmID')


Unnamed: 0,tmID,name,year,SepW,SepL,SepT,SepOL,OctW,OctL,OctT,...,FebT,FebOL,MarW,MarL,MarT,MarOL,AprW,AprL,AprT,AprOL
0,CHI,Chicago Black Hawks,1983,,,,,8.0,4.0,0.0,...,2.0,,4.0,9.0,0.0,,1.0,0.0,0.0,
1,CHI,Chicago Black Hawks,1989,,,,,9.0,5.0,1.0,...,0.0,,5.0,8.0,2.0,,1.0,0.0,0.0,
2,CHI,Chicago Blackhawks,1983,,,,,8.0,4.0,0.0,...,2.0,,4.0,9.0,0.0,,1.0,0.0,0.0,
3,CHI,Chicago Blackhawks,1989,,,,,9.0,5.0,1.0,...,0.0,,5.0,8.0,2.0,,1.0,0.0,0.0,


In [60]:
# Join on both key columns, naming them separately for each side.
team_splits.merge(
    teams,
    left_on=['tmID', 'year'],
    right_on=['tmID', 'year'],
).head(4)


Unnamed: 0,year,tmID,SepW,SepL,SepT,SepOL,OctW,OctL,OctT,OctOL,...,FebOL,MarW,MarL,MarT,MarOL,AprW,AprL,AprT,AprOL,name
0,1980,BOS,,,,,2.0,7.0,1.0,,...,,8.0,4.0,3.0,,1.0,2.0,0.0,,Boston Bruins
1,1980,BUF,,,,,5.0,2.0,2.0,,...,,8.0,6.0,1.0,,1.0,1.0,1.0,,Buffalo Sabres
2,1980,CAL,,,,,5.0,4.0,2.0,,...,,6.0,6.0,2.0,,2.0,0.0,0.0,,Calgary Flames
3,1980,CHI,,,,,6.0,4.0,2.0,,...,,4.0,5.0,4.0,,1.0,1.0,1.0,,Chicago Black Hawks


In [68]:
# left_on and right_on are paired position by position, so listing the same
# columns in a different order compares tmID with year and year with tmID.
try:
    result = pd.merge(
        left=team_splits,
        right=teams,
        left_on=['tmID', 'year'],
        right_on=['year', 'tmID'],
    ).head(4)
    print(result)
except Exception as e:
    print(e)

Empty DataFrame
Columns: [year_x, tmID_x, SepW, SepL, SepT, SepOL, OctW, OctL, OctT, OctOL, NovW, NovL, NovT, NovOL, DecW, DecL, DecT, DecOL, JanW, JanL, JanT, JanOL, FebW, FebL, FebT, FebOL, MarW, MarL, MarT, MarOL, AprW, AprL, AprT, AprOL, year_y, tmID_y, name]
Index: []

[0 rows x 37 columns]


In [66]:
# Inspect the column dtypes of `team_splits`, including the key columns year and tmID.
team_splits.dtypes


year        int64
tmID     category
SepW      float64
SepL      float64
SepT      float64
SepOL     float64
OctW      float64
OctL      float64
OctT      float64
OctOL     float64
NovW      float64
NovL      float64
NovT      float64
NovOL     float64
DecW      float64
DecL      float64
DecT      float64
DecOL     float64
JanW      float64
JanL      float64
JanT      float64
JanOL     float64
FebW      float64
FebL      float64
FebT      float64
FebOL     float64
MarW      float64
MarL      float64
MarT      float64
MarOL     float64
AprW      float64
AprL      float64
AprT      float64
AprOL     float64
dtype: object

In [67]:
# NOTE(review): this repeats the previous cell verbatim; presumably `teams.dtypes`
# was intended, to compare the key dtypes of both tables — confirm and re-run.
team_splits.dtypes


year        int64
tmID     category
SepW      float64
SepL      float64
SepT      float64
SepOL     float64
OctW      float64
OctL      float64
OctT      float64
OctOL     float64
NovW      float64
NovL      float64
NovT      float64
NovOL     float64
DecW      float64
DecL      float64
DecT      float64
DecOL     float64
JanW      float64
JanL      float64
JanT      float64
JanOL     float64
FebW      float64
FebL      float64
FebT      float64
FebOL     float64
MarW      float64
MarL      float64
MarT      float64
MarOL     float64
AprW      float64
AprL      float64
AprT      float64
AprOL     float64
dtype: object

In [69]:
# When the key columns share names in both tables, a single `on=` list suffices.
team_splits.merge(teams, on=["tmID", 'year']).head(4)


Unnamed: 0,year,tmID,SepW,SepL,SepT,SepOL,OctW,OctL,OctT,OctOL,...,FebOL,MarW,MarL,MarT,MarOL,AprW,AprL,AprT,AprOL,name
0,1980,BOS,,,,,2.0,7.0,1.0,,...,,8.0,4.0,3.0,,1.0,2.0,0.0,,Boston Bruins
1,1980,BUF,,,,,5.0,2.0,2.0,,...,,8.0,6.0,1.0,,1.0,1.0,1.0,,Buffalo Sabres
2,1980,CAL,,,,,5.0,4.0,2.0,,...,,6.0,6.0,2.0,,2.0,0.0,0.0,,Calgary Flames
3,1980,CHI,,,,,6.0,4.0,2.0,,...,,4.0,5.0,4.0,,1.0,1.0,1.0,,Chicago Black Hawks


In [76]:
# Same merge, first five rows. To check that each (tmID, year) pair occurs
# only once, call .value_counts() on the merged [['tmID', 'year']] columns.
pd.merge(left=team_splits, right=teams, on=["tmID", 'year']).head(n=5)

Unnamed: 0,year,tmID,SepW,SepL,SepT,SepOL,OctW,OctL,OctT,OctOL,...,FebOL,MarW,MarL,MarT,MarOL,AprW,AprL,AprT,AprOL,name
0,1980,BOS,,,,,2.0,7.0,1.0,,...,,8.0,4.0,3.0,,1.0,2.0,0.0,,Boston Bruins
1,1980,BUF,,,,,5.0,2.0,2.0,,...,,8.0,6.0,1.0,,1.0,1.0,1.0,,Buffalo Sabres
2,1980,CAL,,,,,5.0,4.0,2.0,,...,,6.0,6.0,2.0,,2.0,0.0,0.0,,Calgary Flames
3,1980,CHI,,,,,6.0,4.0,2.0,,...,,4.0,5.0,4.0,,1.0,1.0,1.0,,Chicago Black Hawks
4,1980,COR,,,,,4.0,4.0,2.0,,...,,4.0,9.0,1.0,,0.0,1.0,2.0,,Colorado Rockies


In [81]:
# Joining on tmID alone leaves a `year` column in each table; the suffixes
# label which table each copy came from.
(
    pd.merge(team_splits, teams, on=['tmID'], suffixes=('_team_splits', '_teams'))
    .head(4)
    .filter(like="year")
)


Unnamed: 0,year_team_splits,year_teams
0,1980,1980
1,1980,1981
2,1980,1982
3,1980,1983


In [82]:
# Final team-level table: the splits joined to team names on (tmID, year).
merged = team_splits.merge(teams, on=['tmID', 'year'])
merged.head()


Unnamed: 0,year,tmID,SepW,SepL,SepT,SepOL,OctW,OctL,OctT,OctOL,...,FebOL,MarW,MarL,MarT,MarOL,AprW,AprL,AprT,AprOL,name
0,1980,BOS,,,,,2.0,7.0,1.0,,...,,8.0,4.0,3.0,,1.0,2.0,0.0,,Boston Bruins
1,1980,BUF,,,,,5.0,2.0,2.0,,...,,8.0,6.0,1.0,,1.0,1.0,1.0,,Buffalo Sabres
2,1980,CAL,,,,,5.0,4.0,2.0,,...,,6.0,6.0,2.0,,2.0,0.0,0.0,,Calgary Flames
3,1980,CHI,,,,,6.0,4.0,2.0,,...,,4.0,5.0,4.0,,1.0,1.0,1.0,,Chicago Black Hawks
4,1980,COR,,,,,4.0,4.0,2.0,,...,,4.0,9.0,1.0,,0.0,1.0,2.0,,Colorado Rockies


In [83]:
# Persist the merged team splits under data/modified, as pickle and as CSV.
merged.to_pickle(path=os.path.join('data', 'modified', 'team_splits_merged.pickle'))
merged.to_csv(path_or_buf=os.path.join('data', 'modified', 'team_splits_merged.csv'))
