In [39]:
import sys

# Raw string: '\A', '\L' and '\s' are not valid escape sequences, so the
# non-raw literal only produced the right path by accident and triggers an
# invalid-escape warning on newer Python versions.
# NOTE(review): hard-coded, machine-specific path — only useful when the
# kernel cannot already see the Anaconda site-packages directory.
ANACONDA_SITE_PACKAGES = r'C:\Anaconda3\Lib\site-packages'
if ANACONDA_SITE_PACKAGES not in sys.path:
    # The guard keeps sys.path free of duplicates when the cell is re-run.
    sys.path.append(ANACONDA_SITE_PACKAGES)

#import warnings
#warnings.filterwarnings('ignore')

Getting the current NHL captains (including alternate captains) for each team.

In [40]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Scrape data from wiki-pages

In [41]:
url = 'https://en.wikipedia.org/wiki/List_of_current_NHL_captains_and_alternate_captains'
# A timeout stops the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of silently parsing an error page below.
page.raise_for_status()
print('Got %s type object of url using the requests library' % str(type(page)))

Got <class 'requests.models.Response'> type object of url using the requests library


In [42]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, 'html.parser')
# '%s' applies str() to its argument, so the types can be passed directly.
print('Fed %s type object into BeautifulSoup to create a %s type object'
      % (type(page.content), type(soup)))

Fed <class 'bytes'> type object into BeautifulSoup to create a <class 'bs4.BeautifulSoup'> type object


In [43]:
# Collect every <table> element on the page.
# find_all is the current name of the deprecated findAll alias.
page_tables = soup.find_all('table')
print('Got %s type object of length %d' % (str(type(page_tables)), len(page_tables)))

Got <class 'bs4.element.ResultSet'> type object of length 11


Each table is an object and they can be iterated over.

In [44]:
# Show each table's <caption> (None when a table has no caption) to identify
# which tables hold the captain / alternate captain lists.
for caption in (tbl.find('caption') for tbl in page_tables):
    print(caption)

<caption>Position abbreviations</caption>
<caption>List of current NHL Captains</caption>
<caption>List of current NHL Alternate Captains</caption>
None
None
None
None
None
None
None
None


In [45]:
# Get the captains
# Every third <td> of the captains table (starting with the first) is treated
# as the player cell; the player's name is the text of its first link.
C_players = []
for cell in page_tables[1].find_all('td')[::3]:
    links = cell.find_all('a')
    if links:
        C_players.append(links[0].text)
    else:
        # N/A entry: the cell has no link at all (e.g. a vacant captaincy).
        # Testing for the link explicitly replaces the former bare `except:`,
        # which would also have hidden unrelated errors.
        print('Skipping entry:', cell.contents)

C_players

Skipping entry: ['\n', <p><i>Vacant</i></p>, '\n']
Skipping entry: ['\n', <p><i>Vacant</i></p>, '\n']
Skipping entry: ['\n', <p><i>Vacant</i></p>, '\n']
Skipping entry: ['\n', <p><i>Vacant</i></p>, '\n']


['Ryan Getzlaf',
 'Shane Doan',
 'Zdeno Chara',
 'Brian Gionta',
 'Mark Giordano',
 'Jonathan Toews',
 'Gabriel Landeskog',
 'Nick Foligno',
 'Jamie Benn',
 'Henrik Zetterberg',
 'Anze Kopitar',
 'Mikko Koivu',
 'Max Pacioretty',
 'Mike Fisher',
 'Andy Greene',
 'John Tavares',
 'Ryan McDonagh',
 'Erik Karlsson',
 'Claude Giroux',
 'Sidney Crosby',
 'Alex Pietrangelo',
 'Joe Pavelski',
 'Steven Stamkos',
 'Henrik Sedin',
 'Alexander Ovechkin',
 'Blake Wheeler']

In [46]:
# Get the alternate captains
# Every third <td> of the alternates table (starting with the first) is
# treated as the player cell; the player's name is the text of its first link.
A_players = []
for cell in page_tables[2].find_all('td')[::3]:
    links = cell.find_all('a')
    # N/A entry: either the cell has no link at all, or its first link is the
    # '[a]' footnote marker rather than a player name.
    # Deciding before appending avoids the former `A_players[-1]` look-back,
    # which raised IndexError when the very first entry had to be skipped.
    if not links or links[0].text == '[a]':
        print('Skipping entry:', cell.contents)
        continue
    A_players.append(links[0].text)

A_players

Skipping entry: [<i>Vacant</i>]
Skipping entry: ['rotating', <sup class="reference" id="cite_ref-Rotating_63-0"><a href="#cite_note-Rotating-63">[a]</a></sup>, <sup class="reference" id="cite_ref-blues_64-0"><a href="#cite_note-blues-64">[63]</a></sup>]
Skipping entry: ['rotating', <sup class="reference" id="cite_ref-Rotating_63-1"><a href="#cite_note-Rotating-63">[a]</a></sup>]


['Corey Perry',
 'Ryan Kesler',
 'Oliver Ekman-Larsson',
 'Martin Hanzal',
 'Patrice Bergeron',
 'David Krejci',
 'Tyler Ennis',
 'Josh Gorges',
 "Ryan O'Reilly",
 'Sean Monahan',
 'Dennis Wideman',
 'Justin Faulk',
 'Jordan Staal',
 'Duncan Keith',
 'Brent Seabrook',
 'Jarome Iginla',
 'Cody McLeod',
 'Brandon Dubinsky',
 'Boone Jenner',
 'Tyler Seguin',
 'Niklas Kronwall',
 'Jordan Eberle',
 'Andrew Ference',
 'Ryan Nugent-Hopkins',
 'Jussi Jokinen',
 'Derek MacKenzie',
 'Drew Doughty',
 'Zach Parise',
 'Ryan Suter',
 'Brendan Gallagher',
 'Andrei Markov',
 'Tomas Plekanec',
 'Shea Weber',
 'Roman Josi',
 'James Neal',
 'Michael Cammalleri',
 'Patrik Elias',
 'Adam Henrique',
 'Travis Zajac',
 'Daniel Girardi',
 'Marc Staal',
 'Derek Stepan',
 'Chris Neil',
 'Kyle Turris',
 'Wayne Simmonds',
 'Mark Streit',
 'Chris Kunitz',
 'Evgeni Malkin',
 'Logan Couture',
 'Joe Thornton',
 'Ryan Callahan',
 'Tyler Bozak',
 'Matt Hunwick',
 'Alexandre Burrows',
 'Daniel Sedin',
 'Nicklas Backstrom

In [47]:
# Combined number of scraped captains and alternate captains.
sum(len(group) for group in (C_players, A_players))

85

Getting the 2016 World Cup of Hockey rosters.

In [48]:
url = 'https://en.wikipedia.org/wiki/2016_World_Cup_of_Hockey_rosters'
# A timeout stops the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of silently parsing an error page below.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [49]:
# Collect every <table> element on the rosters page.
# find_all is the current name of the deprecated findAll alias.
page_tables = soup.find_all('table')

In [50]:
# Team names come from the first four <li> entries of the second and third
# <ul> elements on the page. The document is scanned for <ul> once, instead
# of once per team as before.
page_lists = soup.find_all('ul')
teams = []
for team_list in (page_lists[1], page_lists[2]):
    entries = team_list.find_all('li')
    # Drop the first whitespace-separated token of each entry and keep the
    # rest as the team name (presumably a numbering prefix — TODO confirm
    # against the page).
    teams.extend(' '.join(entries[i].text.split()[1:]) for i in range(4))
teams

['Canada',
 'Czech Republic',
 'Team Europe',
 'United States',
 'Finland',
 'Team North America',
 'Russia',
 'Sweden']

In [51]:
# Spot check: text of the second link in the first row after the header row
# of the sixth table (find_all replaces the deprecated findAll alias).
page_tables[5].find_all('tr')[1].find_all('a')[1].text

'John Gibson'

In [52]:
# Build one list of player names per roster table.
# Maps a table's position on the page to the position of the player-name link
# within each of its rows; tables not listed here are ignored.
name_link_by_table = {0: 0, 1: 0, 2: 1, 3: 0, 4: 0, 5: 1, 6: 0, 7: 0}

players = []

for i, table in enumerate(page_tables):

    index = name_link_by_table.get(i)
    if index is None:
        # Skip to the next table
        continue

    p = []
    for row in table.find_all('tr'):
        links = row.find_all('a')
        if len(links) > index:
            p.append(links[index].text)
        else:
            # N/A entry: the row has too few links (e.g. the header row).
            # Checking the length replaces the former bare `except:`, which
            # would also have hidden unrelated errors.
            print('Skipping entry:', row.contents)

    players.append(p)

players

Skipping entry: ['\n', <th style="width: 4%;">No.</th>, '\n', <th style="width: 4%;">Pos.</th>, '\n', <th style="width: 14%;">Name</th>, '\n', <th style="width: 12%;">Height</th>, '\n', <th style="width: 11%;">Weight</th>, '\n', <th style="width: 14%;">Birthdate</th>, '\n', <th style="width: 16%;">Team</th>, '\n']
Skipping entry: ['\n', <th style="width: 4%;">No.</th>, '\n', <th style="width: 4%;">Pos.</th>, '\n', <th style="width: 14%;">Name</th>, '\n', <th style="width: 12%;">Height</th>, '\n', <th style="width: 11%;">Weight</th>, '\n', <th style="width: 14%;">Birthdate</th>, '\n', <th style="width: 16%;">Team</th>, '\n']
Skipping entry: ['\n', <th style="width: 4%;">No.</th>, '\n', <th style="width: 4%;">Pos.</th>, '\n', <th style="width: 14%;">Name</th>, '\n', <th style="width: 12%;">Height</th>, '\n', <th style="width: 11%;">Weight</th>, '\n', <th style="width: 14%;">Birthdate</th>, '\n', <th style="width: 16%;">Team</th>, '\n']
Skipping entry: ['\n', <th style="width: 4%;">No.</t

[['Corey Crawford',
  'Braden Holtby',
  'Carey Price',
  'Jay Bouwmeester',
  'Brent Burns',
  'Drew Doughty',
  'Jake Muzzin',
  'Alex Pietrangelo',
  'Marc-Édouard Vlasic',
  'Shea Weber',
  'Patrice Bergeron',
  'Logan Couture',
  'Sidney Crosby',
  'Matt Duchene',
  'Ryan Getzlaf',
  'Claude Giroux',
  'Brad Marchand',
  "Ryan O'Reilly",
  'Corey Perry',
  'Steven Stamkos',
  'John Tavares',
  'Joe Thornton',
  'Jonathan Toews'],
 ['Petr Mrázek',
  'Michal Neuvirth',
  'Ondřej Pavelec',
  'Michal Jordán',
  'Michal Kempný',
  'Tomáš Kundrátek',
  'Zbyněk Michálek',
  'Jakub Nakládal',
  'Roman Polák',
  'Andrej Šustr',
  'Michal Birner',
  'Roman Červenka',
  'Radek Faksa',
  'Michael Frolík',
  'Martin Hanzal',
  'Aleš Hemský',
  'Dmitri Jaškin',
  'Milan Michálek',
  'Ondřej Palát',
  'David Pastrňák',
  'Tomáš Plekanec',
  'Vladimír Sobotka',
  'Jakub Voráček'],
 ['Thomas Greiss',
  'Philipp Grubauer',
  'Jaroslav Halák',
  'Zdeno Chára',
  'Christian Ehrhoff',
  'Roman Josi',


In [53]:
# Transliterate every roster name to plain ASCII (e.g. 'Chára' -> 'Chara')
# so the names can be compared with the scraped captain lists.
from unidecode import unidecode
players = [list(map(unidecode, roster)) for roster in players]
players

[['Corey Crawford',
  'Braden Holtby',
  'Carey Price',
  'Jay Bouwmeester',
  'Brent Burns',
  'Drew Doughty',
  'Jake Muzzin',
  'Alex Pietrangelo',
  'Marc-Edouard Vlasic',
  'Shea Weber',
  'Patrice Bergeron',
  'Logan Couture',
  'Sidney Crosby',
  'Matt Duchene',
  'Ryan Getzlaf',
  'Claude Giroux',
  'Brad Marchand',
  "Ryan O'Reilly",
  'Corey Perry',
  'Steven Stamkos',
  'John Tavares',
  'Joe Thornton',
  'Jonathan Toews'],
 ['Petr Mrazek',
  'Michal Neuvirth',
  'Ondrej Pavelec',
  'Michal Jordan',
  'Michal Kempny',
  'Tomas Kundratek',
  'Zbynek Michalek',
  'Jakub Nakladal',
  'Roman Polak',
  'Andrej Sustr',
  'Michal Birner',
  'Roman Cervenka',
  'Radek Faksa',
  'Michael Frolik',
  'Martin Hanzal',
  'Ales Hemsky',
  'Dmitri Jaskin',
  'Milan Michalek',
  'Ondrej Palat',
  'David Pastrnak',
  'Tomas Plekanec',
  'Vladimir Sobotka',
  'Jakub Voracek'],
 ['Thomas Greiss',
  'Philipp Grubauer',
  'Jaroslav Halak',
  'Zdeno Chara',
  'Christian Ehrhoff',
  'Roman Josi',


In [54]:
# Team names and rosters line up by position, so zip pairs them;
# print the first two names of each roster as a preview.
for team_name, roster in zip(teams, players):
    preview = roster[:2]
    print(team_name, preview, '...')

Canada ['Corey Crawford', 'Braden Holtby'] ...
Czech Republic ['Petr Mrazek', 'Michal Neuvirth'] ...
Team Europe ['Thomas Greiss', 'Philipp Grubauer'] ...
United States ['Ben Bishop', 'Jonathan Quick'] ...
Finland ['Mikko Koskinen', 'Tuukka Rask'] ...
Team North America ['John Gibson', 'Connor Hellebuyck'] ...
Russia ['Sergei Bobrovsky', 'Semyon Varlamov'] ...
Sweden ['Jhonas Enroth', 'Henrik Lundqvist'] ...


### Determine the number of captains per team

In [55]:
def get_captain_number(team):
    ''' Counts how many players of `team` are NHL captains and
    how many are alternate captains.

    Names are looked up in the module-level C_players and A_players
    lists. A name found in C_players is counted as a captain only,
    even if it also appears in A_players.

    Returns a (captain count, alternate count) tuple.'''

    captain_count = alternate_count = 0
    for member in team:
        if member in C_players:
            captain_count += 1
            continue
        # Only reached for players who are not captains.
        if member in A_players:
            alternate_count += 1

    return captain_count, alternate_count

def get_captain_names(team):
    ''' Returns the names of the captains & alternate
    captains found in a given set of players.

    team: iterable of player names.

    Names are looked up in the module-level C_players and A_players
    lists. A name found in C_players is reported as a captain only,
    even if it also appears in A_players.

    Returns a (captain names, alternate names) tuple of lists, each in
    the order the players appear in `team`.'''

    captain_names, alternate_names = [], []
    for player in team:
        if player in C_players:
            captain_names.append(player)
        elif player in A_players:
            alternate_names.append(player)

    return captain_names, alternate_names

How many captains and alternate captains are there in the entire tournament?

In [56]:
# Merge the per-team rosters into one flat list and tally it.
all_players = [name for roster in players for name in roster]
print('(#C, #A)')
print(get_captain_number(all_players))
print('')
print('(%C, %A)')
# Fraction of all scraped captains / alternates that appear in the tournament.
print('%.2f %.2f' % tuple(
    count / len(pool)
    for count, pool in zip(get_captain_number(all_players),
                           (C_players, A_players))))

(#C, #A)
(18, 25)

(%C, %A)
0.69 0.42


In [57]:
# One (captain count, alternate count) tuple per roster in `players`.
N_captains = [get_captain_number(roster) for roster in players]
list(zip(teams, N_captains))

[('Canada', (7, 7)),
 ('Czech Republic', (0, 2)),
 ('Team Europe', (2, 2)),
 ('United States', (4, 6)),
 ('Finland', (1, 1)),
 ('Team North America', (0, 2)),
 ('Russia', (1, 2)),
 ('Sweden', (3, 3))]

In [58]:
# One (captain names, alternate names) tuple per roster in `players`.
captains = [get_captain_names(roster) for roster in players]
list(zip(teams, captains))

[('Canada',
  (['Alex Pietrangelo',
    'Sidney Crosby',
    'Ryan Getzlaf',
    'Claude Giroux',
    'Steven Stamkos',
    'John Tavares',
    'Jonathan Toews'],
   ['Drew Doughty',
    'Shea Weber',
    'Patrice Bergeron',
    'Logan Couture',
    "Ryan O'Reilly",
    'Corey Perry',
    'Joe Thornton'])),
 ('Czech Republic', ([], ['Martin Hanzal', 'Tomas Plekanec'])),
 ('Team Europe',
  (['Zdeno Chara', 'Anze Kopitar'], ['Roman Josi', 'Mark Streit'])),
 ('United States',
  (['Ryan McDonagh', 'Max Pacioretty', 'Joe Pavelski', 'Blake Wheeler'],
   ['Dustin Byfuglien',
    'Ryan Suter',
    'Brandon Dubinsky',
    'Ryan Kesler',
    'Zach Parise',
    'Derek Stepan'])),
 ('Finland', (['Mikko Koivu'], ['Jussi Jokinen'])),
 ('Team North America', ([], ['Ryan Nugent-Hopkins', 'Mark Scheifele'])),
 ('Russia', (['Alexander Ovechkin'], ['Andrei Markov', 'Evgeni Malkin'])),
 ('Sweden',
  (['Erik Karlsson', 'Gabriel Landeskog', 'Henrik Sedin'],
   ['Oliver Ekman-Larsson', 'Nicklas Backstrom', '

### Make bar plot

In [70]:
# Plotting imports for the bar charts below.
# NOTE(review): bokeh.charts (Bar, color, cat, blend) is a legacy high-level
# API; to my knowledge it was split out of bokeh into the separate `bkcharts`
# package — confirm these imports still resolve in the target environment.
# NOTE(review): in the visible cells HoverTool, ColumnDataSource and
# GlyphRenderer appear only in commented-out code, and `figure` is not used.
from bokeh.charts import Bar
from bokeh.charts.attributes import color, cat
from bokeh.charts.operations import blend
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
from bokeh.models.renderers import GlyphRenderer
from bokeh.models import FixedTicker
# Configure bokeh to render its output inside the notebook.
output_notebook()

In [71]:
# One column per team, one row per kind of count.
df = pd.DataFrame(
    {team: counts for team, counts in zip(teams, N_captains)},
    index=['Number of Captains', 'Number of Alternate Captains'],
)

In [72]:
# Turn the frame so each team is a row and each kind of count is a column.
# NOTE: this cell is not idempotent — re-run the previous cell before
# re-running this one, otherwise the frame is transposed back.
df = df.T
# Add the columns by label; indexing a label-indexed row by position
# (x[0] + x[1]) is deprecated in pandas.
df['Total Number of Captains'] = (df['Number of Captains']
                                  + df['Number of Alternate Captains'])
df['Team'] = df.index
# Comma-separated captain / alternate names, looked up by team name.
df['C'] = df['Team'].map({team: ', '.join(names[0])
                          for team, names in zip(teams, captains)})
df['A'] = df['Team'].map({team: ', '.join(names[1])
                          for team, names in zip(teams, captains)})
# Teams with the most captains first (reassigned rather than sorted in place).
df = df.sort_values(by='Total Number of Captains', ascending=False)
df

Unnamed: 0,Number of Captains,Number of Alternate Captains,Total Number of Captains,Team,C,A
Canada,7,7,14,Canada,"Alex Pietrangelo, Sidney Crosby, Ryan Getzlaf,...","Drew Doughty, Shea Weber, Patrice Bergeron, Lo..."
United States,4,6,10,United States,"Ryan McDonagh, Max Pacioretty, Joe Pavelski, B...","Dustin Byfuglien, Ryan Suter, Brandon Dubinsky..."
Sweden,3,3,6,Sweden,"Erik Karlsson, Gabriel Landeskog, Henrik Sedin","Oliver Ekman-Larsson, Nicklas Backstrom, Danie..."
Team Europe,2,2,4,Team Europe,"Zdeno Chara, Anze Kopitar","Roman Josi, Mark Streit"
Russia,1,2,3,Russia,Alexander Ovechkin,"Andrei Markov, Evgeni Malkin"
Czech Republic,0,2,2,Czech Republic,,"Martin Hanzal, Tomas Plekanec"
Finland,1,1,2,Finland,Mikko Koivu,Jussi Jokinen
Team North America,0,2,2,Team North America,,"Ryan Nugent-Hopkins, Mark Scheifele"


In [80]:
# Stacked bar chart: one bar per team, built from the 'Number of Captains'
# and 'Number of Alternate Captains' columns of df.
# NOTE(review): blend(...) presumably combines the two count columns into a
# single value column named 'Number of Captains', recording the source column
# under 'caps' (which the stack and color specs then refer to) — confirm
# against the bokeh.charts documentation.
# NOTE(review): sort=False presumably keeps the row order of df (sorted by
# total above) rather than re-sorting the categories — confirm.
bar = Bar(df,
          values=blend('Number of Captains', 'Number of Alternate Captains',
                       name='Number of Captains', labels_name='caps'),
          stack=cat(columns='caps', sort=False),
          label=cat(columns='Team', sort=False),
          color=color(columns='caps', palette=['OrangeRed', 'Orange'], sort=False),
          title='2016 World Cup of Hockey NHL Captains',
          legend='top_right', plot_width=1000, plot_height=1000)
#           tools='hover')

# Disabled hover-tool experiment, kept for reference.
# hover = bar.select(dict(type=HoverTool))
# hover.tooltips = [('Number' ,'@x')]
# Put a y-axis tick on every integer from 1 to 14.
bar.yaxis[0].ticker=FixedTicker(ticks=np.arange(1, 15, 1))
show(bar)

### List of captains

In [79]:
# Print each team followed by its captain (C) and alternate captain (A) names.
for team_name, captain_list, alternate_list in zip(df['Team'], df['C'], df['A']):
    print(team_name, '\nC -', captain_list, '\nA -', alternate_list, '\n')

Canada 
C - Alex Pietrangelo, Sidney Crosby, Ryan Getzlaf, Claude Giroux, Steven Stamkos, John Tavares, Jonathan Toews 
A - Drew Doughty, Shea Weber, Patrice Bergeron, Logan Couture, Ryan O'Reilly, Corey Perry, Joe Thornton 

United States 
C - Ryan McDonagh, Max Pacioretty, Joe Pavelski, Blake Wheeler 
A - Dustin Byfuglien, Ryan Suter, Brandon Dubinsky, Ryan Kesler, Zach Parise, Derek Stepan 

Sweden 
C - Erik Karlsson, Gabriel Landeskog, Henrik Sedin 
A - Oliver Ekman-Larsson, Nicklas Backstrom, Daniel Sedin 

Team Europe 
C - Zdeno Chara, Anze Kopitar 
A - Roman Josi, Mark Streit 

Russia 
C - Alexander Ovechkin 
A - Andrei Markov, Evgeni Malkin 

Czech Republic 
C -  
A - Martin Hanzal, Tomas Plekanec 

Finland 
C - Mikko Koivu 
A - Jussi Jokinen 

Team North America 
C -  
A - Ryan Nugent-Hopkins, Mark Scheifele 



### Testing bokeh tooltips

In [64]:
# All captain counts first (slot 0), then all alternate counts (slot 1).
[counts[slot] for slot in (0, 1) for counts in N_captains]

[7, 0, 2, 4, 1, 0, 1, 3, 7, 2, 2, 6, 1, 2, 2, 3]

In [65]:
# Hand-written four-row sample frame in "long" format (one row per team and
# captain type) for experimenting with tooltips.
# NOTE: this rebinds `df`; the frame built in the bar-plot section is no
# longer reachable under that name after this cell runs.
df = pd.DataFrame({'Teams': ['Canada', 'Canada', 'USA', 'USA'],
                   'Number': [7, 6, 4, 6],
                   'Type': ['Captains', 'Alternates', 'Captains', 'Alternates'],
                   'Names': ['Alex Pietrangelo, Sidney Crosby, Ryan Getzlaf, Claude Giroux, Steven Stamkos, John Tavares, Jonathan Toews',
                             "Drew Doughty, Patrice Bergeron, Logan Couture, Ryan O'Reilly, Corey Perry, Joe Thornton",
                             'Ryan McDonagh, Max Pacioretty, Joe Pavelski, Blake Wheeler',
                             'Dustin Byfuglien, Ryan Suter, Brandon Dubinsky, Ryan Kesler, Zach Parise, Derek Stepan']})
df

Unnamed: 0,Names,Number,Teams,Type
0,"Alex Pietrangelo, Sidney Crosby, Ryan Getzlaf,...",7,Canada,Captains
1,"Drew Doughty, Patrice Bergeron, Logan Couture,...",6,Canada,Alternates
2,"Ryan McDonagh, Max Pacioretty, Joe Pavelski, B...",4,USA,Captains
3,"Dustin Byfuglien, Ryan Suter, Brandon Dubinsky...",6,USA,Alternates


In [66]:
# Long-format frame: one row per (team, captain type), captains block first,
# then the alternates block, both in `teams` order.
n_teams = len(teams)  # replaces the hard-coded 8
df = pd.DataFrame()
df['Teams'] = teams * 2
df['Number'] = [c[0] for c in N_captains] + [c[1] for c in N_captains]
df['Type'] = ['Captains'] * n_teams + ['Alternates'] * n_teams
df['Names'] = [', '.join(c[0]) for c in captains] +\
              [', '.join(c[1]) for c in captains]
# Per-team total, summing only the numeric 'Number' column; summing the whole
# group would also try to aggregate the string columns.
totals = df.groupby('Teams')['Number'].sum().to_dict()
df['Total'] = df['Teams'].map(totals)
# Teams with the most captains first (reassigned rather than sorted in place).
df = df.sort_values(by='Total', ascending=False)
df

Unnamed: 0,Teams,Number,Type,Names,Total
0,Canada,7,Captains,"Alex Pietrangelo, Sidney Crosby, Ryan Getzlaf,...",14
8,Canada,7,Alternates,"Drew Doughty, Shea Weber, Patrice Bergeron, Lo...",14
3,United States,4,Captains,"Ryan McDonagh, Max Pacioretty, Joe Pavelski, B...",10
11,United States,6,Alternates,"Dustin Byfuglien, Ryan Suter, Brandon Dubinsky...",10
7,Sweden,3,Captains,"Erik Karlsson, Gabriel Landeskog, Henrik Sedin",6
15,Sweden,3,Alternates,"Oliver Ekman-Larsson, Nicklas Backstrom, Danie...",6
2,Team Europe,2,Captains,"Zdeno Chara, Anze Kopitar",4
10,Team Europe,2,Alternates,"Roman Josi, Mark Streit",4
6,Russia,1,Captains,Alexander Ovechkin,3
14,Russia,2,Alternates,"Andrei Markov, Evgeni Malkin",3


In [67]:
# Tooltip experiment: the same stacked bar chart, built from the long-format
# df (bars labelled by 'Teams', stacked and coloured by 'Type', heights from
# the summed 'Number' column).
# source = ColumnDataSource(df)

# NOTE(review): the tooltip is meant to show the 'Names' column via '@Names';
# whether a column that is not part of the aggregation is available to the
# hover tool in bokeh.charts is not evident from this cell — confirm.
bar = Bar(df, label='Teams', values='Number',
          agg='sum', stack='Type',
          color=color(columns='Type', palette=['OrangeRed', 'Orange'], sort=False),
          title='2016 World Cup of Hockey NHL Captains',
          legend='top_right',
          tooltips = [('Players', '@Names')])
# Disabled alternative attempts at configuring the hover tool, kept for reference.
# bar.select(dict(type=GlyphRenderer))
# hover = bar.select(dict(type=HoverTool))
# hover.tooltips = [('Names', '$Names')]
show(bar)