In [1]:
import os
from io import StringIO

from gazpacho import get, Soup

In [2]:
# Download the CapFriendly landing page; `html` is handed to Soup in the next cell.
url = 'https://www.capfriendly.com/'
html = get(url)

In [3]:
# Wrap the downloaded markup in a Soup object so elements can be looked up with .find().
soup = Soup(html)

In [4]:
# Grab the table matched by id "ich"; the per-team rows are extracted from it below.
table = soup.find('table', {'id': 'ich'})

In [5]:
# Peek at the first 100 characters of the markup to check which table was matched.
str(table)[:100]

'<table id="ich" class="sortablex tblcf index tbl sortable"><thead><tr class="column_head"><th class='

In [6]:
# Collect the team rows (class "tmx") of the cap table.
# mode='all' makes find() return a list no matter how many rows match, so the
# trs[0] lookup and the iteration over trs in later cells do not depend on the
# automatic mode's match-count-dependent return type.
trs = table.find('tr', {'class': 'tmx'}, mode='all')

In [7]:
# Keep the first row as a sample for working out the parsing logic in the next cells.
tr = trs[0]

In [8]:
# The text of the first link in the row is the team name.
tr.find('a', mode='first').text

'Arizona Coyotes'

In [9]:
# The cap figure is the text of the <td> labelled "PROJECTED CAP HIT".
# NOTE(review): strict=True presumably demands an exact attribute match - confirm against gazpacho docs.
tr.find('td', {'data-label': 'PROJECTED CAP HIT'}, strict=True).text

'$85,064,336'

In [10]:
def parse_tr(tr):
    """Pull the team name and projected cap hit out of one table row.

    Parameters
    ----------
    tr : object exposing ``find(tag, attrs, mode=..., strict=...)`` whose
        result has a ``.text`` attribute (a gazpacho ``Soup`` row here).

    Returns
    -------
    tuple
        ``(team, cap)`` where ``team`` is the text of the row's first link
        and ``cap`` is the "PROJECTED CAP HIT" cell converted to ``float``.
    """
    name_link = tr.find('a', mode='first')
    team_name = name_link.text

    cap_cell = tr.find('td', {'data-label': 'PROJECTED CAP HIT'}, strict=True)
    # Drop the currency symbol and thousands separators before converting.
    strip_money_chars = str.maketrans('', '', '$,')
    cap_value = float(cap_cell.text.translate(strip_money_chars))

    return team_name, cap_value

In [11]:
# Parse every team row into a (team, cap) tuple.
cap_hits = list(map(parse_tr, trs))

In [12]:
# Download Hockey-Reference's playoff probabilities page.
# NOTE: this rebinds `url` and `html`, replacing the CapFriendly values from the earlier cells.
url = 'https://www.hockey-reference.com/friv/playoff_prob.fcgi'
html = get(url)

In [13]:
# Re-parse: `soup` now wraps the Hockey-Reference markup, no longer the CapFriendly page.
soup = Soup(html)

In [14]:
import pandas as pd

In [15]:
# The first two tables on the page are taken as the east and west standings.
# - find() is called once with mode='all' so a list is always returned.
# - Each table's markup is wrapped in StringIO because newer pandas releases
#   deprecate passing literal HTML strings to read_html.
east, west = (
    pd.read_html(StringIO(str(standings)))[0]
    for standings in soup.find('table', mode='all')[:2]
)

In [16]:
# Stack both conferences and keep only the team name and the W column.
df = pd.concat([east, west])[['Team', 'W']].reset_index(drop=True)
# Convert the whole column in one vectorised call; entries that are not
# numbers become NaN (errors='coerce') and are removed by dropna() below.
df['W'] = pd.to_numeric(df['W'], errors='coerce')
wins = df.dropna()

In [17]:
# Turn the scraped (team, cap) tuples into a frame keyed by team name.
cap_hits = pd.DataFrame(cap_hits, columns=['Team', 'spend'])
# Left join keeps every team that has a win figure, even without a cap match.
df = wins.merge(cap_hits, how='left', on='Team')
# mpw = spend per win, expressed in millions and rounded to two decimals.
df['mpw'] = (df['spend'] / df['W'] / 1_000_000).round(2)
df.sort_values(by='mpw')

Unnamed: 0,Team,W,spend,mpw
9,New York Islanders,49.2,75499829.0,1.53
16,Colorado Avalanche,48.7,75122369.0,1.54
8,Washington Capitals,50.0,81007358.0,1.62
10,Carolina Hurricanes,47.1,79972980.0,1.7
0,Boston Bruins,46.4,82569596.0,1.78
11,Pittsburgh Penguins,45.6,81154248.0,1.78
19,Winnipeg Jets,42.9,76906546.0,1.79
17,St. Louis Blues,46.3,83095234.0,1.79
1,Tampa Bay Lightning,43.3,79253716.0,1.83
18,Dallas Stars,44.2,81825021.0,1.85


In [18]:
from IPython.display import HTML
# Embed a remote GIF in the cell output; nothing else in the notebook uses it.
HTML('<img src="https://media.giphy.com/media/oOX5qIDkzDjeo/giphy.gif">')

### Saving results

In [19]:
# Create the output folder if needed so the notebook runs on a fresh checkout;
# to_csv raises when the target directory does not exist.
os.makedirs('data', exist_ok=True)
df.to_csv('data/mpw.csv', index=False)

In [20]:
# Preview the first rows of the merged frame.
df.head()

Unnamed: 0,Team,W,spend,mpw
0,Boston Bruins,46.4,82569596.0,1.78
1,Tampa Bay Lightning,43.3,79253716.0,1.83
2,Florida Panthers,41.7,80982506.0,1.94
3,Montreal Canadiens,40.5,77290175.0,1.91
4,Toronto Maple Leafs,41.4,94333187.0,2.28


In [21]:
# Stamp every row with the time of this run; presumably so snapshots appended
# to the database on different runs can be told apart.
df['date_fetched'] = pd.Timestamp('today')

In [22]:
# Check that the date_fetched column was added.
df.head()

Unnamed: 0,Team,W,spend,mpw,date_fetched
0,Boston Bruins,46.4,82569596.0,1.78,2019-12-20 14:48:02.649158
1,Tampa Bay Lightning,43.3,79253716.0,1.83,2019-12-20 14:48:02.649158
2,Florida Panthers,41.7,80982506.0,1.94,2019-12-20 14:48:02.649158
3,Montreal Canadiens,40.5,77290175.0,1.91,2019-12-20 14:48:02.649158
4,Toronto Maple Leafs,41.4,94333187.0,2.28,2019-12-20 14:48:02.649158


In [23]:
import sqlite3

# Connect to the SQLite file under data/.
con = sqlite3.connect('data/mpw.db')

# Rows are appended to the "teams" table, so each execution of this cell adds
# another copy of the current frame; the frame's index is not stored.
df.to_sql(name='teams', con=con, if_exists='append', index=False)

In [24]:
# Read back the rows with mpw above 2 and W below 41, highest mpw first.
pd.read_sql(
    sql='''
    select 
    * 
    from teams 
    where mpw > 2 and W < 41
    order by mpw desc
''',
    con=con,
)

Unnamed: 0,Team,W,spend,mpw,date_fetched
0,Detroit Red Wings,25.6,80176672.0,3.13,2019-12-20 14:48:02.649158
1,New Jersey Devils,31.8,75997114.0,2.39,2019-12-20 14:48:02.649158
2,Chicago Blackhawks,36.2,82479656.0,2.28,2019-12-20 14:48:02.649158
3,San Jose Sharks,36.3,81273913.0,2.24,2019-12-20 14:48:02.649158
4,Anaheim Ducks,36.1,78927337.0,2.19,2019-12-20 14:48:02.649158
5,Los Angeles Kings,35.3,76707496.0,2.17,2019-12-20 14:48:02.649158
6,Buffalo Sabres,39.0,83946056.0,2.15,2019-12-20 14:48:02.649158
7,Vancouver Canucks,40.1,83315460.0,2.08,2019-12-20 14:48:02.649158
8,Ottawa Senators,36.8,75730715.0,2.06,2019-12-20 14:48:02.649158
9,Calgary Flames,40.0,82012291.0,2.05,2019-12-20 14:48:02.649158
