## Notebook Overview

Take the data from notebook 4, and flatten the nested data structures so it can be put into sql-database-friendly form.


In [1]:
# Use w/ Anaconda Distribution
# For data manipulation
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
PROJ_ROOT = os.pardir

# For Scraping
from bs4 import BeautifulSoup
import requests
import time

# For munging
import re
import json

# Backoff time for large scrapes
THROTTLE_TIME = .05

# ipython magics
%load_ext watermark
%matplotlib inline

In [2]:
import missingno as msno

In [3]:
%watermark -a "Cameron Yick" -d -t -v -p pandas,seaborn,matplotlib,bs4 -g

Cameron Yick 2016-11-22 20:45:50 

CPython 2.7.12
IPython 5.1.0

pandas 0.18.1
seaborn 0.7.1
matplotlib 1.5.3
bs4 4.5.1
Git hash: 5843cde364ea00571b167c2c7dcfb5669ce09f28


In [4]:
# Locations of the raw scrape outputs, relative to the project root.
# NOTE(review): RAW_PATH (the JSON export) is not referenced by any cell visible in this
# notebook; only the pickle at RAW_PICK is loaded below.
RAW_PATH = os.path.join(PROJ_ROOT, "data", "raw", "sports.json")
RAW_PICK = os.path.join(PROJ_ROOT, "data", "raw", "harvard_sports.p")

In [5]:
# Load the scraped rosters (one row per sport; columns are listed in the next cell).
# NOTE: unpickling can execute arbitrary code, so only load pickle files this project produced.
sports = pd.read_pickle(RAW_PICK)

## Can we check how the size of a roster has changed each year, for each sport?

A roster is a collection of players for 1 season.

In [6]:
sports.columns

Index([u'rosters', u'sportCode', u'nSeasons'], dtype='object')

We have 36 collections of rosters, one for each sport. On at least 4 of those rosters, both genders are mixed together. (There may be other cases, such as female coxswains on male crew boats.)

In [8]:
len(sports)

36

In [7]:
sports.rosters

0     {u'2008-09': [[{'link': '/sports/bsb/2008-09/r...
1     {u'1989-90': [[{'link': '/sports/mbkb/1989-90/...
2     {u'2008-09': [[{'link': '/sports/mcrew-hw/2008...
3     {u'2008-09': [[{'link': '/sports/mcrew-lw/2008...
4     {u'2008-09': [[{'link': '/sports/xc/2008-09/ro...
5     {u'2008-09': [[{'link': '/sports/mfencing/2008...
6     {u'2008-09': [[{'link': '/sports/fball/2008-09...
7     {u'2008-09': [[{'link': '/sports/mgolf/2008-09...
8     {u'2008-09': [[{'link': '/sports/mice/2008-09/...
9     {u'2008-09': [[{'link': '/sports/mlax/2008-09/...
10    {u'2008-09': [[{'link': '/sports/sailing/2008-...
11    {u'2015-16': [[{'link': '/sports/skiing/2015-1...
12    {u'2016-17': [[{'link': '/sports/msoc/2016-17/...
13    {u'2008-09': [[{'link': '/sports/msquash/2008-...
14    {u'2008-09': [[{'link': '/sports/mswimdive/200...
15    {u'2008-09': [[{'link': '/sports/mten/2008-09/...
16    {u'2008-09': [[{'link': '/sports/track/2008-09...
17    {u'2008-09': [[{'link': '/sports/mvball/20

In [11]:
# The top entry for every table describes what metadata is available for players in that table.
sports.iloc[0]['rosters']['2015-16'][:1]

[[{'link': '/sports/bsb/2015-16/roster?sort=number', 'name': u'No.'},
  {'link': '/sports/bsb/2015-16/roster?sort=last_name', 'name': u'Name'},
  {'link': '/sports/bsb/2015-16/roster?sort=year', 'name': u'Yr.'},
  {'link': '/sports/bsb/2015-16/roster?sort=position', 'name': u'Position'},
  u'B/T',
  u'Ht.',
  u'Wt.',
  u'Hometown',
  u'High School']]

Before we can do anything fun, we need to attach these labels as keys onto the cell row entries.

In [43]:
def getFieldnames(rows):
    """Return the list of column labels taken from the header (first) row of a table.

    Each header cell is either a plain label or a dict; a dict cell
    contributes its 'name' entry, any other cell is used as-is.
    """
    return [cell['name'] if type(cell) is dict else cell for cell in rows[0]]

In [59]:

fish = getFieldnames(sports.iloc[32]['rosters']['2008-09'])
fish

[u'Name', u'Yr.', u'Event', u'Hometown', u'High School']

In [58]:
sports.iloc[32]['rosters']['2008-09'][1]

[u'',
 {'link': '/sports/wswimdive/2008-09/bios/ash_marissa',
  'name': u'Marissa Ash'},
 u'Sophomore',
 u'Diving',
 u'Birmingham, Mich.',
 u'Cranbrook Kingswood']

In [57]:
# Show one field name per line. The enumerate index was never used, and
# print(val) produces the same output under Python 2 and Python 3.
for val in fish:
    print(val)

Name
Yr.
Event
Hometown
High School


In [None]:
# inexplicably, the headshot column sneaks into the table even though
# the table header doesn't have a label for headshot!

In [None]:
# beware data issues: someone's weight entered as "20-0"
# http://www.gocrimson.com/sports/mbkb/1999-00/roster

In [76]:
# Flatten the nested {season: table} structure of every sport into a list of
# roster records. Each record carries its sport, season, header labels
# ('metadata') and a list of per-player dicts keyed by those labels.
rosters = []

# Only the sport code and the rosters dict are needed from each row, so walk
# the two columns directly instead of iterrows() (whose index went unused and
# was clobbered by the inner loop variable of the same name).
for sportName, seasons in zip(sports['sportCode'], sports['rosters']):  # for each sport

    for season, roster in seasons.items():    # check for each season
        # Field names are read from this season's own header row.
        fieldNames = getFieldnames(roster)
        lenField = len(fieldNames)

        nRoster = {}
        nRoster['sport'] = sportName             # this is the foreign key, when combined with the season
        nRoster['season'] = season
        nRoster['metadata'] = fieldNames

        players = [] # new array of players for every season

        for person in roster[1:]: # skip over header row
            # For any row with more cells than header labels, a blank
            # "picture" cell has been included at the front and should be
            # skipped. The check depends only on the row, so do it once per
            # person rather than once per field.
            # NOTE(review): this assumes exactly one extra leading cell; a row
            # with two or more extra cells would still be shifted by only one.
            offset = 1 if lenField < len(person) else 0

            nPlayer = {}
            nPlayer['sport'] = sportName
            nPlayer['season'] = season

            # attach each header label as the key for the matching cell
            for col, fieldName in enumerate(fieldNames):
                fieldVal = person[col + offset]

                if type(fieldVal) is dict:
                    # linked cells are split into the display text and its URL
                    nPlayer[fieldName + '_link'] = fieldVal['link']
                    nPlayer[fieldName] = fieldVal['name']
                else:
                    nPlayer[fieldName] = fieldVal

            players.append(nPlayer)

        nRoster['players'] = players

        rosters.append(nRoster)
        

In [77]:
rosters[0]['players'][:2]

[{u'B/T': u'R/R',
  u'High School': u'Woodrow Wilson',
  u'Hometown': u'Long Beach, Calif.',
  u'Ht.': u'6-2',
  u'Name': u'Tyler Albright',
  u'Name_link': '/sports/bsb/2008-09/bios/albright_tyler',
  u'No.': u'6',
  u'Position': u'C',
  u'Wt.': u'190',
  u'Yr.': u'Jr.',
  'season': '2008-09',
  'sport': 'bsb'},
 {u'B/T': u'R/R',
  u'High School': u'St. Stephens Episcopal',
  u'Hometown': u'Austin, Texas',
  u'Ht.': u'5-10',
  u'Name': u'Cole Arledge',
  u'Name_link': '/sports/bsb/2008-09/bios/arledge_cole',
  u'No.': u'8',
  u'Position': u'C',
  u'Wt.': u'180',
  u'Yr.': u'Jr.',
  'season': '2008-09',
  'sport': 'bsb'}]

In [78]:
len(rosters)

426

In [79]:
# Concatenate every roster's player records into one flat list
players = [player for roster in rosters for player in roster['players']]

In [80]:
len(players)

10896

In [81]:
pdf = pd.DataFrame(players)

In [82]:
print pdf.columns
pdf.shape

Index([        u'B/T',       u'Event', u'High School',    u'Hometown',
               u'Ht.',        u'Name',   u'Name_link',         u'No.',
          u'Position',         u'S/C',      u'Weapon',         u'Wt.',
               u'Yr.',      u'season',       u'sport'],
      dtype='object')


(10896, 15)

In [83]:
pdf.head()

Unnamed: 0,B/T,Event,High School,Hometown,Ht.,Name,Name_link,No.,Position,S/C,Weapon,Wt.,Yr.,season,sport
0,R/R,,Woodrow Wilson,"Long Beach, Calif.",6-2,Tyler Albright,/sports/bsb/2008-09/bios/albright_tyler,6,C,,,190,Jr.,2008-09,bsb
1,R/R,,St. Stephens Episcopal,"Austin, Texas",5-10,Cole Arledge,/sports/bsb/2008-09/bios/arledge_cole,8,C,,,180,Jr.,2008-09,bsb
2,R/R,,Ramapo,"Wyckoff, N.J.",5-11,Dan Berardo,/sports/bsb/2008-09/bios/berardo_dan,10,RHP,,,180,Jr.,2008-09,bsb
3,R/R,,Lakeside,"Issaquah, Wash.",6-1,Ian Bolliger,/sports/bsb/2008-09/bios/bolliger_ian,32,RHP,,,180,Sr.,2008-09,bsb
4,R/R,,Lincoln-Sudbury,"Sudbury, Mass.",5-10,Adam Cole,/sports/bsb/2008-09/bios/cole_adam,11,Right-Handed Pitcher,,,185,Senior,2008-09,bsb


In [86]:
# check which players are doing stuff every semester
pdf.groupby('Name').count().sort_values('sport', ascending=False).head(10)

Unnamed: 0_level_0,B/T,Event,High School,Hometown,Ht.,Name_link,No.,Position,S/C,Weapon,Wt.,Yr.,season,sport
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
James Leakos,0,4,8,8,0,8,0,0,0,0,0,8,8,8
Weishen Mead,0,4,8,8,0,8,0,0,0,0,0,8,8,8
Jakob Lindaas,0,4,8,8,0,8,0,0,0,0,0,8,8,8
Billy Looney,0,4,8,8,0,8,0,0,0,0,0,8,8,8
Erik Kraus,0,4,8,8,0,8,0,0,0,0,0,8,8,8
Darcy Wilson,0,4,8,8,0,8,0,0,0,0,0,8,8,8
Aisha Price,0,0,8,8,4,8,8,8,0,0,0,8,8,8
Connor Reck,0,4,8,8,0,8,0,0,0,0,0,8,8,8
Stewart Richardson,0,4,8,8,0,8,0,0,0,0,0,8,8,8
Kurt Ruegg,0,4,8,8,0,8,0,0,0,0,0,8,8,8


In [87]:
def lookupPlayer(name):
    """Return the rows of the global `pdf` frame whose 'Name' column equals `name`."""
    isMatch = pdf['Name'] == name
    return pdf[isMatch]

In [88]:
# Who is a multi sport athlete?
pdf.groupby('Name')['sport'].nunique().sort_values(ascending=False).head(10)

Name
Connor Green        3
Jason Michas        2
Dylan Trotzuk       2
Jenn Hatfield       2
Michael Hoffmann    2
Michael Hoffman     2
Tyler Cusick        2
Connor Reck         2
Catriona Stewart    2
Jeffrey Homer       2
Name: sport, dtype: int64

In [89]:
# how many of these people are there?!
multis = pdf.groupby('Name')['sport'].nunique().sort_values(ascending=False)

# there were 152 multisport athletes! Their names are
multis = multis[multis >= 2]

In [91]:
multis.shape

(152L,)

In [90]:
multis.index

Index([u'Connor Green', u'Jason Michas', u'Dylan Trotzuk', u'Jenn Hatfield',
       u'Michael Hoffmann', u'Michael Hoffman', u'Tyler Cusick',
       u'Connor Reck', u'Catriona Stewart', u'Jeffrey Homer',
       ...
       u'Johnny Marvin', u'Lukas Gemar', u'Brandon Price', u'Dan Stiles',
       u'Jorie Sullivan', u'Grace Wagner', u'Maksim Korolev',
       u'Will Battershill', u'Aaron Parker', u'Nicholas Linder'],
      dtype='object', name=u'Name', length=152)

In [24]:
# we need to make a separate table for when each of these people started at Harvard, so we can ask if there's a pattern to this!

In [93]:
lookupPlayer("James Leakos")

Unnamed: 0,B/T,Event,High School,Hometown,Ht.,Name,Name_link,No.,Position,S/C,Weapon,Wt.,Yr.,season,sport
2101,,,Holy Cross,"Saskatoon, Saskatchewan",,James Leakos,/sports/xc/2012-13/bios/men/leakos_james,,,,,,Junior,2012-13,xc
2233,,,Holy Cross,"Saskatoon, Saskatchewan",,James Leakos,/sports/xc/2011-12/bios/mens/leakos_james,,,,,,Sophomore,2011-12,xc
2264,,,Holy Cross,"Saskatoon, Saskatchewan",,James Leakos,/sports/xc/2013-14/bios/mens/leakos_james,,,,,,Senior,2013-14,xc
2297,,,Holy Cross,"Saskatoon, Saskatchewan",,James Leakos,/sports/xc/2010-11/bios/Mens/leakos_james,,,,,,Freshman,2010-11,xc
6371,,Distance,Holy Cross,"Saskatoon, Saskatchewan",,James Leakos,/sports/track/2012-13/bios/men/leakos_james,,,,,,Junior,2012-13,track
6733,,Distance,Holy Cross,"Saskatoon, Saskatchewan",,James Leakos,/sports/track/2011-12/bios/mens/leakos_james,,,,,,Sophomore,2011-12,track
6796,,Distance,Holy Cross,"Saskatoon, Saskatchewan",,James Leakos,/sports/track/2013-14/bios/mens/leakos_james,,,,,,Senior,2013-14,track
6895,,Distance,Holy Cross,"Saskatoon, Saskatchewan",,James Leakos,/sports/track/2010-11/bios/leakos_james,,,,,,Freshman,2010-11,track


In [92]:
lookupPlayer("Connor Green")

Unnamed: 0,B/T,Event,High School,Hometown,Ht.,Name,Name_link,No.,Position,S/C,Weapon,Wt.,Yr.,season,sport
2129,,,Pittsford Mendon,"Pittsford, N.Y.",,Connor Green,/sports/xc/2014-15/bios/mens/Green_Connor,,,,,,Freshman,2014-15,xc
2202,,,Pittsford Mendon,"Pittsford, N.Y.",,Connor Green,/sports/xc/2015-16/bios/Mens/Green_Connor,,,,,,Sophomore,2015-16,xc
5517,,Nordic,Pittsford Mendon,"Pittsford, N.Y.",,Connor Green,/sports/skiing/2015-16/bios/green_connor_fl1v,,,,,,Sophomore,2015-16,skiing
5547,,Nordic,Pittsford Mendon,"Pittsford, N.Y.",,Connor Green,/sports/skiing/2016-17/bios/green_connor_w6af,,,,,,Junior,2016-17,skiing
5610,,Nordic,Pittsford Mendon,"Pittsford, N.Y.",,Connor Green,/sports/skiing/2014-15/bios/green_connor_9n64,,,,,,Freshman,2014-15,skiing
6417,,Steeple,Pittsford Mendon,"Pittsford, N.Y.",,Connor Green,/sports/track/2014-15/bios/mens/Green_Connor,,,,,,Freshman,2014-15,track


### observation... harvard's multisport athletes are so far quite different from the yale combos

- track + skiing + xc
- sailing + squash

In [94]:
# We know his height from one sport
# he has different bios in different sports.
# this table is actually a "relational" table called "played". Every time someone plays in a season, they get an entry here.

# note that if they play every year, their bio might change from year to year. in some cases their major is added in their oldest bio.

lookupPlayer("Jason Michas")

Unnamed: 0,B/T,Event,High School,Hometown,Ht.,Name,Name_link,No.,Position,S/C,Weapon,Wt.,Yr.,season,sport
5357,,,Collegiate School,"New York, N.Y.",,Jason Michas,/sports/sailing/2012-13/bios/mens/michas_jason,,,,,,Senior,2012-13,sailing
5404,,,Collegiate School,"New York, N.Y.",,Jason Michas,/sports/sailing/2009-10/bios/men/michas_jason,,,,,,Freshman,2009-10,sailing
5448,,,Collegiate School,"New York, N.Y.",,Jason Michas,/sports/sailing/2011-12/bios/mens/michas_jason,,,,,,Junior,2011-12,sailing
5496,,,Collegiate School,"New York, N.Y.",,Jason Michas,/sports/sailing/2010-11/bios/mens/michas_jason,,,,,,Sophomore,2010-11,sailing
5688,,,Collegiate School,"New York, N.Y.",,Jason Michas,/sports/msquash/2012-13/bios/michas_jason_n2rj,,,,,,Senior,2012-13,msquash
5741,,,Collegiate School,"New York, N.Y.",,Jason Michas,/sports/msquash/2009-10/bios/michas_jason,,,,,,Freshman,2009-10,msquash
5776,,,Collegiate School,"New York, N.Y.",,Jason Michas,/sports/msquash/2011-12/bios/michas_jason,,,,,,Junior,2011-12,msquash
5824,,,Collegiate School,"New York, N.Y.",,Jason Michas,/sports/msquash/2010-11/bios/michas_jason,,,,,,Sophomore,2010-11,msquash


In [95]:
def splitCityReg(string):
    """Split a 'City, Region' hometown string into a [city, region] list.

    - "-" is treated as a placeholder and yields ["-", "-"].
    - A string containing exactly one ", " separator yields [city, region].
    - Anything else (no separator, several separators, or a non-string value
      such as None/NaN for a missing hometown) yields [string, 'NOREGION?'],
      keeping the original value in the city slot.
    """
    if string == "-":
        return ["-", "-"]

    try:
        pair = string.split(", ")
    except AttributeError:
        # not a string (e.g. a missing value); nothing to split
        return [string, 'NOREGION?']

    # Compare with ==, not `is`: identity comparison against an int literal
    # only works through CPython's small-int caching and raises a
    # SyntaxWarning on Python 3.8+.
    if len(pair) == 2:
        return pair
    return [string, 'NOREGION?']

In [96]:
# splitCityReg returns one [city, region] pair per hometown; transpose the
# pairs into two parallel sequences and store them as new columns.
hometownPairs = pdf['Hometown'].apply(splitCityReg)
pdf['City'], pdf['Region'] = zip(*hometownPairs)

Once again NY NY is top, but Houston in second is a surprise (possibly newton too?)

In [97]:
m_cities_regs = pdf.groupby('Hometown')['Name'].nunique().sort_values(ascending=False)
m_cities_regs

Hometown
                                    94
New York, N.Y.                      56
Houston, Texas                      33
Greenwich, Conn.                    28
San Diego, Calif.                   28
Newton, Mass.                       26
San Francisco, Calif.               25
London, England                     24
Weston, Mass.                       24
Toronto, Ont.                       23
Palo Alto, Calif.                   23
Atlanta, Ga.                        23
Los Angeles, Calif.                 22
Honolulu, Hawaii                    21
Newport Beach, Calif.               21
Washington, D.C.                    20
Dallas, Texas                       20
Cambridge, Mass.                    19
Boston, Mass.                       19
Chicago, Ill.                       19
Brooklyn, N.Y.                      19
Wellesley, Mass.                    19
Seattle, Wash.                      17
Austin, Texas                       16
Belmont, Mass.                      16
Garden City, N.Y

In [100]:
## By school, exeter is at the top once again
## these queries can be rewritten using sql

In [98]:
m_school = pdf.groupby('High School')['Name'].nunique().sort_values(ascending=False)
m_school

High School
                                  132
Phillips Exeter Academy            35
Deerfield Academy                  31
Milton Academy                     23
Belmont Hill School                19
Phillips Academy Andover           17
Boston Latin School                16
Choate Rosemary Hall               16
Weston                             15
Delbarton School                   15
Noble and Greenough School         14
Groton School                      14
Noble and Greenough                14
Corona del Mar                     13
Phillips Exeter Academy (N.H.)     13
Phillips Academy                   13
Garden City                        13
Greenwich                          12
Episcopal Academy                  12
Brunswick School                   12
Collegiate School                  12
Jesuit                             12
New Trier                          12
St. Paul's                         11
Manhasset                          11
The Bishop's School                11


In [101]:
pdf['Hometown'][0].split(",")

[u'Long Beach', u' Calif.']

In [102]:
m_city = pdf.groupby('City')['Name'].nunique().sort_values(ascending=False)
m_city

City
                     94
New York             56
Houston              34
Toronto              29
San Diego            29
Weston               29
Greenwich            28
Newton               28
London               28
San Francisco        25
Atlanta              23
Los Angeles          23
Palo Alto            23
Dallas               22
Chicago              21
Newport Beach        21
Cambridge            21
Honolulu             21
Washington           20
Brooklyn             20
Boston               20
Wellesley            19
Seattle              17
Garden City          17
Austin               16
Lexington            16
Belmont              16
Baltimore            15
New Canaan           15
Pittsburgh           15
                     ..
Newton Center         1
Newtown Square        1
Niwot                 1
No. Easton            1
Norcross              1
Norman                1
Norridgewock          1
North Attleboro       1
New Fairfield         1
New Britain           1
Mount Laure

In [None]:
# note the misspelling of sweeden (sic) and the doubling 
# of the number of people from california

In [103]:
m_reg = pdf.groupby('Region')['Name'].nunique().sort_values(ascending=False)
m_reg

Region
Calif.          588
Mass.           562
N.Y.            411
N.J.            208
NOREGION?       190
Conn.           163
Texas           158
Pa.             155
Md.             135
Fla.            128
Ill.            114
Ohio             94
Ga.              86
Minn.            80
Va.              77
Mich.            70
Wash.            62
England          62
Ont.             52
Colo.            43
Ariz.            43
N.C.             39
Australia        38
Mo.              32
Ind.             32
Hawaii           28
N.H.             28
Maine            26
B.C.             25
Vt.              25
               ... 
Nevada            1
Minn              1
Saskatchewan      1
Newfoundland      1
S.K.              1
Ore               1
Québec            1
Oreg.             1
Pakistan          1
N.B.              1
Sweeden           1
Ghana             1
Ukraine           1
Vt,               1
Hawaii.           1
Hong Kong         1
Hungary           1
IL.               1
Iceland      

lots of missing data from the earlier harvard football folks, consider dropping them for fair comparison

In [104]:
pdf[pdf['Region'] == "NOREGION?"].head(20)

Unnamed: 0,B/T,Event,High School,Hometown,Ht.,Name,Name_link,No.,Position,S/C,Weapon,Wt.,Yr.,season,sport,City,Region
310,,,,,,Clifton Crosby,/sports/mbkb/1949-50/Bios/Clifton_Crosby,,Forward,,,,Senior,1949-50,mbkb,,NOREGION?
311,,,,,,Edward Smith,/sports/mbkb/1949-50/Bios/Edward_Smith,,Forward,,,,Junior,1949-50,mbkb,,NOREGION?
312,,,,,,Gerald Murphy,/sports/mbkb/1949-50/Bios/Gerald_Murphy,,Guard,,,,Sophomore,1949-50,mbkb,,NOREGION?
313,,,,,,James Gabler,/sports/mbkb/1949-50/Bios/James_Gabler,,Center,,,,Junior,1949-50,mbkb,,NOREGION?
314,,,,,,John Rockwell,/sports/mbkb/1949-50/Bios/John_Rockwell,,Forward,,,,Senior,1949-50,mbkb,,NOREGION?
315,,,,,,Richard Covey,/sports/mbkb/1949-50/Bios/Richard_Covey,,Guard,,,,Senior,1949-50,mbkb,,NOREGION?
316,,,,,,Robert Bramhall,/sports/mbkb/1949-50/Bios/Robert_Bramhall,,Guard,,,,Senior,1949-50,mbkb,,NOREGION?
317,,,,,,William Hickey,/sports/mbkb/1949-50/Bios/William_Hickey,,Guard,,,,Sophomore,1949-50,mbkb,,NOREGION?
318,,,,,,William Prior,/sports/mbkb/1949-50/Bios/William_Prior,,Center,,,,Senior,1949-50,mbkb,,NOREGION?
427,,,Sammamish,Bellevue,6-7,David Coatsworth,/sports/mbkb/1978-79/Bios/David_Coatsworth,12.0,Forward,,,230.0,Freshman,1978-79,mbkb,Bellevue,NOREGION?


In [36]:
# if you're in canada, you get your country and your province
# sometimes people just leave the state off.

# this data should probably be cleaned by hand!
# people from big cities tend to leave the state off

In [105]:
EXPORT_PLAYERS = os.path.join(PROJ_ROOT, 'data', 'interim', 'harvard_players.json')
EXPORT_ROSTERS = os.path.join(PROJ_ROOT, 'data', 'interim', 'harvard_rosters.p')

In [106]:
pdf.to_json(EXPORT_PLAYERS)

In [107]:
rdf = pd.DataFrame(rosters)

In [109]:
rdf.shape

(426, 4)

In [110]:
# verify later that tables are the same across all years for any given sport.
rdf.head()

Unnamed: 0,metadata,players,season,sport
0,"[No., Name, Yr., Position, B/T, Ht., Wt., Home...","[{u'No.': u'6', u'Name_link': u'/sports/bsb/20...",2008-09,bsb
1,"[No., Name, Yr., Position, B/T, Ht., Wt., Home...","[{u'No.': u'20', u'Name_link': u'/sports/bsb/2...",2012-13,bsb
2,"[No., Name, Yr., Position, B/T, Ht., Wt., Home...","[{u'No.': u'20', u'Name_link': u'/sports/bsb/2...",2014-15,bsb
3,"[No., Name, Yr., Position, B/T, Ht., Wt., Home...","[{u'No.': u'1', u'Name_link': u'/sports/bsb/20...",2016-17,bsb
4,"[No., Name, Yr., Position, B/T, Ht., Wt., Home...","[{u'No.': u'6', u'Name_link': u'/sports/bsb/20...",2009-10,bsb


In [111]:
# This is going to be somewhat space inefficient for now, but it's OK
rdf.to_pickle(EXPORT_ROSTERS)

### Next steps

- we have a 'played' database with links to bios
- we need to figure out what a unique player is
- we need to unify their metadata
- there is data to parse from each of their bios
- Maybe there is text analysis to be done from their season-to-season-development
- there is entity resolution to be done with hometowns and high schools.