Import all the stuff we might need.

In [81]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import requests
import networkx as nx
from bs4 import BeautifulSoup, NavigableString

Get the tournament page data.

In [2]:
raw = requests.get('http://www.sports-reference.com/cbb/postseason/2017-ncaa.html').text
soup = BeautifulSoup(raw,'lxml')

Parse out all the divisions.

In [4]:
east_soup = soup.find_all('div',{'id':'east'})[0]
midwest_soup = soup.find_all('div',{'id':'midwest'})[0]
south_soup = soup.find_all('div',{'id':'south'})[0]
west_soup = soup.find_all('div',{'id':'west'})[0]

Write a function to get all the teams in each division by parsing the HTML.

In [7]:
def get_teams(division_soup):
    teams_href_list = list()
    first_round_teams = list(division_soup.find_all('div',{'class':'round'})[0].children)
    for team_soup in first_round_teams:
        if type(team_soup) != NavigableString:
            pairs = team_soup.find_all('a')
            for team in pairs:
                if team.text != 'tbd':
                    teams_href_list.append(team)
    return teams_href_list

Run the function on each division's `soup` and add them all together for a list of all teams in the 2017 tournament.

In [39]:
east_teams_2017 = get_teams(east_soup)
midwest_teams_2017 = get_teams(midwest_soup)
south_teams_2017 = get_teams(south_soup)
west_teams_2017 = get_teams(west_soup)
teams_2017 = east_teams_2017 + midwest_teams_2017 + south_teams_2017 + west_teams_2017
teams_2017

[<a href="/cbb/schools/villanova/2017.html">Villanova</a>,
 <a href="/cbb/schools/wisconsin/2017.html">Wisconsin</a>,
 <a href="/cbb/schools/virginia-tech/2017.html">Virginia Tech</a>,
 <a href="/cbb/schools/virginia/2017.html">Virginia</a>,
 <a href="/cbb/schools/north-carolina-wilmington/2017.html">North Carolina-Wilmington</a>,
 <a href="/cbb/schools/florida/2017.html">Florida</a>,
 <a href="/cbb/schools/east-tennessee-state/2017.html">ETSU</a>,
 <a href="/cbb/schools/southern-methodist/2017.html">SMU</a>,
 <a href="/cbb/schools/baylor/2017.html">Baylor</a>,
 <a href="/cbb/schools/new-mexico-state/2017.html">New Mexico State</a>,
 <a href="/cbb/schools/south-carolina/2017.html">South Carolina</a>,
 <a href="/cbb/schools/marquette/2017.html">Marquette</a>,
 <a href="/cbb/schools/duke/2017.html">Duke</a>,
 <a href="/cbb/schools/troy/2017.html">Troy</a>,
 <a href="/cbb/schools/kansas/2017.html">Kansas</a>,
 <a href="/cbb/schools/miami-fl/2017.html">Miami (FL)</a>,
 <a href="/cbb/school

Crawl all the data and save the tables you find to disk. This will take about 60 seconds.

In [40]:
for team in teams_2017:
    name = "2017-{0}".format(team.text)
    url = 'http://www.sports-reference.com/' + team['href'].replace('2017.html','2017-schedule.html')
    team_raw = requests.get(url).text
    team_soup = BeautifulSoup(team_raw,'lxml')
    df = pd.read_html(str(team_soup.find_all('table',{'class':'sortable stats_table','id':'schedule'})[0]))[0]
    df = df.set_index('G')
    df.to_csv(name + '.csv')

Alternatively, crawl the data and parse out the relevant fields and save in a giant list of results.

In [72]:
all_results = list()
for team in teams_2017:
    url = 'http://www.sports-reference.com/' + team['href'].replace('2017.html','2017-schedule.html')
    team_raw = requests.get(url).text
    team_soup = BeautifulSoup(team_raw,'lxml')
    table = team_soup.find_all('table',{'class':'sortable stats_table','id':'schedule'})[0]
    date = [i['csk'] for i in table.find_all('td',{'data-stat':'date_game'})]
    opponents = [i.text.split('\xa0')[0] for i in table.find_all('td',{'data-stat':'opp_name'})]
    results = [i.text for i in table.find_all('td',{'data-stat':'game_result'})]
    team_score = [i.text for i in table.find_all('td',{'data-stat':'pts'})]
    opponent_score = [i.text for i in table.find_all('td',{'data-stat':'opp_pts'})]
    team_results = list(zip([team.text]*len(opponents),opponents,date,results,team_score,opponent_score))[:-1]
    for team_result in team_results:
        all_results.append(team_result)

Check out what this `all_results` looks like for one team.

In [117]:
[i for i in all_results if i[0] == 'Louisville']

[('Louisville', 'Evansville', '2016-11-11', 'W', '78', '47'),
 ('Louisville', 'William & Mary', '2016-11-14', 'W', '91', '58'),
 ('Louisville', 'Long Beach State', '2016-11-17', 'W', '88', '56'),
 ('Louisville', 'Old Dominion', '2016-11-23', 'W', '68', '62'),
 ('Louisville', 'Wichita State', '2016-11-24', 'W', '62', '52'),
 ('Louisville', 'Baylor', '2016-11-25', 'L', '63', '66'),
 ('Louisville', 'Purdue', '2016-11-30', 'W', '71', '64'),
 ('Louisville', 'Grand Canyon', '2016-12-03', 'W', '79', '70'),
 ('Louisville', 'Southern Illinois', '2016-12-07', 'W', '74', '51'),
 ('Louisville', 'Texas Southern', '2016-12-10', 'W', '102', '71'),
 ('Louisville', 'Eastern Kentucky', '2016-12-17', 'W', '87', '56'),
 ('Louisville', 'Kentucky', '2016-12-21', 'W', '73', '70'),
 ('Louisville', 'Virginia', '2016-12-28', 'L', '53', '61'),
 ('Louisville', 'Indiana', '2016-12-31', 'W', '77', '62'),
 ('Louisville', 'Notre Dame', '2017-01-04', 'L', '70', '77'),
 ('Louisville', 'Georgia Tech', '2017-01-07', 'W',

Dump this into a big CSV as a backup.

In [140]:
pd.DataFrame(all_results,columns=['team','opponent','date','result','score','opp_score']).to_csv('results.csv')

Only include winners so as not to double count.

In [110]:
reduced_results = list()
for (team, opponent, date, result, score, opp_score) in all_results:
    if score > opp_score:
        reduced_results.append((team, opponent, date, score, opp_score))
        
len(all_results), len(reduced_results)

(1987, 1412)

What are the names of all the teams in the tournament?

In [111]:
tournament_teams = [i.text for i in teams_2017]

Make a directed graph if team *i* beat team *j* by *w* points. 

**TODO: Some weirdness with differentials still being negative. Just take absolute value now**

In [118]:
g = nx.DiGraph()

for (team, opponent, date, score, opp_score) in reduced_results:
    if opponent in tournament_teams:
        differential = int(score) - int(opp_score)
        if g.has_edge(team,opponent):
            g[team][opponent]['weight'] += differential
        else:
            g.add_edge(team, opponent, weight = abs(differential))
        
print("There are {0} nodes and {1} edges in the network".format(g.number_of_nodes(), g.number_of_edges()))

nx.write_gexf(g,'tournament_schedule.gexf')

There are 56 nodes and 207 edges in the network


What are the teams with the greatest weighted out-degrees, or beating the most other teams by the most points?

In [131]:
d = {}
for node in g.nodes():
    d[node] = np.sum([g[node][successor]['weight'] for successor in g.successors(node)])
    
pd.Series(d).sort_values(ascending=False)

Villanova                    146.0
UNC                          121.0
West Virginia                118.0
Butler                       112.0
Michigan                     109.0
Florida State                108.0
Baylor                       103.0
Kentucky                     102.0
Florida                       94.0
Duke                          76.0
Wisconsin                     71.0
Purdue                        70.0
Marquette                     70.0
Gonzaga                       65.0
Virginia                      64.0
Creighton                     62.0
Minnesota                     55.0
Maryland                      54.0
Oklahoma State                52.0
Michigan State                52.0
Iowa State                    51.0
Notre Dame                    50.0
Arizona                       49.0
Cincinnati                    47.0
Louisville                    45.0
Vanderbilt                    42.0
South Carolina                37.0
Kansas                        35.0
Texas Southern      

Alternatively what teams have beaten teams that beat lots of other teams?

In [135]:
pd.Series(nx.neighbor_degree.average_neighbor_degree(g,source='out',target='out',weight='weight')).sort_values(ascending=False)

Winthrop                     9.000000
Xavier                       7.777778
Texas Southern               7.000000
Creighton                    6.354839
Virginia Tech                6.291667
Notre Dame                   5.660000
Virginia                     5.656250
Kansas                       5.400000
Wisconsin                    5.352113
Seton Hall                   5.322581
Michigan State               5.230769
Northwestern                 5.200000
Purdue                       5.128571
Michigan                     5.064220
UNC                          4.876033
Miami (FL)                   4.823529
Duke                         4.723684
Louisville                   4.555556
Minnesota                    4.527273
Villanova                    4.465753
Florida State                4.250000
West Virginia                4.093220
Bucknell                     4.000000
Vanderbilt                   3.976190
Oregon                       3.931034
UCLA                         3.923077
Iowa State  