# March Madness Machine Learning

I try to predict the outcome of March Madnss matches based on the outcome of previous matches between two teams.

I used a MLP neural network classifier. I used pybrain for the classifier and pandas to sift through the data.

This is based on the [World Cup Classifer](https://github.com/fisadev/world_cup_learning/) built by [fisadev](https://github.com/fisadev/)

In [1]:
import pandas as pd
import numpy as np

# Data cleaning
Get season and seeds data

In [2]:
df = pd.read_csv('march-machine-learning-mania-2016-v2/TourneySeeds.csv')
df

Unnamed: 0,Season,Seed,Team
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374
5,1985,W06,1208
6,1985,W07,1393
7,1985,W08,1396
8,1985,W09,1439
9,1985,W10,1177


Out of curiosity, let's take a look at unique team codes in 2003

In [3]:
df[df['Season'] == 2003].Team.unique()

array([1328, 1448, 1393, 1257, 1280, 1329, 1386, 1143, 1301, 1120, 1335,
       1139, 1122, 1264, 1190, 1354, 1400, 1196, 1462, 1390, 1163, 1268,
       1277, 1261, 1345, 1160, 1423, 1140, 1360, 1407, 1358, 1411, 1421,
       1246, 1338, 1266, 1173, 1458, 1281, 1231, 1332, 1428, 1104, 1356,
       1451, 1409, 1221, 1447, 1237, 1112, 1242, 1181, 1228, 1323, 1166,
       1272, 1153, 1211, 1113, 1141, 1454, 1443, 1161, 1429, 1436])

In [4]:
df[df['Team'] == 1437]

Unnamed: 0,Season,Seed,Team
55,1985,Z08,1437
105,1986,Y10,1437
229,1988,Y06,1437
363,1990,Y12,1437
392,1991,W09,1437
642,1995,W03,1437
722,1996,X03,1437
771,1997,W04,1437
919,1999,X08,1437
1336,2005,Z05,1437


Get detailed season results (2003 onwards)

In [5]:
df2 = pd.read_csv('march-machine-learning-mania-2016-v2/RegularSeasonDetailedResults.csv')
df2

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14
5,2003,11,1458,81,1186,55,H,0,26,57,...,11,12,17,6,22,8,19,4,3,25
6,2003,12,1161,80,1236,62,H,0,23,55,...,15,20,28,9,21,11,30,10,4,28
7,2003,12,1186,75,1457,61,N,0,28,62,...,17,17,23,8,25,10,15,14,8,18
8,2003,12,1194,71,1156,66,N,0,28,58,...,18,12,27,13,26,13,25,8,2,18
9,2003,12,1458,84,1296,56,H,0,32,67,...,14,7,12,9,23,10,18,1,3,18


Merge season seed data for both winner and loser teams with the deatiled data dataframe

In [6]:
games = pd.merge(df2, df, how='left', left_on=['Wteam', 'Season'], right_on=['Team', 'Season'], suffixes=('', 'W'))
games['WSeed'] = games['Seed']
del games["Seed"]
del games["Team"]
games = pd.merge(games, df, how='left', left_on=['Lteam', 'Season'], right_on=['Team', 'Season'], suffixes=('', 'L'))
games['LSeed'] = games['Seed']
del games["Seed"]
del games["Team"]
games.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf,WSeed,LSeed
0,2003,10,1104,68,1328,62,N,0,27,58,...,22,10,22,8,18,9,2,20,Y10,W01
1,2003,10,1272,70,1393,63,N,0,26,62,...,20,20,25,7,12,8,6,16,Z07,W03
2,2003,11,1266,73,1437,61,N,0,24,58,...,23,31,22,9,12,2,5,23,Y03,
3,2003,11,1296,56,1457,50,N,0,18,38,...,15,17,20,9,19,4,3,23,,
4,2003,11,1400,77,1208,71,N,0,30,61,...,27,21,15,12,10,7,1,14,X01,


In [7]:
games.columns

Index([u'Season', u'Daynum', u'Wteam', u'Wscore', u'Lteam', u'Lscore', u'Wloc',
       u'Numot', u'Wfgm', u'Wfga', u'Wfgm3', u'Wfga3', u'Wftm', u'Wfta',
       u'Wor', u'Wdr', u'Wast', u'Wto', u'Wstl', u'Wblk', u'Wpf', u'Lfgm',
       u'Lfga', u'Lfgm3', u'Lfga3', u'Lftm', u'Lfta', u'Lor', u'Ldr', u'Last',
       u'Lto', u'Lstl', u'Lblk', u'Lpf', u'WSeed', u'LSeed'],
      dtype='object')

Begin parsing through seed data to seperate it out

Create region data as seperate columns

In [9]:
games['Wregion'] = games.WSeed.str[0]

In [10]:
games['Lregion'] = games.LSeed.str[0]

In [11]:
games.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Ldr,Last,Lto,Lstl,Lblk,Lpf,WSeed,LSeed,Wregion,Lregion
0,2003,10,1104,68,1328,62,N,0,27,58,...,22,8,18,9,2,20,Y10,W01,Y,W
1,2003,10,1272,70,1393,63,N,0,26,62,...,25,7,12,8,6,16,Z07,W03,Z,W
2,2003,11,1266,73,1437,61,N,0,24,58,...,22,9,12,2,5,23,Y03,,Y,
3,2003,11,1296,56,1457,50,N,0,18,38,...,20,9,19,4,3,23,,,,
4,2003,11,1400,77,1208,71,N,0,30,61,...,15,12,10,7,1,14,X01,,X,


Import season data (2003 and above)

In [12]:
seasons = pd.read_csv('march-machine-learning-mania-2016-v2/Seasons.csv')
seasons = seasons[seasons['Season'] > 2002]

Fix seasons data for ease of importing into games dataframe

In [13]:
seasons['W'] = seasons.Regionw
seasons['X'] = seasons.Regionx
seasons['Y'] = seasons.Regiony
seasons['Z'] = seasons.Regionz
del seasons['Regionw']
del seasons['Regionx']
del seasons['Regiony']
del seasons['Regionz']
seasons.head()

Unnamed: 0,Season,Dayzero,W,X,Y,Z
18,2003,11/4/2002,East,South,Midwest,West
19,2004,11/3/2003,Atlanta,Phoenix,EastRutherford,StLouis
20,2005,11/1/2004,Albuquerque,Chicago,Austin,Syracuse
21,2006,10/31/2005,Atlanta,Oakland,Minneapolis,WashingtonDC
22,2007,10/30/2006,East,South,Midwest,West


In [14]:
seasons

Unnamed: 0,Season,Dayzero,W,X,Y,Z
18,2003,11/4/2002,East,South,Midwest,West
19,2004,11/3/2003,Atlanta,Phoenix,EastRutherford,StLouis
20,2005,11/1/2004,Albuquerque,Chicago,Austin,Syracuse
21,2006,10/31/2005,Atlanta,Oakland,Minneapolis,WashingtonDC
22,2007,10/30/2006,East,South,Midwest,West
23,2008,11/5/2007,East,Midwest,South,West
24,2009,11/3/2008,East,South,Midwest,West
25,2010,11/2/2009,East,South,Midwest,West
26,2011,11/1/2010,East,West,Southeast,Southwest
27,2012,10/31/2011,East,Midwest,South,West


Add Region data into games df for both winning and losing teams

In [15]:
for index, row in games.iterrows():
    try:
        games.loc[index, 'Wregion'] = seasons[seasons['Season'] == 2007][[games.loc[index, 'Wregion']]].values[0][0]
    except:
        games.loc[index, 'Wregion'] = np.nan

In [16]:
games.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Ldr,Last,Lto,Lstl,Lblk,Lpf,WSeed,LSeed,Wregion,Lregion
0,2003,10,1104,68,1328,62,N,0,27,58,...,22,8,18,9,2,20,Y10,W01,Midwest,W
1,2003,10,1272,70,1393,63,N,0,26,62,...,25,7,12,8,6,16,Z07,W03,West,W
2,2003,11,1266,73,1437,61,N,0,24,58,...,22,9,12,2,5,23,Y03,,Midwest,
3,2003,11,1296,56,1457,50,N,0,18,38,...,20,9,19,4,3,23,,,,
4,2003,11,1400,77,1208,71,N,0,30,61,...,15,12,10,7,1,14,X01,,South,


In [17]:
for index, row in games.iterrows():
    try:
        games.loc[index, 'Lregion'] = seasons[seasons['Season'] == 2007][[games.loc[index, 'Lregion']]].values[0][0]
    except:
        games.loc[index, 'Lregion'] = np.nan

In [18]:
games.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Ldr,Last,Lto,Lstl,Lblk,Lpf,WSeed,LSeed,Wregion,Lregion
0,2003,10,1104,68,1328,62,N,0,27,58,...,22,8,18,9,2,20,Y10,W01,Midwest,East
1,2003,10,1272,70,1393,63,N,0,26,62,...,25,7,12,8,6,16,Z07,W03,West,East
2,2003,11,1266,73,1437,61,N,0,24,58,...,22,9,12,2,5,23,Y03,,Midwest,
3,2003,11,1296,56,1457,50,N,0,18,38,...,20,9,19,4,3,23,,,,
4,2003,11,1400,77,1208,71,N,0,30,61,...,15,12,10,7,1,14,X01,,South,


Import Teams data and replace team code's with team names

In [19]:
teams = pd.read_csv('march-machine-learning-mania-2016-v2/Teams.csv')
teams.head()

Unnamed: 0,Team_Id,Team_Name
0,1101,Abilene Chr
1,1102,Air Force
2,1103,Akron
3,1104,Alabama
4,1105,Alabama A&M


In [20]:
for index, row in games.iterrows():
    games.loc[index, 'Wteam'] = teams[teams['Team_Id'] == games.loc[index, 'Wteam']].Team_Name.to_string().split("    ")[1]
    games.loc[index, 'Lteam'] = teams[teams['Team_Id'] == games.loc[index, 'Lteam']].Team_Name.to_string().split("    ")[1]

KeyboardInterrupt: 

In [2]:
games = pd.read_csv('all_games.csv')

In [3]:
games.head()

Unnamed: 0.1,Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,...,Ldr,Last,Lto,Lstl,Lblk,Lpf,WSeed,LSeed,Wregion,Lregion
0,0,2003,10,Alabama,68,Oklahoma,62,N,0,27,...,22,8,18,9,2,20,Y10,W01,Midwest,East
1,1,2003,10,Memphis,70,Syracuse,63,N,0,26,...,25,7,12,8,6,16,Z07,W03,West,East
2,2,2003,11,Marquette,73,Villanova,61,N,0,24,...,22,9,12,2,5,23,Y03,,Midwest,
3,3,2003,11,N Illinois,56,Winthrop,50,N,0,18,...,20,9,19,4,3,23,,,,
4,4,2003,11,Texas,77,Georgia,71,N,0,30,...,15,12,10,7,1,14,X01,,South,


Save games df to csv incase we lose the kernel

In [2]:
games.to_csv('all_games.csv')

NameError: name 'games' is not defined

Count nans by column

In [4]:
for c in games.columns:
    print c, len(games[c]) - games[c].count()

Unnamed: 0 0
Season 0
Daynum 0
Wteam 0
Wscore 0
Lteam 0
Lscore 0
Wloc 0
Numot 0
Wfgm 0
Wfga 0
Wfgm3 0
Wfga3 0
Wftm 0
Wfta 0
Wor 0
Wdr 0
Wast 0
Wto 0
Wstl 0
Wblk 0
Wpf 0
Lfgm 0
Lfga 0
Lfgm3 0
Lfga3 0
Lftm 0
Lfta 0
Lor 0
Ldr 0
Last 0
Lto 0
Lstl 0
Lblk 0
Lpf 0
WSeed 49914
LSeed 63332
Wregion 49914
Lregion 63332


In [5]:
len(games)

71241

In [6]:
del games['WSeed']
del games['LSeed']
del games['Wregion']
del games['Lregion']

We have very little seed data. This might pose a problem since seed data is usually highly correlated with teams winning or losing. Let's get rid of it and see what happens.

In [7]:
games.describe()

Unnamed: 0.1,Unnamed: 0,Season,Daynum,Wscore,Lscore,Numot,Wfgm,Wfga,Wfgm3,Wfga3,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
count,71241.0,71241.0,71241.0,71241.0,71241.0,71241.0,71241.0,71241.0,71241.0,71241.0,...,71241.0,71241.0,71241.0,71241.0,71241.0,71241.0,71241.0,71241.0,71241.0,71241.0
mean,35620.0,2009.709423,71.443467,74.720568,62.752713,0.072304,25.830126,54.698109,6.857821,17.92145,...,18.988265,12.178465,18.11339,11.317556,21.325543,11.394478,14.481029,6.08234,2.868587,19.867829
std,20565.649601,3.993369,35.203727,11.059601,10.873009,0.314147,4.676932,7.598108,2.981373,5.62757,...,5.789449,5.368745,7.166855,4.224845,4.493498,3.726841,4.462374,2.786944,2.050225,4.526861
min,0.0,2003.0,0.0,34.0,20.0,0.0,10.0,27.0,0.0,1.0,...,1.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,5.0
25%,17810.0,2006.0,40.0,67.0,55.0,0.0,23.0,49.0,5.0,14.0,...,15.0,8.0,13.0,8.0,18.0,9.0,11.0,4.0,1.0,17.0
50%,35620.0,2010.0,75.0,74.0,62.0,0.0,26.0,54.0,7.0,18.0,...,19.0,12.0,18.0,11.0,21.0,11.0,14.0,6.0,3.0,20.0
75%,53430.0,2013.0,102.0,82.0,70.0,0.0,29.0,59.0,9.0,21.0,...,23.0,16.0,23.0,14.0,24.0,14.0,17.0,8.0,4.0,23.0
max,71240.0,2016.0,132.0,144.0,140.0,6.0,56.0,103.0,25.0,56.0,...,54.0,42.0,61.0,36.0,45.0,31.0,41.0,22.0,18.0,45.0


In [8]:
games_na = games.dropna()

In [9]:
import re

In [10]:
games_na.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71241 entries, 0 to 71240
Data columns (total 35 columns):
Unnamed: 0    71241 non-null int64
Season        71241 non-null int64
Daynum        71241 non-null int64
Wteam         71241 non-null object
Wscore        71241 non-null int64
Lteam         71241 non-null object
Lscore        71241 non-null int64
Wloc          71241 non-null object
Numot         71241 non-null int64
Wfgm          71241 non-null int64
Wfga          71241 non-null int64
Wfgm3         71241 non-null int64
Wfga3         71241 non-null int64
Wftm          71241 non-null int64
Wfta          71241 non-null int64
Wor           71241 non-null int64
Wdr           71241 non-null int64
Wast          71241 non-null int64
Wto           71241 non-null int64
Wstl          71241 non-null int64
Wblk          71241 non-null int64
Wpf           71241 non-null int64
Lfgm          71241 non-null int64
Lfga          71241 non-null int64
Lfgm3         71241 non-null int64
Lfga3        

I needed to add a winner column that randomized through which team won. Since it was always team 1 I decided to use a randomizer between 1 and 2 and assign the winner column value to 1 or 2 depending on the randomizer.

In [11]:
g = games_na[['Season', 'Daynum', 'Numot', 'Wloc']]
w = games_na[['Wteam', 'Wscore', 'Wfgm', 'Wfga', 'Wfgm3', 'Wfga3', 'Wftm', 'Wfta', 'Wor', 'Wdr', 'Wast', 'Wto', 'Wstl', 'Wblk', 'Wpf']]
l = games_na[['Lteam', 'Lscore', 'Lfgm', 'Lfga', 'Lfgm3', 'Lfga3', 'Lftm', 'Lfta', 'Lor', 'Ldr', 'Last', 'Lto', 'Lstl', 'Lblk', 'Lpf']]

In [12]:
w.columns = w.columns.map(lambda x: x.strip('W'))
l.columns = l.columns.map(lambda x: x.strip('L'))

In [13]:
games_rnd = pd.DataFrame()

In [14]:
import random

In [15]:
for i in range(0, len(g)):
    who_first = random.randint(1, 3)
    if who_first == 1:
        s = pd.Series([1],index=['winner'])
        one = w.iloc[i].rename(lambda x: x + '1')
        two = l.iloc[i].rename(lambda x: x + '2')
    else:
        s = pd.Series([2],index=['winner'])
        one = l.iloc[i].rename(lambda x: x + '1')
        two = w.iloc[i].rename(lambda x: x + '2')
    temp = pd.concat([g.iloc[i], one, two, s])
    games_rnd = games_rnd.append(temp, ignore_index=True)

I created a team_stats dataframe to concatenate the average results of a team in a single season. I will be using this to gather data about teams to make a predcition.

In [16]:
team_stats_1 = games_rnd.groupby(['Season','team1'])['ast1', 'blk1', 'dr1', 'fga1', 'fga31', 'fgm1', 'fgm31', 'fta1', 'ftm1', 'or1', 'pf1', 'score1', 'stl1', 'to1'].mean().reset_index()
team_stats_2 = games_rnd.groupby(['Season','team2'])['ast2', 'blk2', 'dr2', 'fga2', 'fga32', 'fgm2', 'fgm32', 'fta2', 'ftm2', 'or2', 'pf2', 'score2', 'stl2', 'to2'].mean().reset_index()

In [17]:
team_stats_1.columns = team_stats_1.columns.map(lambda x: x.strip('1'))
team_stats_2.columns = team_stats_2.columns.map(lambda x: x.strip('2'))

In [18]:
team_stats = pd.concat([team_stats_1, team_stats_2])

In [19]:
team_stats = team_stats.groupby(['Season','team'])['ast', 'blk', 'dr', 'fga', 'fga3', 'fgm', 'fgm3', 'fta', 'ftm', 'or', 'pf', 'score', 'stl', 'to'].mean().reset_index()

In [20]:
team_stats.head()

Unnamed: 0,Season,team,ast,blk,dr,fga,fga3,fgm,fgm3,fta,ftm,or,pf,score,stl,to
0,2003.0,Air Force,13.0,1.785714,16.821429,39.785714,20.821429,19.142857,7.821429,17.107143,11.142857,4.178571,18.75,57.25,5.964286,11.428571
1,2003.0,Akron,15.358333,2.3,19.991667,55.825,16.025,27.291667,5.441667,26.3,19.333333,9.766667,20.016667,79.358333,7.291667,12.566667
2,2003.0,Alabama,11.794444,3.855556,23.522222,57.405556,20.244444,23.561111,6.166667,21.366667,15.2,13.844444,18.472222,68.488889,6.294444,12.933333
3,2003.0,Alabama A&M,14.452381,2.083333,23.184524,61.72619,20.595238,24.404762,7.470238,21.845238,15.380952,13.52381,20.232143,71.660714,9.410714,18.720238
4,2003.0,Alabama St,11.542781,3.037433,23.417112,55.588235,17.834225,23.320856,6.168449,15.724599,10.144385,12.171123,18.09893,62.954545,8.117647,16.580214


In [21]:
games_rnd.head()

Unnamed: 0,Daynum,Numot,Season,Wloc,ast1,ast2,blk1,blk2,dr1,dr2,...,pf2,score1,score2,stl1,stl2,team1,team2,to1,to2,winner
0,10.0,0.0,2003.0,N,8.0,13.0,2.0,1.0,22.0,24.0,...,22.0,62.0,68.0,9.0,7.0,Oklahoma,Alabama,18.0,23.0,2.0
1,10.0,0.0,2003.0,N,16.0,7.0,4.0,6.0,28.0,25.0,...,16.0,70.0,63.0,4.0,8.0,Memphis,Syracuse,13.0,12.0,1.0
2,11.0,0.0,2003.0,N,15.0,9.0,2.0,5.0,26.0,22.0,...,23.0,73.0,61.0,5.0,2.0,Marquette,Villanova,10.0,12.0,1.0
3,11.0,0.0,2003.0,N,11.0,9.0,2.0,3.0,19.0,20.0,...,23.0,56.0,50.0,14.0,4.0,N Illinois,Winthrop,12.0,19.0,1.0
4,11.0,0.0,2003.0,N,12.0,12.0,4.0,1.0,22.0,15.0,...,14.0,77.0,71.0,4.0,7.0,Texas,Georgia,14.0,10.0,1.0


In [46]:
games_winner = pd.DataFrame()

In [47]:
games_rnd.columns

Index([u'Daynum', u'Numot', u'Season', u'Wloc', u'ast1', u'ast2', u'blk1',
       u'blk2', u'dr1', u'dr2', u'fga1', u'fga2', u'fga31', u'fga32', u'fgm1',
       u'fgm2', u'fgm31', u'fgm32', u'fta1', u'fta2', u'ftm1', u'ftm2', u'or1',
       u'or2', u'pf1', u'pf2', u'score1', u'score2', u'stl1', u'stl2',
       u'team1', u'team2', u'to1', u'to2', u'winner'],
      dtype='object')

In [48]:
g = games_rnd[['Season', 'Daynum', 'Numot', 'Wloc', 'winner']]

In [49]:
for i in range(0, len(g)):
    one = games_rnd.iloc[i][[x for x in games_rnd.columns if '1' in x]]
    two = games_rnd.iloc[i][[x for x in games_rnd.columns if '2' in x]]
    temp = pd.concat([g.iloc[i], one, two])
    games_winner = games_winner.append(temp, ignore_index=True)

In [50]:
games_winner.head()

Unnamed: 0,Daynum,Numot,Season,Wloc,ast1,ast2,blk1,blk2,dr1,dr2,...,pf2,score1,score2,stl1,stl2,team1,team2,to1,to2,winner
0,10.0,0.0,2003.0,N,8.0,13.0,2.0,1.0,22.0,24.0,...,22.0,62.0,68.0,9.0,7.0,Oklahoma,Alabama,18.0,23.0,2.0
1,10.0,0.0,2003.0,N,16.0,7.0,4.0,6.0,28.0,25.0,...,16.0,70.0,63.0,4.0,8.0,Memphis,Syracuse,13.0,12.0,1.0
2,11.0,0.0,2003.0,N,15.0,9.0,2.0,5.0,26.0,22.0,...,23.0,73.0,61.0,5.0,2.0,Marquette,Villanova,10.0,12.0,1.0
3,11.0,0.0,2003.0,N,11.0,9.0,2.0,3.0,19.0,20.0,...,23.0,56.0,50.0,14.0,4.0,N Illinois,Winthrop,12.0,19.0,1.0
4,11.0,0.0,2003.0,N,12.0,12.0,4.0,1.0,22.0,15.0,...,14.0,77.0,71.0,4.0,7.0,Texas,Georgia,14.0,10.0,1.0


# Learning
Ok, now we have everything we need. Lets feed the selected input features to a the neural network classifier, and let it learn.

We have to normalize the data, otherwise the features with smaller values will impose a greater weight on the prediction.

In [51]:
from random import random

from IPython.display import SVG
import pygal

from pybrain.structure import SigmoidLayer
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.datasets import ClassificationDataSet
from pybrain.utilities import percentError

from sklearn.preprocessing import StandardScaler

In [52]:
# the features I will feed to the classifier as input data.
input_features = ['Season','score1','score2','fgm1','fga1','fgm31','fga31','ftm1','fta1','or1','dr1','ast1','to1','stl1','blk1','pf1','fgm2','fga2','fgm32','fga32','ftm2','fta2','or2','dr2','ast2','to2','stl2','blk2','pf2']

# the feature giving the result the classifier must learn to predict
output_feature = 'winner'

I defined a normalizer to be able to normalize the data using a simple function rather than a complex one that I would constantly have to google.

In [53]:
def normalize(array):
    scaler = StandardScaler()
    array = scaler.fit_transform(array)

    return scaler, array

I also defined a sample extractor to be able to pull the data out easier when I need it.

In [54]:
def extract_samples(matches, origin_features, result_feature):
    inputs = [tuple(matches.loc[i, feature]
                    for feature in origin_features)
              for i in matches.index]

    outputs = tuple(matches[result_feature].values)

    assert len(inputs) == len(outputs)

    return inputs, outputs

And a basic splitter because I wanted randomized samples whenever I could get them.

In [55]:
def split_samples(inputs, outputs, percent=0.75):
    assert len(inputs) == len(outputs)

    inputs1 = []
    inputs2 = []
    outputs1 = []
    outputs2 = []

    for i, inputs_row in enumerate(inputs):
        if random() < percent:
            input_to = inputs1
            output_to = outputs1
        else:
            input_to = inputs2
            output_to = outputs2

        input_to.append(inputs_row)
        output_to.append(outputs[i])

    return inputs1, outputs1, inputs2, outputs2

In [56]:
inputs, outputs = extract_samples(games_winner,
                                  input_features,
                                  output_feature)

normalizer, inputs = normalize(inputs)

train_inputs, train_outputs, test_inputs, test_outputs = split_samples(inputs, outputs)

n = buildNetwork(len(input_features),
                 10 * len(input_features),
                 10 * len(input_features),
                 1,
                 outclass=SigmoidLayer,
                 bias=True)

To be able to evaluate the results and show progress on the learning cycle, we need these two functions wich help us calculate how well the network can predict the results from the games used to learn, and the games it doesn't know.

In [57]:
def neural_result(input):
    """Call the neural network, and translates its output to a match result."""
    n_output = n.activate(input) 
    if n_output >= 0.5:
        return 2
    else:
        return 1
    
def test_network():
    """Calculate train and test sets errors."""
    print (100 - percentError(map(neural_result, train_inputs), train_outputs), 
           100 - percentError(map(neural_result, test_inputs), test_outputs))

In [58]:
train_set = ClassificationDataSet(len(input_features))

for i, input_line in enumerate(train_inputs):
    train_set.addSample(train_inputs[i], [train_outputs[i] - 1])

trainer = BackpropTrainer(n, dataset=train_set, momentum=0.5, weightdecay=0.0)

train_set.assignClasses()

test_network()

(65.30665816661354, 65.13848559303105)


Train the network, for a given number of iterations. You can re-run this step many times, and it will keep learning but if you train too much you can end up overfitting the training data (this is visible when the test set accuracy starts to decrease).

In [59]:
for i in range(3):
    trainer.train()
    test_network()

(95.76622353889712, 95.23118159481795)
(97.27935799598748, 96.26423944605763)
(98.02748767179794, 96.86173777082868)


In [61]:
for i in range(3):
    trainer.train()
    test_network()

(98.42311514446965, 97.28054500781774)
(98.54686591791199, 97.45365199910654)
(98.88624303901899, 97.66026356935448)


In [62]:
for i in range(3):
    trainer.train()
    test_network()

(99.02499390621192, 97.80545007817734)
(99.31937074606716, 97.97855706946616)
(99.37937112106951, 98.06790261335716)


In [63]:
for i in range(3):
    trainer.train()
    test_network()

(99.47312170701066, 98.17958454322091)
(99.52749704685654, 98.26893008711191)
(99.59124744529653, 98.34710743801652)


The classifier taps out at around 91% accuracy, not bad :)

I pickle all the things just in case I end up losing data

In [64]:
import pickle
pickle.dump(input_features, open('pkl3/input_features.pkl', 'wb'))
pickle.dump(games_winner, open('pkl3/games_winner.pkl', 'wb'))
pickle.dump(train_inputs, open('pkl3/train_inputs.pkl', 'wb'))
pickle.dump(train_outputs, open('pkl3/train_outputs.pkl', 'wb'))
pickle.dump(normalizer, open('pkl3/normalizer.pkl', 'wb'))
pickle.dump(inputs, open('pkl3/inputs.pkl', 'wb'))
pickle.dump(outputs, open('pkl3/outputs.pkl', 'wb'))
pickle.dump(n, open('pkl3/n.pkl', 'wb'))
pickle.dump(team_stats, open('pkl3/team_stats.pkl', 'wb'))

In [44]:
import pickle
input_features = pickle.load(open('pkl3/input_features.pkl', 'rb'))
n = pickle.load(open('pkl3/n.pkl', 'rb'))
outputs = pickle.load(open('pkl3/outputs.pkl', 'rb'))
train_outputs = pickle.load(open('pkl3/train_outputs.pkl', 'rb'))
inputs = pickle.load(open('pkl3/inputs.pkl', 'rb'))
normalizer = pickle.load(open('pkl3/normalizer.pkl', 'rb'))
train_inputs = pickle.load(open('pkl3/train_inputs.pkl', 'rb'))
games_winner = pickle.load(open('pkl3/games_winner.pkl', 'rb'))
team_stats = pickle.load(open('pkl3/team_stats.pkl', 'rb'))

# Prediction
With the classifier already trained, we can start making predictions. But we need a little function able to translate inputs like this: (2014, 'California', 'Hawaii'), to the numeric inputs the classifier expects (based on the input features).

This method gets the average stats of two teams when they played each other in a certain year that they played.

In [65]:
def get_last_games_avg_both(team1, team2, season):
    team_1 = games_winner[(games_winner['team1'] == team1)&(games_winner['team2'] == team2)&(games_winner['Season'] == season)]
    team_2 = games_winner[(games_winner['team1'] == team2)&(games_winner['team2'] == team1)&(games_winner['Season'] == season)]
#     team_2 = games_winner[(games_winner['team2'] == team)&(games_winner['Season'] == season)][[x for x in games_winner.columns.values if '2' in x]+['Daynum', 'Season']]
    
    team_1_first = team_1[(team_1['team1'] == team1)&(team_1['Season'] == season)][[x for x in team_1.columns.values if '1' in x]+['Daynum', 'Season']]
    team_2_first = team_1[(team_1['team2'] == team2)&(team_1['Season'] == season)][[x for x in team_1.columns.values if '2' in x]+['Daynum', 'Season']]
    
    team_1_second = team_2[(team_2['team1'] == team1)&(team_2['Season'] == season)][[x for x in team_2.columns.values if '1' in x]+['Daynum', 'Season']]
    team_2_second = team_2[(team_2['team2'] == team2)&(team_2['Season'] == season)][[x for x in team_2.columns.values if '2' in x]+['Daynum', 'Season']]
    
    team_1_first.columns = team_1_first.columns.map(lambda x: x.strip('1'))
    team_2_first.columns = team_2_first.columns.map(lambda x: x.strip('2'))
    
    team_1_second.columns = team_1_second.columns.map(lambda x: x.strip('1'))
    team_2_second.columns = team_2_second.columns.map(lambda x: x.strip('2'))

    games = pd.concat([team_1_first, team_2_first, team_1_second, team_2_second])
    games = games.sort_values('Daynum')
    del games['Daynum']
    return games.groupby(['Season','team'])['ast', 'blk', 'dr', 'fga', 'fga3', 'fgm', 'fgm3', 'fta', 'ftm', 'or', 'pf', 'score', 'stl', 'to'].mean().reset_index()

This method gets the last n games that a team has played in a certain year

In [66]:
def get_last_n_games_avg(n, team, season):
    team_1 = games_winner[(games_winner['team1'] == team)&(games_winner['Season'] == season)][[x for x in games_winner.columns.values if '1' in x]+['Daynum', 'Season']]
    team_2 = games_winner[(games_winner['team2'] == team)&(games_winner['Season'] == season)][[x for x in games_winner.columns.values if '2' in x]+['Daynum', 'Season']]

    team_1.columns = team_1.columns.map(lambda x: x.strip('1'))
    team_2.columns = team_2.columns.map(lambda x: x.strip('2'))

    games = pd.concat([team_1, team_2])
    games = games.sort_values('Daynum')
    del games['Daynum']
    return games.groupby(['Season','team'])['ast', 'blk', 'dr', 'fga', 'fga3', 'fgm', 'fgm3', 'fta', 'ftm', 'or', 'pf', 'score', 'stl', 'to'].mean().reset_index()

This function does the conversion, also normalizes the data with the same normalizer used before, and then just asks the classifier for the prediction.

In [67]:
def neural_result(input):
    """Call the neural network, and translates its output to a match result."""
    n_output = n.activate(input) 
    if n_output >= 0.5:
        return 2
    else:
        return 1

def predict(year, team1, team2):
    inputs = []
    diff_year_1 = ''
    diff_year_2 = ''
    n_games = 20
    if year in range(2003, 2017):
        if len(get_last_games_avg_both(team1, team2, year)) != 0:
            team_stats = get_last_games_avg_both(team1, team2, year)
            print 'From avg both'
        else:
            team_stats = pd.concat([get_last_n_games_avg(n_games, team1, year), get_last_n_games_avg(n_games, team2, year)])
            print 'From avg n games 1'
    elif year < 2003:
        if len(get_last_games_avg_both(team1, team2, 2003)) != 0:
            team_stats = get_last_games_avg_both(team1, team2, year)
            print 'From avg both'
        else:
            team_stats = pd.concat([get_last_n_games_avg(n_games, team1, 2003), get_last_n_games_avg(n_games, team2, 2003)])
            print 'From avg n games 2'
    else:
        if len(get_last_games_avg_both(team1, team2, 2016)) != 0:
            team_stats = get_last_games_avg_both(team1, team2, year)
            print 'From avg both'
        else:
            team_stats = pd.concat([get_last_n_games_avg(n_games, team1, 2016), get_last_n_games_avg(n_games, team2, 2016)])
            print 'From avg n games 3'
    for feature in input_features:
        from_team_2 = '2' in feature
        feature = feature.replace('2', '')
        feature = feature.replace('1', '')
        if feature in [x for x in team_stats.columns.values if x != 'Season']:
            team = team2 if from_team_2 else team1
            try:
                value = team_stats[(team_stats.team == team)&(team_stats.Season == year)].iloc[[0]][feature].values[0]
            except:
                if from_team_2:
                    diff_year_2 = team
                else:
                    diff_year_1 = team
                print team_stats.columns
                print feature
                print team_stats[team_stats['team'] == team]
                value = team_stats[team_stats['team'] == team].iloc[[-1]][feature].values[0]
        elif feature == 'Season':
            value = year
        else:
            raise ValueError("Don't know where to get feature: " + feature)
        inputs.append(value)

    inputs = normalizer.transform(np.array(inputs).reshape((1, -1)))
    result = neural_result(inputs[0])

    results = ''
    if diff_year_1 != '':
        year_used = team_stats[team_stats['team'] == team1].iloc[[-1]]['Season'].values[0]
        results += "Couldn't find data from "+str(year)+" for team1 = "+diff_year_1+", used "+str(int(year_used))+" instead.\n"
    if diff_year_2 != '':
        year_used = team_stats[team_stats['team'] == team2].iloc[[-1]]['Season'].values[0]
        results += "Couldn't find data from "+str(year)+" for team1 = "+diff_year_2+", used "+str(int(year_used))+" instead.\n"
    
    if results:
        print results
    
    if result == 1:
        return team1
    elif result == 2:
        return team2
    else:
        return 'Unknown result: ' + str(result)

In [68]:
# get_last_n_games_avg(20, 'Tennessee', 2016)
len(games_winner)

71241

In [69]:
pd.concat([get_last_n_games_avg(20, 'Michigan St', 2016), get_last_n_games_avg(20, 'Tennessee', 2016)])

Unnamed: 0,Season,team,ast,blk,dr,fga,fga3,fgm,fgm3,fta,ftm,or,pf,score,stl,to
0,2016.0,Michigan St,20.558824,5.147059,29.558824,59.529412,21.029412,28.794118,9.117647,17.882353,13.058824,12.294118,19.088235,79.764706,4.411765,11.794118
0,2016.0,Tennessee,13.470588,4.588235,25.411765,62.235294,22.558824,26.176471,7.382353,21.205882,15.382353,12.588235,19.470588,75.117647,5.294118,11.176471


In [70]:
# Another upset
predict(2016, 'Michigan St', 'Tennessee') #Wrong

From avg n games 1


'Michigan St'

Some predictions about the past, compared to real results:
Even while we know those results and some of them we're used to train the classifier, that doesn't guarantee the real result is what the classifier will predict.

In [71]:
# 1st seed agaisnt 16th seed
predict(2016, 'Kansas', 'Austin Peay') #Correct

From avg n games 1


'Kansas'

In [72]:
# 1st seed agaisnt 16th seed
predict(2016, 'North Carolina', 'FL Gulf Coast') #Correct

From avg n games 1


'North Carolina'

In [73]:
# 1st seed agaisnt 16th seed
predict(2016, 'Oregon', 'Holy Cross') #Correct

From avg n games 1


'Oregon'

In [74]:
# 1st seed agaisnt 16th seed
predict(2016, 'Virginia', 'Hampton') #Correct

From avg n games 1


'Hampton'

Lets follow kansas through the tournament

In [75]:
predict(2016, 'Kansas', 'Connecticut') #Correct

From avg n games 1


'Kansas'

In [76]:
predict(2016, 'Kansas', 'Maryland') #Correct

From avg n games 1


'Kansas'

In [77]:
predict(2016, 'Kansas', 'Villanova') #Wrong

From avg n games 1


'Kansas'

Lets follow Villanova

In [78]:
predict(2016, 'Villanova', 'Oklahoma') #Wrong

From avg both


'Oklahoma'

In [79]:
predict(2016, 'Villanova', 'North Carolina') #Correct

From avg n games 1


'North Carolina'

In [93]:
# What about a huge upset?
predict(2016, 'SF Austin', 'West Virginia') #Wrong

'West Virginia'

In [105]:
# Another upset
predict(2016, 'Michigan St', 'Tennessee') #Wrong

From avg n games
Index([u'Season', u'team', u'ast', u'blk', u'dr', u'fga', u'fga3', u'fgm',
       u'fgm3', u'fta', u'ftm', u'or', u'pf', u'score', u'stl', u'to'],
      dtype='object')
score


IndexError: positional indexers are out-of-bounds

In [95]:
# Another upset
predict(2016, 'Purdue', 'Ark Little Rock') #Wrong

'Purdue'

In [96]:
# Another upset
predict(2016, 'California', 'Hawaii') #Correct

'California'

## Some predictions about the future/past:
Future and past prediction will not work and the predict method will use data from the closest year where there is data available instead.

In [97]:
predict(1984, 'California', 'Hawaii')

IndexError: positional indexers are out-of-bounds

In [106]:
predict(2017, 'California', 'Hawaii')

Couldn't find data from 2017 for team1 = California, used 2016 instead.
Couldn't find data from 2017 for team1 = Hawaii, used 2016 instead.



'Hawaii'