In [13]:
import csv
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Concatenate
from keras.regularizers import l1_l2
from keras.optimizers import Adam
from keras.losses import binary_crossentropy
from keras import backend as K
from keras.utils.vis_utils import plot_model
from sklearn.cross_validation import StratifiedKFold
import csv
import pydot
import graphviz

Here, we clean up the data. We create 2 dataframes, one for winners and one for losers. Each has the same column names (team, score, fgm, fga, fgm3, fga3, ftm, fta, or, dr, ast, stl, blk, pf).
<br> We will now use these dataframes to build our representative team vectors for the season, putting together the information from the winners and the losers.

In [14]:
# Candidate per-team attributes, un-prefixed (the loader adds the 'W'/'L' prefix).
all_attributes = [
    'team', 'score', 'fgm', 'fga', 'fgm3', 'fga3', 'ftm',
    'fta', 'or', 'dr', 'ast', 'stl', 'blk', 'pf',
]
# Attributes actually used downstream: everything above except the trailing 'pf'.
imp_attributes = all_attributes[:-1]

def load_team_data(reg_season_data_filename = "RegularSeasonDetailedResults.csv", attributes = None):
    """
    Load per-game regular-season stats and split them into one frame per season.

    Every game contributes two rows: one built from the 'W'-prefixed columns
    (the winner) and one from the 'L'-prefixed columns (the loser). Both are
    renamed to the same un-prefixed column names so they can be stacked.

    Parameters
    ----------
    reg_season_data_filename : str
        CSV to read. It must contain a 'Season' column plus 'W<attr>' and
        'L<attr>' columns for every requested attribute.
    attributes : list of str, optional
        Un-prefixed attribute names to keep. Defaults to the notebook-level
        ``imp_attributes``.

    Returns
    -------
    by_reg_season : dict
        Maps each season to a DataFrame of that season's rows (winners first,
        then losers) with the 'Season' column dropped.
    reg_season_years : numpy.ndarray
        The distinct seasons, in order of first appearance in the file.
    """
    if attributes is None:
        attributes = imp_attributes

    reg_season_data = pd.read_csv(reg_season_data_filename)

    def side_frame(prefix):
        # Select one side's columns and strip the prefix from their names.
        # index = str is kept from the original code (row labels become strings).
        prefixed = [prefix + name for name in attributes]
        remap = dict(zip(prefixed, attributes))
        return reg_season_data[['Season'] + prefixed].rename(index = str, columns = remap)

    # Winners and losers stacked into a single frame with identical columns.
    team_data = pd.concat([side_frame('W'), side_frame('L')])

    reg_season_years = reg_season_data['Season'].unique()
    by_reg_season = {
        year: team_data[team_data['Season'] == year].drop(['Season'], axis = 1)
        for year in reg_season_years
    }
    return by_reg_season, reg_season_years


In [15]:
# Per-season frames of game rows (winner rows and loser rows stacked) and the list of seasons found.
by_reg_season, reg_season_years = load_team_data()
# display(by_reg_season)

We define each team's vector as:
\[Score, FG Made, FG Attempted, 3 Pointers Made, 3 Pointers attempted, FT made, FT Attempted, Off Rebounds, Def Rebounds, Assists, Steals, Blocks\]. (Later in the code we add the team's seed to their vectors)
<br> <br>
In this following cell we aggregate the data and calculate each team's average statistics for each year. To store this, we keep 1 dataframe (indexed by team) for each year of data. The pointers to these objects are stored in our dictionary, which serves as a database.

In [16]:
def get_team_vector_store(by_reg_season, reg_season_years):
    """
    Average each team's per-game stats within every season.

    Parameters
    ----------
    by_reg_season : dict
        Maps season -> DataFrame with a 'team' column and one column per stat.
    reg_season_years : iterable
        Seasons to process; each must be a key of ``by_reg_season``.

    Returns
    -------
    dict
        Maps season -> DataFrame indexed by team whose columns are the means
        of ``imp_attributes[1:]`` (every attribute except 'team').
    """
    stat_columns = imp_attributes[1:]
    # groupby(...).mean() already yields the final frame; the former
    # identity `.apply(lambda x: x, axis = 0)` only made a redundant copy.
    return {
        year: by_reg_season[year].groupby('team')[stat_columns].mean()
        for year in reg_season_years
    }


In [17]:
# Collapse the per-game rows into one averaged stat vector per team, per season.
team_vector_store = get_team_vector_store(by_reg_season, reg_season_years)

We define a function to normalize the data. It can be applied at whatever point we want. For a standard normal scaling (zero mean, unit variance per column), set standard=True; with standard=False each column is instead divided by its maximum value.

We test using both methods of normalization to see which gave us better results.

In [18]:
def normalize_data(team_vector_store, standard = False):
    """
    Normalize every season's team-vector frame, column by column.

    IMPORTANT: modifies the dictionary passed in (each season's frame is
    replaced by its normalized version); nothing is returned.

    Parameters
    ----------
    team_vector_store : dict
        Maps season -> numeric DataFrame of team vectors.
    standard : bool
        True  -> z-score each column: (x - mean) / std.
        False -> divide each column by its maximum (the largest value becomes 1).
    """
    # Iterate over the store's own keys rather than the notebook-level
    # `reg_season_years`, so the function works on whatever store it is given.
    for season in list(team_vector_store):
        frame = team_vector_store[season]
        if standard:
            team_vector_store[season] = (frame - frame.mean()) / frame.std()
        else:
            team_vector_store[season] = frame / frame.max()

We define a matchup to be the difference between the vectors of team1 and team2. If team 1 wins, we classify the matchup as a 1, while we classify it as a 0 if team2 wins. Below is a quick example of how it could work. However, we must now add each team's seed to their representative vector.

In [19]:
# Worked example: build one matchup vector by hand for a single season.
year = 2017
m = team_vector_store[year]
# display(m.loc[1274])
# display(m.loc[1199])
# 1274 and 1199 are row labels (team ids) of the season frame; the matchup is team_1 - team_2.
matchup_1 = (m.loc[1274] - m.loc[1199])
# Label the example as a win for the first team (1274).
matchup_1['class'] = 1
# display(matchup_1)

Below, we modify each season's team vector dataframe by adding each team's NCAA tournament seed. This will allow us to use this as a feature when outputting predictions.

In [20]:
def load_seed_data(filename):
    """
    Load tournament seeds and reduce each seed to its integer value.

    Parameters
    ----------
    filename : str
        CSV with (at least) 'Season' and 'Seed' columns; 'Seed' holds strings
        that contain the numeric seed.

    Returns
    -------
    pandas.DataFrame
        Rows with Season >= 2003, re-indexed from 0, with all column names
        lower-cased and 'seed' converted to int (first run of digits in the
        original string).
    """
    seeds = pd.read_csv(filename)
    seeds = seeds.query('Season >= 2003').reset_index(drop=True)
    seeds = seeds.rename(columns = str.lower)
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    seeds['seed'] = seeds['seed'].str.extract(r'(\d+)', expand = False).astype(int)
    return seeds

In [21]:
# Seeds for seasons 2003 onward, with each seed reduced to its integer part.
tourney_seeds_data = load_seed_data("TourneySeeds.csv")

In [22]:
def update_vector_store(team_vector_store, tourney_seeds_data):
    """
    Add each team's tournament seed to its season vector.

    For every season in the store, the season frame (indexed by team) is
    joined with that season's seeds; rows without a seed are then dropped,
    so only seeded teams remain.

    Parameters
    ----------
    team_vector_store : dict
        Maps season -> DataFrame indexed by team. Modified in place.
    tourney_seeds_data : pandas.DataFrame
        Must have 'season', 'team' and 'seed' columns.

    Returns
    -------
    dict
        The same ``team_vector_store`` object, for convenience.
    """
    # Iterate over the store's own keys instead of the notebook-level
    # `reg_season_years`, so the function only depends on its arguments.
    for season in list(team_vector_store):
        season_seeds = tourney_seeds_data.query('season == {}'.format(season))
        season_seeds = season_seeds.drop('season', axis = 1)
        merged = team_vector_store[season].merge(season_seeds,
                                                 how = 'outer',
                                                 left_index = True,
                                                 right_on = 'team')
        merged = merged.set_index('team')
        # Teams that never received a seed have NaN here after the outer join.
        team_vector_store[season] = merged[pd.notnull(merged['seed'])]
    return team_vector_store

In [23]:
# Attach seeds and keep only seeded teams in every season frame.
team_vector_store = update_vector_store(team_vector_store, tourney_seeds_data)

In [24]:
# Divide every column (the 'seed' column included) by its per-season maximum; mutates team_vector_store.
normalize_data(team_vector_store, standard = False)

Now, we must use our team vectors along with the matchups each season to build our feature vectors, which we will use for classification. A 1 will signify that the 1st team won, while a 0 means the 2nd team won.

Our NCAA tournament data is in the same format as the regular season data. In order to have a combination of 1s and 0s, we generate 2 different vectors for each matchup: the first has team_1 = winning_team, while the second has team_1 = losing_team. Because each game in our data always lists the winner first, if we did not do this every vector would carry the same label, and the model could not learn a decision boundary.

In [25]:
def generate_training_data(team_vector_store, classification = True, filename = "TourneyDetailedResults.csv"):
    """
    Build the matchup training set from the tournament results.

    Each game yields two rows: (winner - loser) and (loser - winner), so both
    labels are represented.

    Parameters
    ----------
    team_vector_store : dict
        Maps season -> DataFrame of team vectors indexed by team id. Every
        season and team appearing in the results file must be present.
    classification : bool
        True  -> 'class' is 1 for the winner-first row and 0 for the other.
        False -> 'class' is the signed score margin (team_1 score - team_2 score).
    filename : str
        CSV with 'Season', 'Wteam', 'Lteam', 'Wscore' and 'Lscore' columns.

    Returns
    -------
    pandas.DataFrame
        One row per (game, ordering), with the team-vector columns followed
        by 'class', indexed 0..n-1.
    """
    games = pd.read_csv(filename)[['Season', 'Wteam', 'Lteam', 'Wscore', 'Lscore']]

    # All season frames share the same columns; take them from the first one.
    col_names = list(next(iter(team_vector_store.values()))) + ['class']

    # Collect rows in a list and build the frame once: appending to a
    # DataFrame inside the loop is quadratic, and DataFrame.append no longer
    # exists in pandas >= 2.0.
    rows = []
    for season, wteam, lteam, wscore, lscore in games.itertuples(index = False, name = None):
        season_vectors = team_vector_store[season]
        win_vector = season_vectors.loc[wteam]
        lose_vector = season_vectors.loc[lteam]

        # Winner is team 1, loser is team 2
        game_vector_1 = win_vector - lose_vector
        game_vector_1['class'] = 1 if classification else wscore - lscore

        # Loser is team 1, winner is team 2
        game_vector_2 = lose_vector - win_vector
        game_vector_2['class'] = 0 if classification else lscore - wscore

        rows.extend([game_vector_1, game_vector_2])

    return pd.DataFrame(rows, columns = col_names).reset_index(drop = True)

In [26]:
# Same matchups twice: 'class' is a 0/1 winner flag in the first frame and the signed score margin in the second.
train_matchup_data_class = generate_training_data(team_vector_store, classification = True)
train_matchup_data_reg = generate_training_data(team_vector_store, classification = False)

We now have our training data built up. The X data is every column except 'class', while the Y data is simply the 'class' column.

Below, we define functions to build 2 different kinds of simple neural networks:
1. A binary classification network: 
    - Takes matchup vectors (team_1 - team_2) as inputs, and outputs a binary digit prediction (1 if team_1 wins, 0 if team_2 wins).
2. A regression network: 
    - Takes matchup vectors (team_1 - team_2) as inputs, and outputs a scalar prediction for the difference between scores (team_1_score - team_2_score).
    - We then use these predicted scores to predict the matchup victor (1 if predicted score > 0, and 0 otherwise)
    
In order to evaluate the regression network's predictions, we define a `pred_percentage` function, which returns the fraction of games in which the victor is predicted correctly (i.e. the predicted score difference has the same sign as the true one).

In [27]:
def pred_percentage(y_true, y_pred):
    """
    Keras metric: share of samples whose predicted margin has the right sign.

    The product of the two signs is +1 when they agree, -1 when they differ
    and 0 when either value is exactly zero; (product + 1) / 2 maps these to
    1, 0 and 0.5 respectively, and the mean over the batch is returned.
    """
    agreement = tf.multiply(tf.sign(y_true), tf.sign(y_pred))
    return K.mean(agreement + 1) / 2

def build_keras_model(input_shape, num_units, num_layers, classification = True):
    """
    Build and compile a simple fully connected network on matchup vectors.

    Parameters
    ----------
    input_shape : tuple
        Shape of the training matrix; only ``input_shape[1]`` (the number of
        features) is used.
    num_units : int
        Width of every hidden layer.
    num_layers : int
        Number of hidden layers. Dropout (0.25) follows the first one only.
    classification : bool
        True  -> sigmoid output, binary cross-entropy loss, accuracy metric.
        False -> linear output, mean absolute error loss, pred_percentage metric.

    Returns
    -------
    A compiled keras Sequential model.
    """
    def hidden_layer(**extra):
        # ReLU layer with kernel regularizer l1_l2(0, 0.001), shared by all hidden layers.
        return Dense(units = num_units,
                     activation = 'relu',
                     kernel_regularizer = l1_l2(0, 0.001),
                     **extra)

    model = Sequential()
    model.add(hidden_layer(input_dim = input_shape[1]))
    model.add(Dropout(0.25))
    for _ in range(1, num_layers):
        model.add(hidden_layer())

    if classification:
        output_layer = Dense(1, activation = 'sigmoid')
        loss, metrics = 'binary_crossentropy', ['accuracy']
    else:
        output_layer = Dense(1)
        loss, metrics = 'mae', [pred_percentage]

    model.add(output_layer)
    model.compile(loss = loss, optimizer = Adam(), metrics = metrics)
    return model

def train_and_evaluate(model, X_train, Y_train, X_test, Y_test, epochs):
    """
    Fit ``model`` on the training split and score it on the test split.

    Training and evaluation both use batch_size = 1. The value returned by
    ``model.evaluate`` is expected to hold the loss first and the compiled
    metric second; both are printed and the metric is returned.
    """
    model.fit(X_train, Y_train, epochs = epochs, batch_size = 1)
    scores = model.evaluate(X_test, Y_test, batch_size = 1)
    held_out_loss = scores[0]
    held_out_metric = scores[1]
    print("Loss was: {} and accuracy was: {}".format(held_out_loss, held_out_metric))
    return held_out_metric

Below, we use cross-validation (with 10 folds) to test the performance of our binary classification neural network. This process splits the data into 10 randomized chunks. It then trains the network on 9 of the 10 chunks and tests the predictions on the remaining chunk, repeating this so that each chunk is used for testing once. The average accuracy over the folds is printed below, and serves as a proxy for the performance of this neural network on unknown data.

In [28]:
# Features are every column except the label; the label is the 0/1 'class' column.
X_train_class = train_matchup_data_class.drop(['class'], axis = 1)
input_shape = X_train_class.shape
Y_train_class = train_matchup_data_class['class']

# Hidden-layer width and depth passed to build_keras_model.
num_units = 2*len(imp_attributes)
num_layers = 3

n_folds = 10
epochs = 5
# NOTE(review): this is the legacy sklearn.cross_validation signature
# (labels passed to the constructor, `n_folds=`); the object is iterated
# directly to obtain (train, test) index arrays.
folds = StratifiedKFold(Y_train_class, n_folds = n_folds, shuffle = True)


accuracies = []
for i, (train, test) in enumerate(folds):
    print("Running fold: {}".format(i+1))
    # A fresh model is built for every fold so no weights carry over.
    model = None
    model = build_keras_model(input_shape, num_units, num_layers, classification = True)
    accuracy = train_and_evaluate(model, X_train_class.iloc[train], 
                       Y_train_class.iloc[train], 
                       X_train_class.iloc[test], 
                       Y_train_class.iloc[test], 
                       epochs = epochs)
    accuracies.append(accuracy)

print("The average accuracy of predictions was: {}".format(np.mean(accuracies)))

Running fold: 1
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 0.5936589991777308 and accuracy was: 0.6458333333333334
Running fold: 2
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 0.5678695892501209 and accuracy was: 0.7083333333333334
Running fold: 3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 0.5718523275831507 and accuracy was: 0.7291666666666666
Running fold: 4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 0.6356644554352257 and accuracy was: 0.6549295774647887
Running fold: 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 0.5737729076243622 and accuracy was: 0.7323943661971831
Running fold: 6
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 0.5473388294413896 and accuracy was: 0.7676056338028169
Running fold: 7
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 0.6110743690596919 and accuracy was: 0.7112676056338029
Running fold: 8
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 

Now, we test the performance of our regression neural network. As above, we use cross validation (with 10 folds) in order to test our network on our given dataset. 

In [29]:
# Features are every column except the label; here the label is the signed score margin.
X_train_reg = train_matchup_data_reg.drop(['class'], axis = 1)
input_shape = X_train_reg.shape
Y_train_reg = train_matchup_data_reg['class']

# Hidden-layer width and depth passed to build_keras_model.
num_units = 2*len(imp_attributes)
num_layers = 3

n_folds = 10
epochs = 5
# NOTE(review): Y_train_reg holds score margins, so stratifying on it treats
# every distinct margin as its own class — presumably a plain (unstratified)
# k-fold split was intended; confirm. Legacy sklearn.cross_validation signature.
folds = StratifiedKFold(Y_train_reg, n_folds = n_folds, shuffle = True)


accuracies = []
for i, (train, test) in enumerate(folds):
    print("Running fold: {}".format(i+1))
    # A fresh model is built for every fold so no weights carry over.
    model = None
    model = build_keras_model(input_shape, num_units, num_layers, classification = False)
    # The returned value is the pred_percentage metric of the regression model.
    accuracy = train_and_evaluate(model, X_train_reg.iloc[train], 
                       Y_train_reg.iloc[train], 
                       X_train_reg.iloc[test], 
                       Y_train_reg.iloc[test], 
                       epochs = epochs)
    accuracies.append(accuracy)

print("The average accuracy of predictions was: {}".format(np.mean(accuracies)))

Running fold: 1




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 9.243078572357573 and accuracy was: 0.6951219512195121
Running fold: 2
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 10.020169482299476 and accuracy was: 0.6369426751592356
Running fold: 3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 8.797356285988108 and accuracy was: 0.7631578947368421
Running fold: 4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 8.431226476523522 and accuracy was: 0.7027027027027027
Running fold: 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 8.294162594936264 and accuracy was: 0.6971830985915493
Running fold: 6
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 8.302843140727944 and accuracy was: 0.7407407407407407
Running fold: 7
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 9.539671704840304 and accuracy was: 0.7014925373134329
Running fold: 8
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 9.05455408863319 and a

The final neural network we will test is a bit more complex. Instead of taking a matchup vector as input, this neural network is simply provided the two team vectors. This allows the neural network to learn its own definition for a "matchup", instead of having to use ours. As with model #2, this is a regression network, which means its predictions represent the difference between the scores of the teams. 

In order to build this, we have to build our training set in a different way: building two different training sets in conjunction (one for team_1s and the other for team_2s), and then the labels in a third pandas series.

In [30]:
def gen_new_data(team_vector_store, filename = "TourneyDetailedResults.csv"):
    """
    Generate the inputs for the two-input (third) neural network.

    Each tournament game yields two samples: (winner, loser) with the
    positive score margin and (loser, winner) with the negative margin.

    Parameters
    ----------
    team_vector_store : dict
        Maps season -> DataFrame of team vectors indexed by team id. Every
        season and team appearing in the results file must be present.
    filename : str
        CSV with 'Season', 'Wteam', 'Lteam', 'Wscore' and 'Lscore' columns.

    Returns
    -------
    team_ones, team_twos : pandas.DataFrame
        Row i of each frame is the vector of the first / second team of
        sample i; the row labels are the team ids.
    one_minus_two : pandas.Series
        Score of team one minus score of team two for every sample.
    """
    games = pd.read_csv(filename)[['Season', 'Wteam', 'Lteam', 'Wscore', 'Lscore']]

    # All season frames share the same columns; take them from the first one.
    col_names = list(next(iter(team_vector_store.values())))

    # Accumulate in lists and build the frames once: DataFrame.append in a
    # loop is quadratic and was removed in pandas 2.0.
    first_rows, second_rows, margins = [], [], []
    for season, wteam, lteam, wscore, lscore in games.itertuples(index = False, name = None):
        season_vectors = team_vector_store[season]
        win_vector = season_vectors.loc[wteam]
        lose_vector = season_vectors.loc[lteam]

        # Sample 1: win minus lose. Sample 2: lose minus win.
        first_rows += [win_vector, lose_vector]
        second_rows += [lose_vector, win_vector]
        margins += [wscore - lscore, lscore - wscore]

    team_ones = pd.DataFrame(first_rows, columns = col_names)
    team_twos = pd.DataFrame(second_rows, columns = col_names)
    one_minus_two = pd.Series(margins)
    return team_ones, team_twos, one_minus_two


In [31]:
# Paired team-vector frames (first team, second team) and the matching score margins.
new_team_ones, new_team_twos, one_minus_two = gen_new_data(team_vector_store)

# display(new_team_ones)

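Below, we define a function to create the third model. Each team vector first passes through a shared dense layer with one unit per team feature; the two results are concatenated and fed through 3 hidden layers, each with twice as many units as there are team features, followed by a single linear output.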

In [32]:
def make_reg_model(team_feature_length):
    """
    Build and compile the two-input regression network.

    Both team vectors go through the same (weight-sharing) dense layer, the
    two encodings are concatenated, passed through dropout and three tanh
    layers of width ``2 * team_feature_length``, and reduced to one linear
    output (the predicted score margin).

    Parameters
    ----------
    team_feature_length : int
        Number of features in a single team vector.

    Returns
    -------
    A keras Model taking ``[team_1, team_2]`` inputs, compiled with the
    'Adam' optimizer, 'mae' loss and the pred_percentage metric.
    """
    def regularized_tanh(width):
        # tanh layer with kernel regularizer l1_l2(0, 0.001).
        return Dense(width,
                     activation = 'tanh',
                     kernel_regularizer = l1_l2(0, 0.001))

    t1 = Input(shape = (team_feature_length, ))
    t2 = Input(shape = (team_feature_length, ))

    # One layer object applied to both inputs, so both teams share weights.
    shared_encoder = regularized_tanh(team_feature_length)
    encoded_pair = [shared_encoder(t1), shared_encoder(t2)]

    merged = Concatenate(axis=-1)(encoded_pair)
    hidden = Dropout(0.2)(merged)

    for _ in range(3):
        hidden = regularized_tanh(2*team_feature_length)(hidden)

    pred = Dense(1)(hidden) #linear activation for regression
    model = Model(inputs = [t1, t2], outputs = pred)
    model.compile(optimizer = 'Adam', loss = 'mae', metrics = [pred_percentage])

    return model

def new_train_and_evaluate(model, X_train, Y_train, X_test, Y_test, epochs):
    """
    Fit ``model`` on the training split and score it on the test split.

    Training and evaluation both use batch_size = 1. The value returned by
    ``model.evaluate`` is expected to hold the loss first and the compiled
    metric (prediction percentage) second.

    Returns
    -------
    The metric value (second entry of the evaluation result), so callers can
    collect it across folds. Previously nothing was returned.
    """
    model.fit(X_train, Y_train, epochs = epochs, batch_size = 1)
    loss = model.evaluate(X_test, Y_test, batch_size = 1)
    print("Loss was: {} and prediction percentage was: {}".format(loss[0], loss[1]))
    return loss[1]

Below, we test this third neural network using the same process as above.

In [33]:
# display(one_minus_two)
# Width of each team input, taken from the data itself. The previous
# len(imp_attributes) only matched because imp_attributes also counts 'team'
# while the team vectors carry an extra 'seed' column.
team_feature_length = new_team_ones.shape[1]

n_folds = 10
epochs = 5
# NOTE(review): one_minus_two holds score margins, so stratifying on it treats
# every distinct margin as its own class — presumably a plain (unstratified)
# k-fold split was intended; confirm. Legacy sklearn.cross_validation signature.
folds = StratifiedKFold(one_minus_two, n_folds = n_folds, shuffle = True)

percentages = []
for i, (train, test) in enumerate(folds):
    print("Running fold: {}".format(i+1))
    # A fresh model is built for every fold so no weights carry over.
    model = None
    model = make_reg_model(team_feature_length)
    # The two-input model takes a list [team_1 frame, team_2 frame].
    percentage = train_and_evaluate(model,
                       [new_team_ones.iloc[train], new_team_twos.iloc[train]],
                       one_minus_two.iloc[train], 
                       [new_team_ones.iloc[test], new_team_twos.iloc[test]],
                       one_minus_two.iloc[test], 
                       epochs = epochs)
    percentages.append(percentage)

print("Average prediction accuracy: {}".format(np.mean(percentages)))



Running fold: 1
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 9.093047359829429 and accuracy was: 0.6666666666666666
Running fold: 2
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 8.597993508711749 and accuracy was: 0.7189542483660131
Running fold: 3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 10.103317923183683 and accuracy was: 0.7215189873417721
Running fold: 4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 9.613758764966251 and accuracy was: 0.7777777777777778
Running fold: 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 9.161805195616676 and accuracy was: 0.6993006993006993
Running fold: 6
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 9.000069025224143 and accuracy was: 0.6861313868613139
Running fold: 7
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 10.051630956521869 and accuracy was: 0.656934306569343
Running fold: 8
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss was: 8.5983

We have now created and tested 3 different models:
1. Simple binary classifier. 
    - Uses our definition of matchups (team_1 - team_2), and outputs 1 if team_1 wins, 0 if team_2 wins.

2. Simple Regression.
    - Uses our definition of matchups, and tries to predict the difference between points of team_1 and team_2.
    - We then use a scoring function (return 1 if score_diff > 0, and 0 otherwise) in order to generate predictions.

3. More Complex Regression.
    - We feed just the team vectors into the model, and let the model learn an effective definition for what a matchup is.
    - The model then outputs a prediction of the score differential (team_1_score - team_2_score).
    - We then use the scoring function defined above to generate predictions.
    
    

Now we will train each of these models on the full training data and use them to generate predictions for every possible pairing of tournament teams in the 2011, 2012 and 2013 seasons.

In [34]:
def generate_matchup_testing_data(team_vector_store, years = (2011, 2012, 2013)):
    """
    Build the matchup vectors for every pair of teams in the given seasons.

    Within each season the teams are sorted by id, and every pair (i1, i2)
    with i2 > i1 contributes one row: vector(i1) - vector(i2).

    Parameters
    ----------
    team_vector_store : dict
        Maps season -> DataFrame of team vectors indexed by team id.
    years : iterable
        Seasons to generate pairs for; each must be a key of the store.

    Returns
    -------
    testing_matchup_data : pandas.DataFrame
        One row per pair, indexed 0..n-1, with the team-vector columns.
    matchups : list
        ``[year, i1, i2]`` for every row, in the same order.
    """
    # All season frames share the same columns; take them from the first one.
    col_names = list(next(iter(team_vector_store.values())))

    diff_blocks = []
    matchups = []

    for year in years:
        season_data = team_vector_store[year].sort_index()
        team_ids = season_data.index.tolist()
        vectors = season_data[col_names].values.astype(float)

        # Positions (a, b) with b after a in the sorted order; the id check
        # keeps the original strict `i2 > i1` rule even for repeated ids.
        n_teams = len(team_ids)
        pairs = [(a, b)
                 for a in range(n_teams)
                 for b in range(a + 1, n_teams)
                 if team_ids[b] > team_ids[a]]
        if not pairs:
            continue

        first_pos = [a for a, _ in pairs]
        second_pos = [b for _, b in pairs]
        # One vectorized subtraction per season instead of a DataFrame.append
        # per pair (quadratic, and removed in pandas 2.0).
        diff_blocks.append(vectors[first_pos] - vectors[second_pos])
        matchups.extend([year, team_ids[a], team_ids[b]] for a, b in pairs)

    if diff_blocks:
        testing_matchup_data = pd.DataFrame(np.vstack(diff_blocks), columns = col_names)
    else:
        testing_matchup_data = pd.DataFrame(columns = col_names)

    return testing_matchup_data, matchups

In [35]:
# Difference vectors for every team pairing in the prediction seasons, plus the [year, team_1, team_2] key of each row.
matchup_data, matchups = generate_matchup_testing_data(team_vector_store)

In [36]:
def generate_predictions_to_csv(model, X_train, Y_train, outfile, testing_data, matchups, classification = True, epochs = 5):
    """
    Train ``model`` on the full training set and write 0/1 predictions to CSV.

    Parameters
    ----------
    model : compiled keras model with a single output unit.
    X_train, Y_train : training inputs and targets passed to ``model.fit``.
    outfile : str
        Path of the CSV to write (columns 'game_ID' and 'Prediction').
    testing_data : inputs passed to ``model.predict``.
    matchups : list
        ``[year, team_1, team_2]`` for every row of ``testing_data``; used to
        build the 'year_team1_team2' game id.
    classification : bool
        True  -> the output is treated as a probability; prediction is 1 when
                 it exceeds 0.5 (what Sequential.predict_classes did for a
                 single sigmoid unit).
        False -> the output is treated as a score margin; prediction is 1
                 when it is greater than 0.
    epochs : int
        Training epochs (batch size stays 1).

    Raises
    ------
    ValueError
        If the number of predictions differs from ``len(matchups)``.
    """
    model.fit(X_train, Y_train, epochs = epochs, batch_size = 1)

    # predict() works for every keras model; predict_classes() only existed
    # on Sequential and has been removed from recent Keras releases.
    predictions = np.asarray(model.predict(testing_data))
    if len(predictions) != len(matchups):
        raise ValueError("Got {} predictions for {} matchups".format(len(predictions), len(matchups)))

    threshold = 0.5 if classification else 0

    result = []
    for matchup, output in zip(matchups, predictions):
        game_id = '_'.join([str(matchup[0]), str(matchup[1]), str(matchup[2])])
        pred = 1 if output[0] > threshold else 0
        result.append([game_id, str(pred)])

    result = pd.DataFrame(data = result, columns = ['game_ID', 'Prediction'])
    result.to_csv(outfile, index = False, sep = ',')

In [37]:
# Train the binary classifier on the complete training set (no hold-out).
X_train_class = train_matchup_data_class.drop(['class'], axis = 1)
input_shape = X_train_class.shape
Y_train_class = train_matchup_data_class['class']
binary_class_model = build_keras_model(input_shape, 
                                       num_units = 2*len(imp_attributes), 
                                       num_layers = 3, 
                                       classification = True)

# plot_model(binary_class_model, to_file = "binary_class_model.png")
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that added a stray "None" line to the output).
binary_class_model.summary()

generate_predictions_to_csv(binary_class_model, 
                            X_train_class, 
                            Y_train_class, 
                            "bin_predictions.csv",
                            matchup_data,
                            matchups,
                            classification = True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_131 (Dense)            (None, 26)                364       
_________________________________________________________________
dropout_31 (Dropout)         (None, 26)                0         
_________________________________________________________________
dense_132 (Dense)            (None, 26)                702       
_________________________________________________________________
dense_133 (Dense)            (None, 26)                702       
_________________________________________________________________
dense_134 (Dense)            (None, 1)                 27        
Total params: 1,795
Trainable params: 1,795
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [38]:
# Train the simple regression network on the complete training set (no hold-out).
X_train_reg = train_matchup_data_reg.drop(['class'], axis = 1)
input_shape = X_train_reg.shape
Y_train_reg = train_matchup_data_reg['class']

simple_regression_model = build_keras_model(input_shape, 
                                            num_units = 2*len(imp_attributes), 
                                            num_layers = 3, 
                                            classification = False)
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that added a stray "None" line to the output).
simple_regression_model.summary()

# Predicted margins are converted to 0/1 (margin > 0) before being written.
generate_predictions_to_csv(simple_regression_model, 
                            X_train_reg, 
                            Y_train_reg, 
                            "simple_reg_predictions.csv",
                            matchup_data,
                            matchups,
                            classification = False)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_135 (Dense)            (None, 26)                364       
_________________________________________________________________
dropout_32 (Dropout)         (None, 26)                0         
_________________________________________________________________
dense_136 (Dense)            (None, 26)                702       
_________________________________________________________________
dense_137 (Dense)            (None, 26)                702       
_________________________________________________________________
dense_138 (Dense)            (None, 1)                 27        
Total params: 1,795
Trainable params: 1,795
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [39]:
def generate_indiv_testing_data(team_vector_store, years = (2011, 2012, 2013)):
    """
    Build the paired team vectors for every pair of teams in the given seasons.

    Within each season the teams are sorted by id, and every pair (i1, i2)
    with i2 > i1 contributes one row to each output frame: the vector of i1
    to the first frame and the vector of i2 to the second.

    Parameters
    ----------
    team_vector_store : dict
        Maps season -> DataFrame of team vectors indexed by team id.
    years : iterable
        Seasons to generate pairs for; each must be a key of the store.

    Returns
    -------
    [team_ones, team_twos] : list of two pandas.DataFrame
        Row k of each frame belongs to pair k; both are indexed 0..n-1.
    matchups : list
        ``[year, i1, i2]`` for every row, in the same order.
    """
    # All season frames share the same columns; take them from the first one.
    col_names = list(next(iter(team_vector_store.values())))

    first_blocks, second_blocks = [], []
    matchups = []

    for year in years:
        season_data = team_vector_store[year].sort_index()
        team_ids = season_data.index.tolist()
        vectors = season_data[col_names].values.astype(float)

        # Positions (a, b) with b after a in the sorted order; the id check
        # keeps the original strict `i2 > i1` rule even for repeated ids.
        n_teams = len(team_ids)
        pairs = [(a, b)
                 for a in range(n_teams)
                 for b in range(a + 1, n_teams)
                 if team_ids[b] > team_ids[a]]
        if not pairs:
            continue

        # Fancy-index whole blocks at once instead of a DataFrame.append per
        # pair (quadratic, and removed in pandas 2.0).
        first_blocks.append(vectors[[a for a, _ in pairs]])
        second_blocks.append(vectors[[b for _, b in pairs]])
        matchups.extend([year, team_ids[a], team_ids[b]] for a, b in pairs)

    def stack(blocks):
        # Combine the per-season blocks into one frame with a fresh 0..n-1 index.
        if not blocks:
            return pd.DataFrame(columns = col_names)
        return pd.DataFrame(np.vstack(blocks), columns = col_names)

    return [stack(first_blocks), stack(second_blocks)], matchups

In [40]:
# Train the two-input regression network on the complete training set (no hold-out).
X_train = [new_team_ones, new_team_twos]
# Input width taken from the data itself. The previous len(imp_attributes)
# only matched because imp_attributes also counts 'team' while the team
# vectors carry an extra 'seed' column.
team_dim = new_team_ones.shape[1]
Y_train = one_minus_two

complex_regression_model = make_reg_model(team_dim)
# Note: this rebinds the notebook-level `matchups` to the pair list produced here.
indiv_testing_data, matchups = generate_indiv_testing_data(team_vector_store)

# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would add a stray "None" line to the output).
complex_regression_model.summary()


generate_predictions_to_csv(complex_regression_model,
                           X_train,
                           Y_train,
                           "complex_reg_predictions.csv",
                           indiv_testing_data,
                           matchups,
                           classification = False)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_21 (InputLayer)           (None, 13)           0                                            
__________________________________________________________________________________________________
input_22 (InputLayer)           (None, 13)           0                                            
__________________________________________________________________________________________________
dense_139 (Dense)               (None, 13)           182         input_21[0][0]                   
                                                                 input_22[0][0]                   
__________________________________________________________________________________________________
concatenate_11 (Concatenate)    (None, 26)           0           dense_139[0][0]                  
          

In [41]:
# Scratch check: list the seasons available, then rebuild the single-matchup example.
print(reg_season_years)
year = 2017
m = team_vector_store[year]
# display(m.loc[1274])
# display(m.loc[1199])
# NOTE(review): by this point team_vector_store has been seed-joined and
# normalized by earlier cells, so this vector differs from the first example.
matchup_1 = (m.loc[1274] - m.loc[1199])
matchup_1['class'] = 1
# display(matchup_1)

[2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016
 2017]
