In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LassoCV, ElasticNetCV, Lasso, LinearRegression
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Collect every per-season CSV in the working directory; the 2016 file is
# excluded here because it is loaded separately for the final forecast.
# os.listdir() returns names in arbitrary order, but later cells pair these
# frames positionally with range(2001, 2016), so the list is sorted to make
# that pairing deterministic.
# NOTE(review): assumes file names start with the season year so that lexical
# order is chronological order -- confirm against the data directory.
data_files = sorted(file for file in os.listdir() if file.endswith('_data.csv') and '2016' not in file)
data_frames = [pd.read_csv(data_file) for data_file in data_files]
# Replace the Seed column of every season with one indicator column per seed value.
final_data_frames = [pd.concat([data_frame.drop(["Seed"], axis = 1), pd.get_dummies(data_frame.Seed)], axis = 1)
                     for data_frame in data_frames]

In [3]:
# Hold out a single season for evaluation and train on all remaining seasons.
# Frames in final_data_frames are matched to seasons purely by position.
years = list(range(2001, 2016))
test_year = 2004
all_training_data = None
training_data_list = [
    season_frame
    for season, season_frame in zip(years, final_data_frames)
    if season != test_year
]
test_df = final_data_frames[years.index(test_year)]
training_df = pd.concat(training_data_list)
# 'Wins' is the regression target and 'Name' is only an identifier, so both
# are removed from the feature matrices.
train_X = training_df.drop(['Wins', 'Name'], axis = 1)
train_y = training_df['Wins']
test_X = test_df.drop(['Wins', 'Name'], axis = 1)
test_y = test_df['Wins']

In [4]:
num_folds = 5
def score_by_cross_validation(model):
    """Cross-validate `model` on the notebook-level train_X / train_y.

    For each of the `num_folds` folds the model is refit on the training part,
    then the training score and the held-out score (as returned by
    `model.score`) are printed, in that order. The mean held-out score is
    printed last.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `score(X, y)`.
        It is refit in place on every fold.

    Returns
    -------
    The mean of the held-out scores over the folds.
    """
    folds = KFold(len(train_X), n_folds = num_folds)
    held_out_scores = []
    for train, test in folds:
        current_fold_train_X = train_X.iloc[list(train), :]
        current_fold_train_y = train_y.iloc[list(train)]
        current_fold_test_X = train_X.iloc[list(test), :]
        current_fold_test_y = train_y.iloc[list(test)]
        model.fit(current_fold_train_X, current_fold_train_y)
        print(model.score(current_fold_train_X, current_fold_train_y))
        # Score the held-out fold once and reuse the value for both the
        # printout and the running average.
        held_out_score = model.score(current_fold_test_X, current_fold_test_y)
        print(held_out_score)
        held_out_scores.append(held_out_score)
    model_score = sum(held_out_scores) / num_folds
    print(model_score)
    return model_score

In [5]:
# Baseline: cross-validate an unregularized LinearRegression. In the recorded
# output the third fold's held-out score is about -2046, which drags the mean
# down to about -408.86.
score_by_cross_validation(LinearRegression())

0.605140766246
0.484446489597
0.610773169509
0.481465636901
0.584007675149
-2046.36100442
0.597706773993
0.523245506102
0.596333458036
0.549253964125
-408.864518565


-408.86451856528964

In [7]:
# Same cross-validation with LassoCV (no explicit alpha grid, iteration cap
# raised to 10000). The recorded mean held-out score is about 0.463.
score_by_cross_validation(LassoCV(max_iter = 10000))

  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':


0.543515859115
0.431871041631
0.528128770896
0.451567899385
0.524394195747
0.44402947792
0.522257482267

  if precompute == 'auto':
  if precompute == 'auto':



0.486682157513
0.504405339958
0.500399524823
0.462910020255


0.46291002025456329

In [21]:
# LassoCV restricted to 30 log-spaced alphas between 10**-4 and 10**-0.5, with
# a larger iteration cap. In the recorded output one fold scores about -6.34,
# so the mean held-out score is about -0.867.
score_by_cross_validation(LassoCV(max_iter = 100000, alphas = np.logspace(-4, -0.5, 30)))

  if precompute == 'auto':


0.589112389655
0.452767805046
0.593078318658

  if precompute == 'auto':
  if precompute == 'auto':



0.478570527752
0.575717272738

  if precompute == 'auto':



-6.34376568834
0.585846342001

  if precompute == 'auto':



0.532816207087
0.585223084505
0.543915449768
-0.867139139736


-0.86713913973649481

In [13]:
def _simulate_bracket(first_round, scores):
    """Advance the higher-scored team of each adjacent pair until one team is left.

    `first_round` is the list of team names in bracket order and `scores` maps
    a team name to its predicted value. Returns the list of rounds, starting
    with `first_round` itself. On a tie the second team of the pair advances.
    """
    rounds = [first_round]
    # `> 1` (rather than `!= 1`) so an empty field terminates instead of looping forever.
    while len(rounds[-1]) > 1:
        current = rounds[-1]
        winners = []
        for i in range(0, len(current), 2):
            first, second = current[i], current[i + 1]
            winners.append(first if scores[first] > scores[second] else second)
        rounds.append(winners)
    return rounds


def check_results(year, model, verbose = 0):
    """Fit `model` on every season except `year` and count its bracket mistakes.

    The model is trained on the notebook-level `final_data_frames` (matched to
    seasons 2001-2015 by position) with `year` held out, then used to score the
    held-out teams. The bracket is played forward from the first line of
    "tournament-results/<year>_real_results.txt" (team names separated by
    ", "), always advancing the team with the larger prediction. Each following
    line of that file is compared position by position with the predicted
    teams of the corresponding round.

    Parameters
    ----------
    year : int
        Season to hold out; must be in range(2001, 2016).
    model : estimator exposing `fit`, `predict` and `score`.
        It is refit in place.
    verbose : int, default 0
        > 0 prints the training score, every wrong pick, per-round mistake
        counts and the total; > 1 additionally prints the prediction, name and
        actual wins of every held-out team.

    Returns
    -------
    int
        Total number of positions where the predicted team differs from the
        actual one, summed over all rounds.
    """
    years = list(range(2001, 2016))
    test_year = year
    training_data_list = [data for season, data in zip(years, final_data_frames) if season != test_year]
    test_df = final_data_frames[years.index(test_year)]
    training_df = pd.concat(training_data_list)
    train_X = training_df.drop(['Wins', 'Name'], axis = 1)
    train_y = training_df.Wins
    test_X = test_df.drop(['Wins', 'Name'], axis = 1)
    model.fit(train_X, train_y)
    if verbose > 0:
        print(model.score(train_X, train_y))
    # Predict once and reuse the result; the model is not refit after this point.
    predicted_wins = model.predict(test_X)
    mapping = dict(zip(test_df.Name, predicted_wins))
    actual_results_name = "tournament-results/%d_real_results.txt" % test_year
    # The context manager closes the file even if reading raises.
    with open(actual_results_name) as actual_results_file:
        lines = actual_results_file.readlines()
    rounds = _simulate_bracket(lines[0].strip().split(", "), mapping)
    mistakes = 0
    for i, predictions in enumerate(rounds[1:]):
        round_mistakes = 0
        actuals = lines[i + 1].strip().split(", ")
        for prediction, actual in zip(predictions, actuals):
            if prediction != actual:
                if verbose > 0:
                    print("Predicted: %s; Actual: %s" % (prediction, actual))
                round_mistakes += 1
        if verbose > 0:
            # Label is 64, 32, 16, ... for successive rounds.
            print("%d: %d" % (2 ** (6 - i), round_mistakes))
        mistakes += round_mistakes
    if verbose > 0:
        print(mistakes)
    if verbose > 1:
        print(np.column_stack([predicted_wins,
                               test_df.Name, test_df.Wins]))
    return mistakes

In [10]:
# Spot-check the 2010 bracket with LassoCV; verbose = 1 prints the training
# score, each wrong pick per round and the total (15 in the recorded output).
check_results(2010, LassoCV(max_iter = 10000), verbose = 1)

2010
0.482104032306
Predicted: Utah State; Actual: Texas A&M
Predicted: Texas; Actual: Wake Forest
Predicted: UNLV; Actual: Northern Iowa
64: 3
Predicted: Utah State; Actual: Purdue
Predicted: Kansas; Actual: Northern Iowa
Predicted: Murray State; Actual: Butler
Predicted: BYU; Actual: Kansas State
32: 4
Predicted: Kansas; Actual: Michigan State
Predicted: Ohio State; Actual: Tennessee
Predicted: Syracuse; Actual: Butler
Predicted: BYU; Actual: Kansas State
16: 4
Predicted: Kentucky; Actual: West Virginia
Predicted: Kansas; Actual: Michigan State
Predicted: BYU; Actual: Butler
8: 3
Predicted: Kansas; Actual: Butler
4: 1
2: 0
15


  if precompute == 'auto':


15

In [11]:
# Same 2010 spot-check with plain LinearRegression for comparison
# (18 mistakes in the recorded output).
check_results(2010, LinearRegression(), verbose = 1)

2010
0.590856334988
Predicted: Utah State; Actual: Texas A&M
Predicted: Richmond; Actual: Saint Mary's (CA)
Predicted: Texas; Actual: Wake Forest
Predicted: Temple; Actual: Cornell
Predicted: Georgetown; Actual: Ohio
Predicted: Vanderbilt; Actual: Murray State
64: 6
Predicted: Richmond; Actual: Saint Mary's (CA)
Predicted: Temple; Actual: Cornell
Predicted: New Mexico; Actual: Washington
Predicted: Kansas; Actual: Northern Iowa
Predicted: BYU; Actual: Kansas State
32: 5
Predicted: Kansas; Actual: Michigan State
Predicted: Syracuse; Actual: Butler
Predicted: BYU; Actual: Kansas State
16: 3
Predicted: Kentucky; Actual: West Virginia
Predicted: Kansas; Actual: Michigan State
Predicted: Syracuse; Actual: Butler
8: 3
Predicted: Kansas; Actual: Butler
4: 1
2: 0
18


18

In [22]:
# Leave-one-season-out evaluation: for each model configuration, hold out
# every season from 2001 through 2015 in turn and record the bracket mistakes.
# A fresh estimator is built for every season.
over_the_years_LR = list(map(
    lambda season: check_results(season, LinearRegression()),
    range(2001, 2016)))
over_the_years_Lasso = list(map(
    lambda season: check_results(season, LassoCV(max_iter = 10000)),
    range(2001, 2016)))
over_the_years_Lasso_plus = list(map(
    lambda season: check_results(season, LassoCV(max_iter = 100000, alphas = np.logspace(-4, -0.5, 30))),
    range(2001, 2016)))
# One list of per-season mistake counts per line.
print(over_the_years_LR, over_the_years_Lasso, over_the_years_Lasso_plus, sep = "\n")

  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':
  if precompute == 'auto':


[11, 18, 9, 14, 17, 19, 12, 12, 11, 18, 16, 13, 13, 18, 12]
[9, 15, 12, 10, 17, 19, 12, 14, 12, 15, 19, 10, 10, 14, 12]
[8, 15, 12, 10, 13, 17, 10, 12, 10, 15, 19, 12, 10, 12, 11]




In [23]:
# Mean and standard deviation of the per-season mistake counts, one line per
# model configuration (LinearRegression, LassoCV, LassoCV with custom alphas).
print("\n".join(
    "%s %s" % (np.mean(mistake_counts), np.std(mistake_counts))
    for mistake_counts in (over_the_years_LR, over_the_years_Lasso, over_the_years_Lasso_plus)
))

14.2 3.08112533554
13.3333333333 3.09120616517
12.4 2.87054001888


In [24]:
# Final forecast: train on every season in final_data_frames and predict 2016.
# This cell rebinds the notebook-level train_X / train_y / test_X / test_y /
# test_df, so cells that read those names see the 2016 setup from here on.
years = list(range(2001, 2016))
training_data_list = [data for _, data in zip(years, final_data_frames)]
data_frame = pd.read_csv("2016_data.csv")
extra = pd.get_dummies(data_frame.Seed)
test_df = pd.concat([data_frame.drop(["Seed"], axis = 1), extra], axis = 1)
training_df = pd.concat(training_data_list)
win_model = LassoCV(max_iter = 10000)
train_X = training_df.drop(['Wins', 'Name'], axis = 1)
train_y = training_df.Wins
win_model.fit(train_X, train_y)
print(win_model.score(train_X, train_y))
# The 2016 seed indicators are generated independently of the training
# seasons, so force the exact training column set and order before predicting.
# An indicator column absent from the 2016 file is filled with 0.
test_X = test_df.drop(['Wins', 'Name'], axis = 1).reindex(columns = train_X.columns, fill_value = 0)
test_y = test_df.Wins
print(np.column_stack([win_model.predict(test_X),
                       test_df.Name, test_df.Wins]))

0.503661622651
[[1.308452888797424 'Arizona' 0]
 [0.5814627819292673 'Arkansas-Little Rock' 0]
 [0.10607442961509772 'Austin Peay' 0]
 [1.084242393075904 'Baylor' 0]
 [0.4733439823165062 'Buffalo' 0]
 [0.5913843945804986 'Butler' 0]
 [0.23932352935100765 'Cal State Bakersfield' 0]
 [0.9104190440714488 'University of California' 0]
 [0.73105308532236 'Chattanooga' 0]
 [0.8542408587270565 'Cincinnati' 0]
 [0.9397845411417904 'Colorado' 0]
 [1.2479056472214323 'UConn' 0]
 [0.588326662478293 'Dayton' 0]
 [1.162870949502107 'Duke' 0]
 [0.5349273828619552 'Florida Gulf Coast' 0]
 [0.7406891147601762 'Fresno State' 0]
 [1.2609091059147888 'Gonzaga' 0]
 [0.9092312782810659 'Green Bay' 0]
 [-0.5600232163253263 'Hampton' 0]
 [0.5230516137695069 'Hawaii' 0]
 [-0.9889702398589817 'Holy Cross' 0]
 [1.2302354983953512 'Indiana' 0]
 [0.3646497752446205 'Iona' 0]
 [1.0558911215931515 'Iowa State' 0]
 [0.7749946762116435 'Iowa' 0]
 [1.8634335361583014 'Kansas' 0]
 [1.541033732212525 'Kentucky' 0]
 [1.1

  if precompute == 'auto':


In [25]:
# Refit on the same training data with an explicit grid of 30 log-spaced
# alphas, then print the training score and the per-team 2016 predictions.
# NOTE(review): the cross-validation and per-season cells pair this alpha grid
# with max_iter = 100000, while this cell uses 10000 -- confirm that is intended.
win_model_plus = LassoCV(alphas = np.logspace(-4, -0.5, 30), max_iter = 10000).fit(train_X, train_y)
print(win_model_plus.score(train_X, train_y))
print(np.column_stack((
    win_model_plus.predict(test_X),
    test_df['Name'],
    test_df['Wins'],
)))

  if precompute == 'auto':


0.512526809052
[[1.2659766094962226 'Arizona' 0]
 [0.614731686494765 'Arkansas-Little Rock' 0]
 [0.04182261931805131 'Austin Peay' 0]
 [1.066256887106836 'Baylor' 0]
 [0.41969493901481325 'Buffalo' 0]
 [0.5341659124236422 'Butler' 0]
 [0.19456493232198468 'Cal State Bakersfield' 0]
 [0.8747514385318231 'University of California' 0]
 [0.7462583187324014 'Chattanooga' 0]
 [0.8091836375203503 'Cincinnati' 0]
 [0.9094797734191555 'Colorado' 0]
 [1.238656313778323 'UConn' 0]
 [0.5853152468999294 'Dayton' 0]
 [1.1199053021453027 'Duke' 0]
 [0.4626165873079904 'Florida Gulf Coast' 0]
 [0.7392768101419982 'Fresno State' 0]
 [1.233121342517478 'Gonzaga' 0]
 [0.8355941533611997 'Green Bay' 0]
 [-0.6396384006241727 'Hampton' 0]
 [0.49528309621272015 'Hawaii' 0]
 [-1.0502392112242243 'Holy Cross' 0]
 [1.1877024503231457 'Indiana' 0]
 [0.29223567212546 'Iona' 0]
 [1.0034549654170872 'Iowa State' 0]
 [0.717897997149322 'Iowa' 0]
 [1.883355681890058 'Kansas' 0]
 [1.511158791626345 'Kentucky' 0]
 [1.1

In [None]:
#Won't converge, so don't run!

# Elastic-net search over a grid of L1/L2 mixing ratios, with an automatically
# generated alpha path per ratio.
# NOTE(review): the ratio grid starts at 0 (pure L2); check whether
# ElasticNetCV accepts l1_ratio = 0 when no explicit `alphas` is supplied.
step_size = 0.1
possible_l1_ratios = np.arange(0, 1 + step_size, step = step_size)
length_of_path = 10 ** -7
num_possible_alphas = 10
number_folds = 10
selection_strategy = 'cyclic'
num_possible_iterations = 500000
elastic_model = ElasticNetCV(l1_ratio = possible_l1_ratios,
                             eps = length_of_path,
                             n_alphas = num_possible_alphas,
                             cv = number_folds,
                             selection = selection_strategy,
                             max_iter = num_possible_iterations)
elastic_model.fit(train_X, train_y)
print(elastic_model.score(train_X, train_y))
# Team name -> predicted value; predict once instead of once per team.
mapping = dict(zip(test_df.Name, elastic_model.predict(test_X)))

In [None]:
# Disabled grid-search experiment, kept for reference:
#svr = GridSearchCV(SVR(kernel = 'poly'), cv = 5,\
#                   param_grid = {"C" : np.logspace(0, 4, 5),\
#                                 "gamma" : np.logspace(-2, 2, 5)})
#                   param_grid = {"degree" : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]})
# RBF support-vector regression with fixed hyper-parameters.
svr = SVR(kernel = "rbf", C = 1, gamma = 0.1)
svr.fit(train_X, train_y)
# NOTE(review): if this runs after the 2016 cell, test_y is the 2016 'Wins'
# column (all zeros in the recorded output), so the second score would not be
# a meaningful hold-out score -- confirm which split is intended here.
print(svr.score(train_X, train_y), svr.score(test_X, test_y))
# Team name -> predicted value; predict once instead of once per team.
mapping = dict(zip(test_df.Name, svr.predict(test_X)))