In [35]:
# library for working with dataframes (matrics)
import pandas as pd

In [36]:
ls datasets/kaggle/stage_2

[31mCities.csv[m[m*                         [31mRegularSeasonCompactResults.csv[m[m*
[31mConferenceTourneyGames.csv[m[m*         [31mRegularSeasonDetailedResults.csv[m[m*
[31mConferences.csv[m[m*                    [31mSeasons.csv[m[m*
[31mGameCities.csv[m[m*                     [31mSecondaryTourneyCompactResults.csv[m[m*
[31mNCAATourneyCompactResults.csv[m[m*      [31mSecondaryTourneyTeams.csv[m[m*
[31mNCAATourneyDetailedResults.csv[m[m*     [31mTeamCoaches.csv[m[m*
[31mNCAATourneySeedRoundSlots.csv[m[m*      [31mTeamConferences.csv[m[m*
[31mNCAATourneySeeds.csv[m[m*               [31mTeamSpellings.csv[m[m*
[31mNCAATourneySlots.csv[m[m*               [31mTeams.csv[m[m*


In [37]:
# Define file paths
FILE_PATH = 'datasets/kaggle/stage_2/'
NCAATourneyCompactResults_path = FILE_PATH + 'NCAATourneyCompactResults.csv'
NCAATourneySeeds_path = FILE_PATH + 'NCAATourneySeeds.csv'
RegularSeasonCompactResults_path = FILE_PATH + 'RegularSeasonCompactResults.csv'

In [38]:
# Create dataframes from files
NCAATourneyCompactResults = pd.read_csv(NCAATourneyCompactResults_path)
NCAATourneySeeds = pd.read_csv(NCAATourneySeeds_path)
RegularSeasonCompactResults = pd.read_csv(RegularSeasonCompactResults_path)

In [39]:
# Look at dataframes
NCAATourneyCompactResults.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [40]:
NCAATourneySeeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [41]:
RegularSeasonCompactResults.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


## Create a 'seed' DataFrame

In [42]:
# Define a function that combines the values of two DataFrame columns,
# combining them into a single column with the values separated by an underscore
def get_id(p_row, p_column_names):
    column_values = [p_row[col] for col in p_column_names]
    return '_'.join(map(str, column_values))


# Make a copy of the NCAATourneySeeds DataFrame
seed = NCAATourneySeeds.copy()
# Add an 'id' column to the 'seed' DataFrame,
# using the get_id function to combine 'Season' and 'TeamID'
seed['id'] = seed.apply(lambda row: get_id(row, ['Season', 'TeamID']), axis=1)
# Extract the seed number from the 'Seed' value
seed['seed'] = seed['Seed'].apply(lambda x: int(x[1:3]))
# Extract the region from the 'Seed' value
seed['region'] = seed['Seed'].apply(lambda x: x[0])
# Rename the 'Season' column to 'season' and 'TeamID' to 'team'
seed = seed.rename(columns={'Season': 'season', 'TeamID': 'team'})
# Redefine the 'seed' DataFrame with the columns in this order
seed = seed[['id', 'season', 'team', 'region', 'seed']]
# Sort the 'seed' DataFrame by the 'id' column we created
seed = seed.sort_values(by=['id'])
seed.head()

Unnamed: 0,id,season,team,region,seed
22,1985_1104,1985,1104,X,7
25,1985_1112,1985,1112,X,10
24,1985_1116,1985,1116,X,9
58,1985_1120,1985,1120,Z,11
42,1985_1130,1985,1130,Y,11


In [43]:
# Look at NCAATourneyCompactResults
NCAATourneyCompactResults.head()
# Note 

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


## Create a 'final_four' DataFrame

In [44]:
# Create a 'tourney' DataFrame based on NCAATourneyCompactResults
tourney = NCAATourneyCompactResults.copy()
# Create a 'winners' DataFrame
winners = tourney[['Season', 'DayNum', 'WTeamID', 'WScore', 'NumOT']].copy()
# Rename the columns
winners = winners.rename(columns={'WTeamID': 'TeamID', \
                                  'WScore': 'Score'
                                 })
# Create a 'losers' DataFrame
losers = tourney[['Season', 'DayNum', 'LTeamID', 'LScore', 'NumOT']].copy()
# Rename the columns
losers = losers.rename(columns={'LTeamID': 'TeamID', \
                                'LScore': 'Score'
                               })
# Combine the 'winners' and 'losers' DataFrames
results = pd.concat([winners, losers])
# Create a 'final_four' DataFrame for only the teams that played on day 152
final_four = results.loc[results['DayNum'] == 152].copy()
# Create an 'id' column
final_four['id'] = final_four.apply(lambda row: get_id(row, ['Season', 'TeamID']), axis=1)
# Sort the DataFrame by the 'id' column
final_four = final_four[['id']].sort_values(by=['id'])
# Create a 'final_four' column, set it to 1
final_four['final_four'] = str(1) # string
# final_four['final_four'] = 1 # numeric
# Show the first 5 rows
final_four.head()

Unnamed: 0,id,final_four
60,1985_1207,1
61,1985_1272,1
60,1985_1385,1
61,1985_1437,1
123,1986_1181,1


## Combine 'seed' and 'final_four' DataFrames

In [45]:
# Merge the 'seed' and 'final_four' DataFrames.
# Left: Keep all the rows from the 'seed' DataFrame,
# even when there's no match in 'final_four'
dataset = pd.merge(seed, final_four, how='left', on='id')
dataset.head()

Unnamed: 0,id,season,team,region,seed,final_four
0,1985_1104,1985,1104,X,7,
1,1985_1112,1985,1112,X,10,
2,1985_1116,1985,1116,X,9,
3,1985_1120,1985,1120,Z,11,
4,1985_1130,1985,1130,Y,11,


In [46]:
# Set all the 'final_four' column values that aren't 1, to 0
dataset['final_four'] = dataset['final_four'].fillna(value=str(0)) # string
# dataset['final_four'] = dataset['final_four'].fillna(value=0) # numeric
# Show the dataset the way we want it
dataset

Unnamed: 0,id,season,team,region,seed,final_four
0,1985_1104,1985,1104,X,7,0
1,1985_1112,1985,1112,X,10,0
2,1985_1116,1985,1116,X,9,0
3,1985_1120,1985,1120,Z,11,0
4,1985_1130,1985,1130,Y,11,0
5,1985_1173,1985,1173,Z,9,0
6,1985_1177,1985,1177,W,10,0
7,1985_1181,1985,1181,Y,3,0
8,1985_1192,1985,1192,Z,16,0
9,1985_1207,1985,1207,W,1,1


## Logistic regression

In [47]:
# Create a historical DataFrame (exclude 2019)
dataset_historical = dataset.loc[dataset['season'] != 2019]
dataset_historical.head()

Unnamed: 0,id,season,team,region,seed,final_four
0,1985_1104,1985,1104,X,7,0
1,1985_1112,1985,1112,X,10,0
2,1985_1116,1985,1116,X,9,0
3,1985_1120,1985,1120,Z,11,0
4,1985_1130,1985,1130,Y,11,0


In [48]:
# Createa 2019 DataFrame
dataset_2019 = dataset.loc[dataset['season'] == 2019]
dataset_2019.head()

Unnamed: 0,id,season,team,region,seed,final_four
2218,2019_1101,2019,1101,Y,15,0
2219,2019_1113,2019,1113,X,11,0
2220,2019_1120,2019,1120,Y,5,0
2221,2019_1124,2019,1124,X,9,0
2222,2019_1125,2019,1125,W,11,0


In [49]:
# We'll use 'X' for our predictors and 'y' for our target
# We just want the values, and we want it as an array instead of a DataFrame
X = dataset_historical.iloc[:, 4:5].values
X

array([[ 7],
       [10],
       [ 9],
       ...,
       [ 4],
       [14],
       [ 1]])

In [50]:
y = dataset_historical.iloc[:, 5].values
y

array(['0', '0', '0', ..., '0', '0', '0'], dtype=object)

In [51]:
# Split X and y into training and testing datasets
from sklearn.model_selection import train_test_split
X_train, X_test, \
y_train, y_test = \
train_test_split(X, \
                 y, \
                 test_size=0.3, \
                 random_state=0 \
                )

In [52]:
X_train

array([[ 5],
       [ 2],
       [12],
       ...,
       [ 3],
       [ 8],
       [15]])

In [53]:
y_train

array(['0', '0', '0', ..., '0', '0', '0'], dtype=object)

In [54]:
# We want the predictors to be on the same scale
# (mean of 0, standard deviation of 1)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# Don't worry about the warning
X_train



array([[-0.76140158],
       [-1.40444271],
       [ 0.73902773],
       ...,
       [-1.19009567],
       [-0.11836045],
       [ 1.38206886]])

In [55]:
# Fit a LogisticRegression object to training set
from sklearn.linear_model import LogisticRegression
classifierObj = LogisticRegression(random_state=0)
classifierObj.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [56]:
y_train

array(['0', '0', '0', ..., '0', '0', '0'], dtype=object)

In [57]:
# Making predictions on the test set
y_pred = classifierObj.predict(X_test)

In [58]:
# Evaluating the predictions using a confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[626   0]
 [ 40   0]]


In [59]:
# So, we predicted 0 out of 43 correctly.
# This is to be expected, because the model, by default,
# will only predict a class if it has a greater than
# 50% probability. None of these teams do.

In [60]:
# Predict the probability of [0, 1]
prob = classifierObj.predict_proba(X_test)
prob

array([[0.98396647, 0.01603353],
       [0.9644351 , 0.0355649 ],
       [0.98928431, 0.01071569],
       ...,
       [0.99859609, 0.00140391],
       [0.88845877, 0.11154123],
       [0.9928512 , 0.0071488 ]])

In [63]:
length = dataset_historical.shape[0]
length

2218

In [71]:
train_index_stop = int(length * 0.7)
train_index_stop

1552

In [64]:
import numpy as np

In [72]:
permutation = np.random.RandomState(0).permutation(length)
permutation

array([1444,   98, 1598, ...,  763,  835, 1653])

In [75]:
train_permutation = permutation[:train_index_stop]
train_permutation

array([1444,   98, 1598, ..., 1520,   46,  747])

In [76]:
test_permutation = permutation[train_index_stop:]
test_permutation

array([1746,  753,  462,  673, 1682, 1463, 1201, 2203,  424,  325,  664,
        350,  447, 1625, 1041, 1221, 1712, 1149,  387, 1369, 1743, 1337,
        754,  138,  451, 1194,  699, 1157, 2043,  830, 1102,  950, 1019,
       2110, 1370,  396, 1413, 1409, 1143, 1897, 1177, 1843, 1253, 2044,
         21, 1305,  766,  392,  250, 1267, 1796, 1059,  671, 2087,  845,
       1829, 1646, 1353, 1133, 1403,  869,  604, 2217, 1933,  627,  902,
       1606,  856, 1195, 1262,  700, 1768, 1889,  709,  448,  321, 1932,
        696, 1179, 2051,  624, 2065,  125, 1011, 1480, 1090,  848,  780,
        605,  174, 1883, 1202, 1996,  415,  100, 1270, 2006,  741,  334,
       1974, 2003,  834,  833, 1244, 1415,  594,  939, 1333, 2078,  169,
       1551, 1688, 1269, 1542, 2068, 1739, 2200,  640,  104,  815, 1789,
       1597,  998, 1075, 1626, 1378,  719,  460, 1046, 1398, 1562, 1242,
        585, 2151, 2064, 1389, 1979, 1229, 1522,  497, 1540,  702,  375,
        975, 1209, 1880,  404,  419, 1425, 1734, 19

In [77]:
import numpy as np
length = dataset_historical.shape[0]
permutation = np.random.RandomState(0).permutation(length)
train_permutation = permutation[:train_index_stop]
test_permutation = permutation[train_index_stop:]


In [79]:
dataset_historical.loc[train_permutation]

Unnamed: 0,id,season,team,region,seed,final_four
1444,2007_1277,2007,1277,W,9,0
98,1986_1298,1986,1298,W,7,0
1598,2009_1425,2009,1425,Y,10,0
768,1997_1112,1997,1112,X,4,1
39,1985_1326,1985,1326,Y,4,0
570,1993_1435,1993,1435,Z,3,0
2215,2018_1455,2018,1455,W,4,0
1287,2005_1130,2005,1130,X,4,0
1051,2001_1232,2001,1232,Z,13,0
942,1999_1373,1999,1373,Z,13,0
