In [78]:
# library for working with dataframes (matrices)
import pandas as pd

In [79]:
# List the files in the Kaggle stage 2 data directory (IPython `ls` automagic)
ls datasets/kaggle/stage_2

[31mCities.csv[m[m*                         [31mRegularSeasonCompactResults.csv[m[m*
[31mConferenceTourneyGames.csv[m[m*         [31mRegularSeasonDetailedResults.csv[m[m*
[31mConferences.csv[m[m*                    [31mSeasons.csv[m[m*
[31mGameCities.csv[m[m*                     [31mSecondaryTourneyCompactResults.csv[m[m*
[31mNCAATourneyCompactResults.csv[m[m*      [31mSecondaryTourneyTeams.csv[m[m*
[31mNCAATourneyDetailedResults.csv[m[m*     [31mTeamCoaches.csv[m[m*
[31mNCAATourneySeedRoundSlots.csv[m[m*      [31mTeamConferences.csv[m[m*
[31mNCAATourneySeeds.csv[m[m*               [31mTeamSpellings.csv[m[m*
[31mNCAATourneySlots.csv[m[m*               [31mTeams.csv[m[m*


In [80]:
# Locations of the CSV files used in this notebook.
# FILE_PATH ends with a slash, so each file name is appended to it directly.
FILE_PATH = 'datasets/kaggle/stage_2/'
NCAATourneyCompactResults_path = ''.join([FILE_PATH, 'NCAATourneyCompactResults.csv'])
NCAATourneySeeds_path = ''.join([FILE_PATH, 'NCAATourneySeeds.csv'])
RegularSeasonCompactResults_path = ''.join([FILE_PATH, 'RegularSeasonCompactResults.csv'])

In [81]:
# Load each CSV file into its own pandas DataFrame
NCAATourneyCompactResults = pd.read_csv(filepath_or_buffer=NCAATourneyCompactResults_path)
NCAATourneySeeds = pd.read_csv(filepath_or_buffer=NCAATourneySeeds_path)
RegularSeasonCompactResults = pd.read_csv(filepath_or_buffer=RegularSeasonCompactResults_path)

In [82]:
# Look at dataframes: preview the first rows of the NCAA tournament results
NCAATourneyCompactResults.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [83]:
# Preview the first rows of the tournament seeds
NCAATourneySeeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [84]:
# Preview the first rows of the regular season results
RegularSeasonCompactResults.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


## Create a 'seed' DataFrame

In [85]:
# Helper that builds an identifier from the values of one or more columns
# of a single row, e.g. Season 1985 and TeamID 1104 become '1985_1104'
def get_id(p_row, p_column_names):
    """Join the values of the given columns of one row with underscores.

    Parameters
    ----------
    p_row : mapping-like (e.g. a DataFrame row)
        Object that supports ``p_row[column_name]`` lookups.
    p_column_names : iterable
        Names of the columns to read, in the order they should appear
        in the identifier.

    Returns
    -------
    str
        The ``str()`` of each looked-up value, separated by '_'.
    """
    return '_'.join(str(p_row[column_name]) for column_name in p_column_names)


# Build the 'seed' DataFrame from the tournament seeds.
# Work on a copy so NCAATourneySeeds itself is left untouched.
seed = NCAATourneySeeds.copy()
# Add an 'id' column of the form '<Season>_<TeamID>' (e.g. '1985_1104'),
# using the get_id function to combine 'Season' and 'TeamID'
seed['id'] = seed.apply(lambda row: get_id(row, ['Season', 'TeamID']), axis=1)
# Characters 2-3 of the 'Seed' value are the seed number (e.g. 'W01' -> 1).
# The vectorized .str accessor is used instead of a per-element Python lambda;
# int64 matches the dtype the original int() conversion produced.
seed['seed'] = seed['Seed'].str[1:3].astype('int64')
# The first character of the 'Seed' value is the region (e.g. 'W01' -> 'W')
seed['region'] = seed['Seed'].str[0]
# Rename the 'Season' column to 'season' and 'TeamID' to 'team'
seed = seed.rename(columns={'Season': 'season', 'TeamID': 'team'})
# Keep only these columns, in this order
seed = seed[['id', 'season', 'team', 'region', 'seed']]
# Sort by the 'id' column we created ('id' is a string, so the order is lexicographic)
seed = seed.sort_values(by=['id'])
seed.head()

Unnamed: 0,id,season,team,region,seed
22,1985_1104,1985,1104,X,7
25,1985_1112,1985,1112,X,10
24,1985_1116,1985,1116,X,9
58,1985_1120,1985,1120,Z,11
42,1985_1130,1985,1130,Y,11


In [86]:
# Look at NCAATourneyCompactResults
NCAATourneyCompactResults.head()
# Note: each row has separate columns for the winning (W*) and the losing (L*) team

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


## Create a 'final_four' DataFrame

In [87]:
# Work on a copy of the tournament results
tourney = NCAATourneyCompactResults.copy()

# Winner's view of every game, with team-neutral column names
winners = (
    tourney[['Season', 'DayNum', 'WTeamID', 'WScore', 'NumOT']]
    .copy()
    .rename(columns={'WTeamID': 'TeamID', 'WScore': 'Score'})
)
# Loser's view of every game, with the same column names
losers = (
    tourney[['Season', 'DayNum', 'LTeamID', 'LScore', 'NumOT']]
    .copy()
    .rename(columns={'LTeamID': 'TeamID', 'LScore': 'Score'})
)
# Stack both views, so each team that played in a game gets its own row
results = pd.concat([winners, losers])

# Keep only the teams that played on day 152
final_four = results[results['DayNum'] == 152].copy()
# Identify each of those teams by '<Season>_<TeamID>'
final_four['id'] = final_four.apply(
    lambda game: get_id(game, ['Season', 'TeamID']),
    axis=1,
)
# Reduce to the 'id' column and sort by it
final_four = final_four[['id']].sort_values(by='id')
# Flag every remaining row. The flag is stored as the string '1';
# assign the integer 1 instead if a numeric flag is wanted.
final_four['final_four'] = '1'
# Show the first 5 rows
final_four.head()

Unnamed: 0,id,final_four
60,1985_1207,1
61,1985_1272,1
60,1985_1385,1
61,1985_1437,1
123,1986_1181,1


## Combine 'seed' and 'final_four' DataFrames

In [88]:
# Left-join 'final_four' onto 'seed' using the shared 'id' column.
# Every row of 'seed' is kept; rows without a match in 'final_four'
# get a missing value in the 'final_four' column.
dataset = seed.merge(final_four, how='left', on='id')
dataset.head()

Unnamed: 0,id,season,team,region,seed,final_four
0,1985_1104,1985,1104,X,7,
1,1985_1112,1985,1112,X,10,
2,1985_1116,1985,1116,X,9,
3,1985_1120,1985,1120,Z,11,
4,1985_1130,1985,1130,Y,11,


In [89]:
# Rows without a match in 'final_four' have a missing flag after the merge;
# replace the missing values with the string '0'
# (fill with the integer 0 instead if the flag is numeric)
dataset['final_four'] = dataset['final_four'].fillna('0')
# Show the dataset the way we want it
dataset

Unnamed: 0,id,season,team,region,seed,final_four
0,1985_1104,1985,1104,X,7,0
1,1985_1112,1985,1112,X,10,0
2,1985_1116,1985,1116,X,9,0
3,1985_1120,1985,1120,Z,11,0
4,1985_1130,1985,1130,Y,11,0
5,1985_1173,1985,1173,Z,9,0
6,1985_1177,1985,1177,W,10,0
7,1985_1181,1985,1181,Y,3,0
8,1985_1192,1985,1192,Z,16,0
9,1985_1207,1985,1207,W,1,1


## Logistic regression

In [90]:
# Historical data: the rows of every season except 2019
dataset_historical = dataset[dataset['season'] != 2019]
dataset_historical.head()

Unnamed: 0,id,season,team,region,seed,final_four
0,1985_1104,1985,1104,X,7,0
1,1985_1112,1985,1112,X,10,0
2,1985_1116,1985,1116,X,9,0
3,1985_1120,1985,1120,Z,11,0
4,1985_1130,1985,1130,Y,11,0


In [91]:
# Create a 2019 DataFrame: only the rows of the 2019 season
dataset_2019 = dataset[dataset['season'] == 2019]
dataset_2019.head()

Unnamed: 0,id,season,team,region,seed,final_four
2218,2019_1101,2019,1101,Y,15,0
2219,2019_1113,2019,1113,X,11,0
2220,2019_1120,2019,1120,Y,5,0
2221,2019_1124,2019,1124,X,9,0
2222,2019_1125,2019,1125,W,11,0


In [92]:
# We'll use 'X' for our predictors and 'y' for our target.
# The predictor is the 'seed' column, selected by name rather than by
# position so the selection keeps working if the columns are reordered.
# The double brackets keep X two-dimensional (a single column), and
# .values returns the values as an array instead of a DataFrame.
X = dataset_historical[['seed']].values
X

array([[ 7],
       [10],
       [ 9],
       ...,
       [ 4],
       [14],
       [ 1]])

In [93]:
# Target: the 'final_four' flag, selected by name rather than by position,
# as a one-dimensional array
y = dataset_historical['final_four'].values
y

array(['0', '0', '0', ..., '0', '0', '0'], dtype=object)

In [94]:
# Split X and y into training (70%) and testing (30%) sets using a
# seeded random permutation of the row positions.
# An alternative would be
# sklearn.model_selection.train_test_split(X, y, test_size=0.3, random_state=0).
import numpy as np

# Number of rows in the historical dataset
length = dataset_historical.shape[0]
# Position at which the shuffled rows switch from training to testing
train_index_stop = int(length * 0.7)
# Shuffled row positions 0..length-1; the fixed seed makes the split repeatable
permutation = np.random.RandomState(0).permutation(length)
# The first 70% of the shuffled positions train the model, the rest test it
train_permutation, test_permutation = (
    permutation[:train_index_stop],
    permutation[train_index_stop:],
)

# Pick the corresponding rows of the predictors and of the target
X_train, X_test = X[train_permutation], X[test_permutation]
y_train, y_test = y[train_permutation], y[test_permutation]

In [95]:
# Inspect the training predictors
X_train

array([[ 9],
       [ 7],
       [10],
       ...,
       [ 6],
       [ 6],
       [15]])

In [96]:
# Inspect the training targets
y_train

array(['0', '0', '0', ..., '0', '0', '0'], dtype=object)

In [97]:
# We want the predictors to be on the same scale
# (mean of 0, standard deviation of 1)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
# The seeds are integers; cast them to float explicitly before scaling.
# The scaled values are floats either way, and the explicit cast avoids
# relying on an implicit conversion inside the scaler (presumably the
# source of the warning this cell used to emit).
# Fit the scaler on the training data only, then apply the same scaling
# to the test data.
X_train = sc_X.fit_transform(X_train.astype(float))
X_test = sc_X.transform(X_test.astype(float))
X_train



array([[ 0.07074208],
       [-0.35729575],
       [ 0.28476099],
       ...,
       [-0.57131467],
       [-0.57131467],
       [ 1.35485557]])

In [98]:
# Train a logistic regression classifier on the scaled training set
from sklearn.linear_model import LogisticRegression
classifierObj = LogisticRegression(random_state=0)
classifierObj.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [99]:
# Show the training targets again (the labels the classifier was fitted on)
y_train

array(['0', '0', '0', ..., '0', '0', '0'], dtype=object)

In [100]:
# Predict a class label for every row of the scaled test set
y_pred = classifierObj.predict(X=X_test)

In [101]:
# Evaluate the predictions with a confusion matrix
# (rows: actual class, columns: predicted class)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[628   0]
 [ 38   0]]


In [102]:
# So, we identified 0 of the 38 Final Four teams in the test set:
# every team was predicted as class '0'.
# This is to be expected, because the model, by default,
# will only predict a class if it has a greater than
# 50% probability. None of these teams reach that for class '1'.

In [103]:
# Predicted class probabilities for every test row:
# one column per class, in the order [0, 1]
prob = classifierObj.predict_proba(X=X_test)
prob

array([[0.99570463, 0.00429537],
       [0.98996664, 0.01003336],
       [0.8323262 , 0.1676738 ],
       ...,
       [0.8323262 , 0.1676738 ],
       [0.97674256, 0.02325744],
       [0.99880349, 0.00119651]])

In [104]:
# Keep only the second column of `prob` (index 1): the predicted probability of class '1'
test_predicted_probability = prob[:, 1]
# One probability per test row
print(test_predicted_probability.shape)
# Predicted y probabilities
test_predicted_probability

(666,)


array([0.00429537, 0.01003336, 0.1676738 , 0.00078094, 0.1676738 ,
       0.1676738 , 0.11616253, 0.1676738 , 0.03521208, 0.00078094,
       0.1676738 , 0.32124712, 0.00183281, 0.23592982, 0.02325744,
       0.00078094, 0.0152971 , 0.03521208, 0.32124712, 0.05297826,
       0.1676738 , 0.00280654, 0.1676738 , 0.00078094, 0.01003336,
       0.23592982, 0.02325744, 0.01003336, 0.0065688 , 0.03521208,
       0.00119651, 0.00429537, 0.02325744, 0.00078094, 0.00280654,
       0.03521208, 0.00183281, 0.0065688 , 0.02325744, 0.00429537,
       0.23592982, 0.11616253, 0.05297826, 0.23592982, 0.11616253,
       0.1676738 , 0.23592982, 0.0152971 , 0.0152971 , 0.1676738 ,
       0.32124712, 0.32124712, 0.07897458, 0.05297826, 0.00119651,
       0.03521208, 0.0065688 , 0.02325744, 0.1676738 , 0.23592982,
       0.02325744, 0.03521208, 0.32124712, 0.00078094, 0.00119651,
       0.1676738 , 0.05297826, 0.03521208, 0.0152971 , 0.02325744,
       0.32124712, 0.00429537, 0.1676738 , 0.00429537, 0.07897

In [105]:
# Actual y values of the test set, for comparison with the predicted probabilities
y_test

array(['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1',
       '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0