<a href="https://colab.research.google.com/github/YiiiGao/STA561_Final_Project/blob/main/STA561_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score, ndcg_score
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import StandardScaler
import itertools
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
raw_dataset = pd.read_csv('CollegeBasketballPlayers2009-2021.csv', low_memory=False)

In [3]:
new_dataset = raw_dataset.loc[raw_dataset['pick'] >= 1]
trainset = new_dataset.loc[new_dataset['year'] < 2021]
testset = new_dataset.loc[raw_dataset['year'] == 2021]

In [4]:
features_basic = ['treb', 'ast', 'stl', 'blk', 'pts', 'yr']
features_advanced = ['eFG', 'TS_per', 'FT_per', 'twoP_per', 'TP_per', 'ast/tov', 
                     'obpm', 'dbpm', 'oreb', 'dreb', 'TO_per', 'ORB_per', 'DRB_per',
                     'AST_per', 'blk_per', 'stl_per', 'Min_per']
trainset['yr'] = trainset['yr'].rank(method='dense', ascending=True).astype(int)
testset['yr'] = testset['yr'].rank(method='dense', ascending=True).astype(int)
trainset['ast/tov'] = trainset['ast/tov'].fillna(trainset['ast/tov'].value_counts().index[1])
X_train = np.asarray(trainset[features_basic + features_advanced])
X_test = np.asarray(testset[features_basic + features_advanced])
y_train_pick = np.asarray(trainset.pick)
y_train_year = np.asarray(trainset.year)
y_test = np.asarray(testset.pick)

#Regression

In [None]:
ball_model = RandomForestRegressor()
ball_model.fit(X_train, y_train_pick)

y_pred = ball_model.predict(X_test)
y_pred = y_pred.argsort().argsort()
y_test = y_test.argsort().argsort()
print(y_pred)
print(y_test)
mae = mean_absolute_error(y_pred, y_test)
print("MAE: {:,.5f}".format(mae))

[43 41 12 45 17 23 47 40 11 31 33 34 15 46 10 44 16 32  8 19 27 26 48 30
 18 20 39 36 28 25 22  1  6 37  9  4 38  3  0  2 24 13 29  5  7 14 42 21
 35]
[34 44 11 47  5 42 27 40 19 12 26 48 43 37 32 36 30 45 33 31  7  4 38 20
  9 13 46 24 28 25 14  0  2  8 18 39  6  3  1 10 29 15 22 21 23 17 35 16
 41]
MAE: 10.93878


# Ranking

In [5]:
y_train = np.c_[y_train_pick, y_train_year]

In [6]:
def transform_pairwise(X, y):
    X_new = []
    y_new = []
    y = np.asarray(y)
    if y.ndim == 1:
        y = np.c_[y, np.ones(y.shape[0])]
    perm = itertools.permutations(range(X.shape[0]), 2)
    for k, (i, j) in enumerate(perm):
        if y[i][0] == y[j][0] or y[i][1] != y[j][1]:
            continue
        X_new.append(np.concatenate((X[i], X[j]), axis=None))
        y_new.append(np.sign(y[i][0] - y[j][0]))
    return np.asarray(X_new), np.asarray(y_new).ravel()

In [7]:
def calc_ndcg(y_pred, y_test):
    scores = np.zeros(y_test.shape)
    for i in range(len(scores)):
        scores[i] = np.sum(y_pred[i * y_test.shape[0]: (i + 1) * y_test.shape[0]])
    rank_pred = scores.argsort().argsort()
    return rank_pred, ndcg_score([y_test.argsort().argsort()], [rank_pred])

In [8]:
X_new, y_new = transform_pairwise(X_train, y_train)
X_test_new, y_test_new = transform_pairwise(X_test, y_test)
rank_model = XGBClassifier()
rank_model.fit(X_new, y_new)
y_pred = rank_model.predict(X_test_new)
print(y_pred)
rank_pred, score = calc_ndcg(y_pred, y_test)
print(score)

[-1.  1. -1. ...  1. -1. -1.]
0.9285381718884151


##Neural Network

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [31]:
#model architecture 
class BinaryClassification(nn.Module):
    def __init__(self):
        super(BinaryClassification, self).__init__()
        # Number of input features is 48.
        self.layer_1 = nn.Linear(46, 128) 
        self.layer_2 = nn.Linear(128, 128)
        self.layer_out = nn.Linear(128, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.3)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        #x = self.dropout(x)
        x = self.relu(self.layer_2(x))
        x = self.dropout(x)
        #x = self.dropout(x)
        x = self.layer_out(x)
        return x

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = BinaryClassification()
model.to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [11]:
X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(X_new, y_new, train_size=0.2)
class TrainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


train_data = TrainData(torch.FloatTensor(X_train_nn), 
                       torch.FloatTensor(np.where(y_train_nn == -1, 0, y_train_nn)))
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)

class ValidationData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data

    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)
val_data = ValidationData(torch.FloatTensor(X_test_nn))
val_loader = DataLoader(dataset=val_data, batch_size=1)

class TestData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    

test_data = TestData(torch.FloatTensor(X_test_new))
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [12]:
def calc_train_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [13]:
def calc_val_acc(model, dataloader):
    y_pred_list = []
    model.eval()
    with torch.no_grad():
        for X_batch in dataloader:
            X_batch = X_batch.to(device)
            y_test_pred = model(X_batch)
            y_test_pred = torch.sigmoid(y_test_pred)
            y_pred_tag = torch.round(y_test_pred)
            y_pred_list.append(y_pred_tag.cpu().numpy())

    y_pred_list = np.array([int(a.squeeze().tolist()) for a in y_pred_list])
    acc = accuracy_score(np.where(y_test_nn == -1, 0, y_test_nn), y_pred_list)
    return acc

In [32]:
model.train()
best_accuracy = 0
train_accuracy_list = []
val_accuracy_list = []
epochs = 100
for e in range(epochs):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = calc_train_acc(y_pred, y_batch.unsqueeze(1))
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    if e % 10 == 0:
        val_accuracy = calc_val_acc(model, val_loader)
        val_accuracy_list.append(val_accuracy)
        train_accuracy_list.append(epoch_acc/len(train_loader))
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), 'rank.pt') # save best model
        print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | train acc: {epoch_acc/len(train_loader):.3f} | val acc: {val_accuracy:.3f}')

Epoch 000: | Loss: 0.70153 | train acc: 58.430 | val acc: 0.648
Epoch 010: | Loss: 0.61972 | train acc: 65.554 | val acc: 0.665
Epoch 020: | Loss: 0.61727 | train acc: 65.452 | val acc: 0.663
Epoch 030: | Loss: 0.61609 | train acc: 65.784 | val acc: 0.645
Epoch 040: | Loss: 0.61071 | train acc: 66.202 | val acc: 0.655
Epoch 050: | Loss: 0.61535 | train acc: 66.024 | val acc: 0.667
Epoch 060: | Loss: 0.61061 | train acc: 66.196 | val acc: 0.630
Epoch 070: | Loss: 0.61014 | train acc: 66.326 | val acc: 0.651
Epoch 080: | Loss: 0.61965 | train acc: 65.172 | val acc: 0.664
Epoch 090: | Loss: 0.61199 | train acc: 66.120 | val acc: 0.665


In [30]:
best_state_dict = torch.load('rank.pt')
model.load_state_dict(best_state_dict) 
y_pred = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred.append(y_pred_tag.cpu().numpy())

y_pred = np.array([int(i.squeeze().tolist()) for i in y_pred])
print(y_test_new)
print(y_pred)
print('Pairwise accuracy: ', accuracy_score(y_test_new, np.where(y_pred == 0, -1, y_pred)))
rank, score = calc_ndcg(np.where(y_pred == 0, -1, y_pred), y_test)
print("NDCG score: ", score)

[-1.  1. -1. ...  1.  1.  1.]
[0 1 0 ... 1 1 0]
Pairwise accuracy:  0.6207482993197279
NDCG score:  0.8964692007356542


In [18]:
df = pd.DataFrame(data={'Name': testset['player_name'].values,
                        'Rank': np.array([item + 1 for item in rank_pred])})
df.to_csv('Rank.csv')