In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from data_prep import clean_data
#from get_stats import read_data_from_files #nbastuffer_dataframe

In [2]:
stats_df = clean_data() # preprocessed, clean data
stats_df

Unnamed: 0,POS,AGE,GP,MPG,USG%,FTA,FT%,2PA,2P%,3PA,...,TS%,PPG,RPG,APG,SPG,BPG,TPG,VI,ORTG,DRTG
0,G,25.69,31,19.0,12.2,13,0.923,30,0.500,127,...,0.507,5.3,1.5,0.6,0.55,0.19,0.45,3.5,103.1,103.9
1,F,28.51,10,12.3,9.2,10,0.700,3,0.667,15,...,0.379,1.7,2.5,0.8,0.10,0.40,0.40,4.9,87.1,98.5
2,G,22.93,34,12.6,13.5,9,0.778,36,0.361,74,...,0.474,3.2,1.8,1.9,0.38,0.15,0.82,7.0,99.5,108.1
3,C,25.73,80,33.4,16.4,292,0.500,807,0.596,2,...,0.591,13.9,9.5,1.6,1.49,0.96,1.73,7.1,119.9,102.7
5,F,22.19,19,10.2,9.9,4,1.000,13,0.385,23,...,0.424,1.7,1.0,0.3,0.05,0.21,0.32,3.1,85.3,115.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2957,F,20.30,19,11.6,11.0,4,0.750,18,0.500,35,...,0.411,2.4,0.9,0.6,0.40,0.50,0.10,3.8,97.3,113.2
2958,G,25.90,25,10.3,10.9,9,0.667,41,0.488,16,...,0.500,2.4,0.9,1.2,0.40,0.10,0.20,5.3,117.6,114.5
2963,G,24.80,23,12.0,11.6,3,0.667,58,0.431,2,...,0.424,2.3,1.6,1.7,0.30,0.10,0.70,6.1,93.2,114.2
2970,G,25.30,5,13.4,10.0,0,0.000,8,0.500,6,...,0.393,2.2,1.8,1.0,0.80,0.60,0.20,4.7,94.4,98.0


In [3]:
features = list(stats_df)[1:]
X, y = stats_df[features], stats_df['POS']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=0, stratify = y)

In [10]:
stats_df["POS"] = LabelEncoder().fit_transform(stats_df["POS"])

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from tqdm import trange


X = stats_df.drop(["POS"], axis = 1)
y = stats_df["POS"]

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

batch_size = 256
train_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train.values), torch.tensor(Y_train.values))
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=True)

#X_test = torch.tensor(X_test.values)
#Y_test = torch.tensor(Y_test.values)

test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test.values), torch.tensor(Y_test.values))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=True)

In [20]:
input_size = 21
hidden_size = 10
output_size = 3


class FFNN(nn.Module):

    def __init__(self):
        super(FFNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, x):
        #print('x', x)
        x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        #x = F.relu(self.fc1(x))
        #x = F.relu(self.fc2(x))
        x = F.relu(self.fc(x))
        x = F.softmax(x, dim=1)
        return x

model = FFNN()
print(model)

FFNN(
  (fc1): Linear(in_features=21, out_features=10, bias=True)
  (fc2): Linear(in_features=10, out_features=3, bias=True)
  (fc): Linear(in_features=21, out_features=3, bias=True)
)


In [21]:

def train_one_epoch(train_loader, model, device, optimizer, log_interval, epoch):
    model.train()
    losses = []
    counter = []
    
    for i, (img, label) in enumerate(train_loader):
        img, label = img.float().to(device), label.to(device)
        optimizer.zero_grad()
        #print(img.shape, img)
        #print(label.shape, label)
        outputs = model(img)
        #print(outputs.shape, outputs)
        loss = torch.nn.CrossEntropyLoss()(outputs, label)
        loss.backward()
        optimizer.step()
    
        # Record training loss every log_interval and keep counter of total training images seen
        if (i+1) % log_interval == 0:
            losses.append(loss.item())
            counter.append(
                (i * batch_size) + img.size(0) + epoch * len(train_loader.dataset))

    return losses, counter

def test_one_epoch(test_loader, model, device):
    model.eval()
    test_loss = 0
    num_correct = 0
    
    with torch.no_grad():
        for i, (img, label) in enumerate(test_loader):
            output = model(img.float())
            preds = [np.argmax(row) for row in output]
            num_correct += sum([1 for pred, true in zip(preds, label) if pred == true])
            #pred = torch.round(output)  # round probability into binary classification
            #num_correct += pred.eq(label).sum().item()
            test_loss /= len(test_loader)
            
    test_loss /= len(test_loader.dataset)
    return test_loss, num_correct

In [27]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Hyperparameters
lr = 0.005
max_epochs=20
gamma = 0.95

# Recording data
log_interval = 1

# Instantiate optimizer (model was created in previous cell)
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

train_losses = []
train_counter = []
test_losses = []
test_correct = []
for epoch in trange(max_epochs, leave=True, desc='Epochs'):
    train_loss, counter = train_one_epoch(train_loader, model, DEVICE, optimizer, log_interval, epoch)
    test_loss, num_correct = test_one_epoch(test_loader, model, DEVICE)

    # Record results
    train_losses.extend(train_loss)
    train_counter.extend(counter)
    test_losses.append(test_loss)
    test_correct.append(num_correct)


Epochs: 100%|██████████| 20/20 [01:05<00:00,  3.25s/it]


In [29]:
print(f"Test accuracy: {test_correct[-1]/len(test_loader.dataset)}")
test_correct

Test accuracy: 0.5300261096605744


[203,
 203,
 203,
 203,
 203,
 203,
 203,
 203,
 203,
 203,
 203,
 203,
 203,
 203,
 203,
 203,
 203,
 203,
 203,
 203]

In [19]:
len(test_loader.dataset)

383