In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

In [14]:
# get path from data folder
base_path = '../data/enchantments2022.csv'
raw = pd.read_csv(base_path)
raw

Unnamed: 0,Preferred Entry Date 1,Preferred Division 1,Minimum Acceptable Group Size 1,Maximum Requested Group Size 1,Preferred Entry Date 2,Preferred Division 2,Minimum Acceptable Group Size 2,Maximum Requested Group Size 2,Preferred Entry Date 3,Preferred Division 3,Minimum Acceptable Group Size 3,Maximum Requested Group Size 3,Results Status,Awarded Preference,Awarded Entry Date,Awarded Entrance Code/Name,Awarded Group Size
0,9/2/2022,Core Enchantment Zone,8,8,8/26/2022,Colchuck Zone,8.0,8.0,9/16/2022,Core Enchantment Zone,8.0,8.0,Unsuccessful,,,,
1,8/15/2022,Colchuck Zone,2,2,8/24/2022,Colchuck Zone,2.0,2.0,8/29/2022,Colchuck Zone,2.0,2.0,Unsuccessful,,,,
2,8/12/2022,Snow Zone,8,8,8/19/2022,Snow Zone,8.0,8.0,8/3/2022,Snow Zone,8.0,8.0,Unsuccessful,,,,
3,7/12/2022,Core Enchantment Zone,2,2,7/20/2022,Core Enchantment Zone,2.0,2.0,7/13/2022,Snow Zone,2.0,2.0,Unsuccessful,,,,
4,9/3/2022,Stuart Zone,4,4,8/28/2022,Stuart Zone,4.0,4.0,8/21/2022,Stuart Zone,4.0,4.0,Unsuccessful,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36822,6/28/2022,Core Enchantment Zone,2,2,8/15/2022,Core Enchantment Zone,3.0,3.0,8/22/2022,Core Enchantment Zone,4.0,4.0,Unsuccessful,,,,
36823,8/1/2022,Colchuck Zone,8,8,7/27/2022,Core Enchantment Zone,8.0,8.0,8/6/2022,Core Enchantment Zone,8.0,8.0,Unsuccessful,,,,
36824,7/11/2022,Core Enchantment Zone,4,4,8/22/2022,Core Enchantment Zone,4.0,4.0,8/2/2022,Core Enchantment Zone,4.0,4.0,Unsuccessful,,,,
36825,6/24/2022,Core Enchantment Zone,6,6,8/26/2022,Core Enchantment Zone,8.0,8.0,9/2/2022,Core Enchantment Zone,8.0,8.0,Unsuccessful,,,,


In [130]:
df = raw[raw['Results Status'] != 'Applied']
df = df[df['Results Status'] != 'Cancelled']
df = df.drop(columns=['Awarded Preference', 'Awarded Entry Date', 'Awarded Entrance Code/Name', 'Awarded Group Size'])

In [72]:
# Drop second and third preferences
#df = df.drop(columns=['Preferred Division 2', 'Preferred Division 3', 'Preferred Entry Date 2', 'Minimum Acceptable Group Size 2', 
#                      'Maximum Requested Group Size 2', 'Preferred Entry Date 3', 'Minimum Acceptable Group Size 3', 'Maximum Requested Group Size 3'])

In [132]:
# convert date to datetime
df['Preferred Entry Date 1'] = pd.to_datetime(df['Preferred Entry Date 1'])
timestamps = np.array([dt.timestamp() for dt in df['Preferred Entry Date 1']])
df['Preferred Entry Date 1'] = timestamps

# drop rows with missing values
df = df.dropna()

# convert date to datetime
df['Preferred Entry Date 2'] = pd.to_datetime(df['Preferred Entry Date 2'])
timestamps = np.array([dt.timestamp() for dt in df['Preferred Entry Date 2']])
df['Preferred Entry Date 2'] = timestamps

# convert date to datetime
df['Preferred Entry Date 3'] = pd.to_datetime(df['Preferred Entry Date 3'])
timestamps = np.array([dt.timestamp() for dt in df['Preferred Entry Date 3']])
df['Preferred Entry Date 3'] = timestamps


In [133]:
# replace successful with 1 and unsuccessful with 0
df['Results Status'] = df['Results Status'].replace('Awarded', 1)
df['Results Status'] = df['Results Status'].replace('Unsuccessful', 0)

In [134]:
# One hot encode Preferred Division 1 using torch
df['Preferred Division 1'] = df['Preferred Division 1'].astype('category')
df['Preferred Division 1'] = df['Preferred Division 1'].cat.codes

# One hot encode Preferred Division 2 using torch
df['Preferred Division 2'] = df['Preferred Division 2'].astype('category')
df['Preferred Division 2'] = df['Preferred Division 2'].cat.codes

# One hot encode Preferred Division 3 using torch
df['Preferred Division 3'] = df['Preferred Division 3'].astype('category')
df['Preferred Division 3'] = df['Preferred Division 3'].cat.codes


In [206]:
# Split into train and test
train = df.sample(frac=0.95, random_state=20)
test = df.drop(train.index)

In [207]:
# Split into features and labels
train_features = train.copy()
test_features = test.copy()

train_labels = train_features.pop('Results Status')
test_labels = test_features.pop('Results Status')

# Convert to tensors
train_features = torch.tensor(train_features.values)
train_labels = torch.tensor(train_labels.values)
test_features = torch.tensor(test_features.values)
test_labels = torch.tensor(test_labels.values)

# Create dataset
train_dataset = torch.utils.data.TensorDataset(train_features, train_labels)
test_dataset = torch.utils.data.TensorDataset(test_features, test_labels)

# Create dataloader
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

In [208]:
train_features.shape, train_labels.shape, test_features.shape, test_labels.shape

(torch.Size([33514, 12]),
 torch.Size([33514]),
 torch.Size([1764, 12]),
 torch.Size([1764]))

In [220]:
# Create model
model = torch.nn.Sequential(
    torch.nn.Linear(12, 4),
    torch.nn.Tanh(),
    torch.nn.Linear(4, 1),
    torch.nn.Sigmoid()
)

# Create optimizer
optimizer = torch.optim.Adamax(model.parameters(), lr=0.000001)

# Create loss function
loss_fn = torch.nn.BCELoss()

# Create learning rate scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')

In [221]:
# Train model

for epoch in range(50):
    # Calculate train accuracy
    correct = 0
    total = 0

    cum_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data.float())
        loss = loss_fn(output, target.float().unsqueeze(1))
        cum_loss += loss.item()
        loss.backward()
        optimizer.step()

        for idx, i in enumerate(output):
            if torch.argmax(i) == target[idx]:
                correct += 1
            total += 1

    train_acc = correct/total

    # Test model
    correct = 0
    total = 0
    with torch.no_grad():
        for data in test_loader:
            X, y = data
            output = model(X.float())
            for idx, i in enumerate(output):
                if torch.argmax(i) == y[idx]:
                    correct += 1
                total += 1

    scheduler.step(cum_loss/len(train_loader))
        
    print("Epoch: ", epoch + 1, "Loss: ", cum_loss/len(train_loader), "Train Accuracy: ", train_acc, "Test Accuracy: ", correct/total)

Epoch:  1 Loss:  0.3562721485454314 Train Accuracy:  0.9299994032344692 Test Accuracy:  0.9319727891156463
Epoch:  2 Loss:  0.35496197052673395 Train Accuracy:  0.9299994032344692 Test Accuracy:  0.9319727891156463
Epoch:  3 Loss:  0.3537058533660552 Train Accuracy:  0.9299994032344692 Test Accuracy:  0.9319727891156463
Epoch:  4 Loss:  0.35248372645821946 Train Accuracy:  0.9299994032344692 Test Accuracy:  0.9319727891156463
Epoch:  5 Loss:  0.3512326384501127 Train Accuracy:  0.9299994032344692 Test Accuracy:  0.9319727891156463
Epoch:  6 Loss:  0.35003622137276824 Train Accuracy:  0.9299994032344692 Test Accuracy:  0.9319727891156463
Epoch:  7 Loss:  0.3488312963654147 Train Accuracy:  0.9299994032344692 Test Accuracy:  0.9319727891156463
Epoch:  8 Loss:  0.34761733254840094 Train Accuracy:  0.9299994032344692 Test Accuracy:  0.9319727891156463
Epoch:  9 Loss:  0.3464372614118535 Train Accuracy:  0.9299994032344692 Test Accuracy:  0.9319727891156463
Epoch:  10 Loss:  0.3452913442773