<a href="https://colab.research.google.com/github/Zarif123/SSLM-Project/blob/main/chess_model_50k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install transformers
!pip install python-chess

In [2]:
from datetime import datetime

from google.colab import drive
drive.mount('/content/gdrive')
folder = "/content/gdrive/MyDrive/Statistical_LM_Group_Folder"

csv_file = "chess_data_with_buckets.csv"
csv_path = f"{folder}/{csv_file}"


now = datetime.now()
dt_string = now.strftime("%m_%d_%Y_%H_%M")

model_file = "classifier_model.pth"
datetime = dt_string
model_path = f"{folder}/{datetime}_{model_file}"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, get_linear_schedule_with_warmup
import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from imblearn.under_sampling import RandomUnderSampler

###Parameters

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

params = {
    "batch_size": 4,
    "epochs": 5,
    "learning_rate": 0.00003,
    "warmup_steps": 0.01,
    "epsilon": 1e-8,
    "accum_iter": 8,
    "num_classes": 6,
    "dropout": 0.5
}

class Params:
  def __init__(self, **kwargs):
    for key, value in kwargs.items():
      setattr(self, key, value)

params = Params(**params)

### Classify Model

In [5]:
class BertClassifier(nn.Module):
  def __init__(self, dropout=params.dropout, num_classes=params.num_classes):
    super(BertClassifier, self).__init__()

    self.bert = BertModel.from_pretrained('bert-base-cased')
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(768, num_classes)
    # self.relu = nn.ReLU()
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, input_id, mask):
    _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
    dropout_output = self.dropout(pooled_output)
    linear_output = self.linear(dropout_output)
    final_layer = self.softmax(linear_output)

    return final_layer

### Dataset Class

In [6]:
class ChessDataset(torch.utils.data.Dataset):
  def __init__(self, moves, labels):
      tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
      self.labels = torch.Tensor(labels.values)
      self.moves = [tokenizer(move, 
                              padding='max_length', max_length = 256, truncation=True,
                              return_tensors="pt") for move in moves]

  def __len__(self):
      return len(self.labels)

  def __getitem__(self, idx):
      return self.moves[idx], self.labels[idx]

### Training Function

In [7]:
def train(model, train_dataloader, val_dataloader, criterion, optimizer):
    num_batches = len(train_dataloader)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = params.warmup_steps, num_training_steps = num_batches * params.epochs)
    print(f"Number of batches: {num_batches}")
    for epoch_num in range(params.epochs):
        total_acc_train = 0
        total_loss_train = 0

        for batch_idx, (train_input, train_label) in enumerate(train_dataloader):
        #   print(sum(train_input['input_ids'].squeeze(1)[0] == 0))
          train_label = train_label.to(device)
          mask = train_input['attention_mask'].to(device)
          input_id = train_input['input_ids'].squeeze(1).to(device)

          output = model(input_id, mask)
          
          batch_loss = criterion(output, train_label.long())
          total_loss_train += batch_loss.item()
          
          acc = (output.argmax(dim=1) == train_label).sum().item()
          total_acc_train += acc

          batch_loss.backward()

          # Gradient Accumulation
          if ((batch_idx + 1) % params.accum_iter == 0) or (batch_idx + 1 == num_batches):
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

          if batch_idx % 50 == 0:
            print(f"Batch Number: {batch_idx}")
            print(f"Model Output: {torch.exp(output).tolist()}")
        
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

          for val_input, val_label in val_dataloader:
            val_label = val_label.to(device)
            mask = val_input['attention_mask'].to(device)
            input_id = val_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            
            batch_loss = criterion(output, val_label.long())
            total_loss_val += batch_loss.item()
            
            acc = (output.argmax(dim=1) == val_label).sum().item()
            total_acc_val += acc
        
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / (params.batch_size * len(train_dataloader)): .3f} \
            | Train Accuracy: {total_acc_train / (params.batch_size * len(train_dataloader)): .3f} \
            | Val Loss: {total_loss_val / (params.batch_size * len(val_dataloader)): .3f} \
            | Val Accuracy: {total_acc_val / (params.batch_size * len(val_dataloader)): .3f}')
        
        torch.save(model.state_dict(), model_path)

### Test Function

In [8]:
def test(model, test_dataloader):
  total_acc_test = 0
  with torch.no_grad():
    for test_input, test_label in test_dataloader:
      test_label = test_label.to(device)
      mask = test_input['attention_mask'].to(device)
      input_id = test_input['input_ids'].squeeze(1).to(device)

      output = model(input_id, mask)
      print(f"Model Output: {torch.exp(output).tolist()}")
      print(f"Prediction: {output.argmax(dim=1).tolist()}, Truth: {test_label.tolist()}")

      acc = (output.argmax(dim=1) == test_label).sum().item()
      total_acc_test += acc
       
      print(f"Test Accuracy: {total_acc_test / len(test_dataloader): .3f}")

### Loading Data

In [9]:
%%capture
chess_data = pd.read_csv(csv_path)[0:50000]
X = chess_data["Moves"]
y = chess_data["Bucket"]

balancer = RandomUnderSampler()
X = X.values.reshape(-1, 1)
X, y = balancer.fit_resample(X, y)
rating_counts = y.value_counts().sort_index()
X = X.flatten()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7, shuffle=True) # Splits into train/test
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1, shuffle=True) # Splits train into train/val

train_data, val_data, test_data = ChessDataset(X_train, y_train), ChessDataset(X_val, y_val), ChessDataset(X_test, y_test)

In [10]:
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=params.batch_size, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=params.batch_size)
test_dataloader = torch.utils.data.DataLoader(test_data)

### Initialize Model

In [11]:
%%capture
model = BertClassifier()
model = model.to(device)
criterion = nn.NLLLoss()
optimizer = optim.AdamW(model.parameters(), lr = params.learning_rate)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Run Training

In [12]:
train(model, train_dataloader, val_dataloader, criterion, optimizer)

Number of batches: 2292
Batch Number: 0
Model Output: [[0.11288940161466599, 0.17289741337299347, 0.24391284584999084, 0.2786654233932495, 0.040371425449848175, 0.15126341581344604], [0.11338809877634048, 0.27353596687316895, 0.20497630536556244, 0.19404925405979156, 0.08288697898387909, 0.13116341829299927], [0.1417040228843689, 0.15040242671966553, 0.09545133262872696, 0.24968171119689941, 0.1523793786764145, 0.21038122475147247], [0.10611627250909805, 0.19385838508605957, 0.3376048803329468, 0.12869688868522644, 0.14324387907981873, 0.09047969430685043]]
Batch Number: 50
Model Output: [[0.07277684658765793, 0.1259813755750656, 0.21772126853466034, 0.21929126977920532, 0.19855830073356628, 0.16567103564739227], [0.1417275369167328, 0.1342216283082962, 0.13522346317768097, 0.2196359783411026, 0.13611608743667603, 0.2330753356218338], [0.16899089515209198, 0.1403016895055771, 0.0742308497428894, 0.17880654335021973, 0.10488918423652649, 0.3327808976173401], [0.2035016119480133, 0.21663

### Run Testing

In [13]:
#test_model_name = "05_31_2023_08_04_classifier_model.pth"
#test_model_path = f"{folder}/{test_model_name}"

test_model = BertClassifier()
test_model = test_model.to(device)
test_model.load_state_dict(torch.load(model_path))
test(test_model, test_dataloader)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Prediction: [4], Truth: [2.0]
Test Accuracy:  0.135
Model Output: [[0.55080246925354, 0.2438398152589798, 0.1622779816389084, 0.03093107044696808, 0.005257998127490282, 0.006890757009387016]]
Prediction: [0], Truth: [0.0]
Test Accuracy:  0.136
Model Output: [[0.013421724550426006, 0.03584130480885506, 0.08170710504055023, 0.16909991204738617, 0.3524433970451355, 0.34748658537864685]]
Prediction: [4], Truth: [5.0]
Test Accuracy:  0.136
Model Output: [[0.009634742513298988, 0.02822927013039589, 0.08292979747056961, 0.08551797270774841, 0.17085838317871094, 0.622829794883728]]
Prediction: [5], Truth: [4.0]
Test Accuracy:  0.136
Model Output: [[0.09593446552753448, 0.1983995884656906, 0.26617521047592163, 0.2263532429933548, 0.1277642548084259, 0.08537333458662033]]
Prediction: [2], Truth: [3.0]
Test Accuracy:  0.136
Model Output: [[0.021250272169709206, 0.06413603574037552, 0.15816055238246918, 0.25450125336647034, 0.2937950