# Neural Networks Final Project -- Jeopardy Model

In [1]:
# Run this block to load important libraries and set things up
import torch
from torch import nn
import numpy as np
import scipy.signal
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
%config InlineBackend.figure_format='retina'

## set up / data pre-processing

In [32]:
data_df = pd.read_csv('JEOPARDY_CSV.csv')

print(data_df.shape)
data_df.head()

(216930, 7)


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [16]:
# FOR DEBUGGING
data_df = data_df.iloc[:10000,:]

### polish & remove outlier data 

In [33]:
data_df = data_df[data_df[' Value'].notnull()] # remove null values 

def extract_val(x):
    x = x.replace(',', '')
    x = x.replace('$', '')
    return int(x)

values = data_df[' Value'].apply(extract_val) # converting value strings to ints
data_df[' Value'] = values

data_df = data_df[data_df[' Value'] <= 2000] # removing data with values over 2000 #TODO: bin value data?

data_df[' Question'] = data_df[' Question'].str.lower() # no repeat characters
print(data_df.shape)
data_df

(210452, 7)


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,200,"for the last 8 years of his life, galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,200,no. 2: 1912 olympian; football star at carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,200,the city of yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,200,"in 1963, live on ""the art linkletter show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,200,"signer of the dec. of indep., framer of the co...",John Adams
...,...,...,...,...,...,...,...
216924,4999,2006-05-11,Double Jeopardy!,OFF-BROADWAY,2000,in 2006 the cast of this long-running hit emba...,Stomp
216925,4999,2006-05-11,Double Jeopardy!,RIDDLE ME THIS,2000,this puccini opera turns on the solution to 3 ...,Turandot
216926,4999,2006-05-11,Double Jeopardy!,"""T"" BIRDS",2000,in north america this term is properly applied...,a titmouse
216927,4999,2006-05-11,Double Jeopardy!,AUTHORS IN THEIR YOUTH,2000,"in penny lane, where this ""hellraiser"" grew up...",Clive Barker


## make vocab dictionary

In [34]:
chars = set() 
for question in data_df[' Question']: 
    chars.update(question)

possible_vals = sorted(set(data_df[ ' Value']))

# create mappings
char_to_idx = {ch:i for i,ch in enumerate(chars)}
idx_to_char = {i:ch for i,ch in enumerate(chars)}

print(char_to_idx)
# remove newlines!
char_to_idx['\n'] = char_to_idx[' ']

vocab_size = len(chars)

# printing some stuff
print("Characters:", chars)
print("Number of unique characters:", vocab_size)
print("All possible values:", possible_vals)

data_df[' Question'] = data_df[' Question'].apply(lambda x: [char_to_idx[ch] for ch in x]) # converting string to int array

{'ö': 0, 'e': 1, '-': 2, 'ç': 3, 'm': 4, '>': 5, 'ê': 6, '£': 7, '^': 8, '#': 9, '¿': 10, '–': 11, '3': 12, 'ì': 13, 's': 14, '¾': 15, '<': 16, 'à': 17, 'r': 18, '0': 19, 'x': 20, '7': 21, ' ': 22, 'í': 23, 'j': 24, '@': 25, 'ô': 26, '|': 27, '5': 28, "'": 29, '’': 30, 'u': 31, 'è': 32, '"': 33, 'ñ': 34, '¢': 35, '(': 36, 'v': 37, 'k': 38, 'h': 39, '.': 40, '=': 41, ';': 42, 'd': 43, 'q': 44, '6': 45, 'o': 46, '?': 47, '[': 48, ',': 49, 'î': 50, 'z': 51, '8': 52, '1': 53, ')': 54, '$': 55, ']': 56, 'ü': 57, 'é': 58, 'g': 59, 'â': 60, 't': 61, 'f': 62, '‘': 63, '”': 64, 'y': 65, '2': 66, '9': 67, '&': 68, 'b': 69, '+': 70, 'p': 71, 'l': 72, '4': 73, 'c': 74, '—': 75, 'ã': 76, '“': 77, '*': 78, '½': 79, '`': 80, '…': 81, '°': 82, 'a': 83, 'w': 84, '!': 85, 'i': 86, '%': 87, 'á': 88, '²': 89, 'n': 90, '/': 91, 'ó': 92, '_': 93, 'º': 94, '¼': 95, 'å': 96, 'ë': 97, ':': 98}
Characters: {'ö', 'e', '-', 'ç', 'm', '>', 'ê', '£', '^', '#', '¿', '–', '3', 'ì', 's', '¾', '<', 'à', 'r', '0', 'x', 

## make train/test split

In [35]:
seed = 42 
train_df = data_df.sample(frac=0.8, random_state=seed)
test_df = data_df.drop(train_df.index)

In [36]:
from torch.nn.utils.rnn import pad_sequence

train_labels = torch.tensor([possible_vals.index(x) for x in train_df[' Value'].tolist()]) # converting values to position in possible values 
train_questions = [torch.tensor(q) for q in train_df[' Question']]
train_questions_padded = pad_sequence(train_questions, batch_first=False, padding_value=44) # char_to_idx[' '] = 44

train_dataset = torch.utils.data.TensorDataset(torch.transpose(train_questions_padded, 0, 1), train_labels) 


test_labels = torch.tensor([possible_vals.index(x) for x in test_df[' Value'].tolist()])
test_questions = [torch.tensor(q) for q in test_df[' Question']]
test_questions_padded = pad_sequence(test_questions, batch_first=False, padding_value=44)

test_dataset = torch.utils.data.TensorDataset(torch.transpose(test_questions_padded, 0, 1), test_labels)

In [37]:
# FROM HW 2
from torch.utils.data.sampler import SubsetRandomSampler
ntotal = train_df.shape[0]
ntrain = int(0.9*ntotal)
nval = ntotal - ntrain

val_ix = np.random.choice(range(ntotal), size=nval, replace=False)
train_ix = list(set(range(ntotal)) - set(val_ix))

train_sampler = SubsetRandomSampler(train_ix)
val_sampler = SubsetRandomSampler(val_ix)

batch_size = 64
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
val_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=val_sampler)
test_loader = torch.utils.data.DataLoader(test_dataset)

## model

In [38]:
class JeopardyModel(nn.Module):
    def __init__(self, input_size, output_size, hidden_size):
        super(JeopardyModel, self).__init__()
        self.embed = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=input_size) 
        self.lstm = torch.nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True)
        self.output_layer = torch.nn.Linear(hidden_size * 2, output_size)
        
    def forward(self, x, hidden_state):
        x_vector = self.embed(x)
        lstm_outputs, h_n = self.lstm(x_vector, hidden_state)
        outputs = self.output_layer(lstm_outputs)         # [:, -1, :]     
        return outputs, h_n 

## training 

In [39]:
hidden_size = 256

model = JeopardyModel(input_size=vocab_size, output_size=len(possible_vals), hidden_size=hidden_size)
loss_func = torch.nn.CrossEntropyLoss() # torch.nn.MSELoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
nepoch = 10

In [40]:
for images, labels in train_loader:
    print(images.shape, labels.shape)
    model = JeopardyModel(input_size=vocab_size, output_size=len(possible_vals), hidden_size=hidden_size)
    
    batch_size = images.size(1)
    h = torch.zeros((2, batch_size, hidden_size)) 
    c = torch.zeros((2, batch_size, hidden_size))
    
    outputs, _ = model(images, (h,c))
    print(outputs.shape)
    break

torch.Size([64, 860]) torch.Size([64])
torch.Size([64, 46])


In [43]:
# FROM HW 2
from tqdm.notebook import tqdm
import torch.nn.functional as F

def train_network(model, train_loader, val_loader, criterion, optimizer, nepoch):
    try:
        for epoch in tqdm(range(nepoch)):
            print('EPOCH %d'%epoch)
            total_loss = 0
            count = 0

            h = torch.zeros((2, batch_size,hidden_size))    
            c = torch.zeros((2, batch_size,hidden_size))
            for inputs, labels in train_loader:
                if inputs.shape[0] < 64: continue # dropping last batch

                optimizer.zero_grad()
                outputs, (h,c) = model(inputs , (h.detach(), c.detach()))
                # print("o", outputs)
                # print("l", labels)
                loss = criterion(outputs, labels.long())
                loss.backward()
                optimizer.step()
                # print(loss.item())
                total_loss += loss.item()
                count += 1
            print('{:>12s} {:>7.5f}'.format('Train loss:', total_loss/count))
            with torch.no_grad():
                total_loss = 0
                count = 0

                h = torch.zeros((2, batch_size,hidden_size))  
                c = torch.zeros((2, batch_size,hidden_size))
                for inputs, labels in val_loader:
                    if inputs.shape[0] < 64: continue # dropping last batch
                    outputs, (h,c) = model(inputs , (h.detach(), c.detach()))
                    loss = criterion(outputs, labels)
                    total_loss += loss.item()
                    count += 1
                print('{:>12s} {:>7.5f}'.format('Val loss:', total_loss/count))
            print()
    except KeyboardInterrupt:
        print('Exiting from training early')
    return

In [44]:
train_network(model, train_loader, val_loader, loss_func, optimizer, nepoch)

  0%|          | 0/10 [00:00<?, ?it/s]

EPOCH 0


## test

In [18]:
def test_network(model, test_loader, mode):
    true, pred = [], []
    total = correct = 0
    with torch.no_grad():
        for inputs, labels  in test_loader:
            batch = inputs.shape[0]
            h = torch.zeros((2, batch,hidden_size))    
            c = torch.zeros((2, batch,hidden_size))

            outputs, _  = model(inputs, (h,c))
            print(outputs)
            predicted = np.argmax(outputs, axis=1) # get predicted class label for each test example.
            
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            true.append(labels)
            pred.append(predicted)
    acc = (100 * correct / total)
    print('%s accuracy: %0.3f' % (mode, acc))
    true = np.concatenate(true)
    pred = np.concatenate(pred)
    return acc, true, pred

In [19]:
acc, true, pred = test_network(model, test_loader, "Model")
acc, true, pred

tensor([[-0.0232,  0.0044,  0.0162, -0.0416,  0.0149, -0.0763, -0.0241,  0.0055,
         -0.0347,  0.0360, -0.0194,  0.0161,  0.0424,  0.0331]])
tensor([[-0.0232,  0.0044,  0.0162, -0.0416,  0.0149, -0.0763, -0.0241,  0.0055,
         -0.0347,  0.0360, -0.0194,  0.0161,  0.0424,  0.0331]])
tensor([[-0.0232,  0.0044,  0.0162, -0.0416,  0.0149, -0.0763, -0.0241,  0.0055,
         -0.0347,  0.0360, -0.0194,  0.0161,  0.0424,  0.0331]])
tensor([[-0.0232,  0.0044,  0.0162, -0.0416,  0.0149, -0.0763, -0.0241,  0.0055,
         -0.0347,  0.0360, -0.0194,  0.0161,  0.0424,  0.0331]])
tensor([[-0.0232,  0.0044,  0.0162, -0.0416,  0.0149, -0.0763, -0.0241,  0.0055,
         -0.0347,  0.0360, -0.0194,  0.0161,  0.0424,  0.0331]])
tensor([[-0.0232,  0.0044,  0.0162, -0.0416,  0.0149, -0.0763, -0.0241,  0.0055,
         -0.0347,  0.0360, -0.0194,  0.0161,  0.0424,  0.0331]])
tensor([[-0.0232,  0.0044,  0.0162, -0.0416,  0.0149, -0.0763, -0.0241,  0.0055,
         -0.0347,  0.0360, -0.0194,  0.0161

(0.0,
 array([ 1,  1,  5,  5,  6,  6,  7,  3,  3,  6,  9, 11, 13,  1,  3,  3,  5,
         7,  3,  3,  6,  6,  9,  9, 11, 11, 13,  1,  2,  3,  3,  4,  5,  6,
         7,  6,  7,  0,  3,  3,  3,  1,  1,  3,  3,  5,  5,  6,  1,  3,  5,
         5,  5,  6,  6,  6,  7,  9,  9, 13,  3,  6,  7,  7,  6, 11, 11, 13,
        13,  3,  4,  1,  1,  3,  3,  3,  5,  5,  8,  6,  7,  1,  3,  6,  3,
        13, 13, 13, 13,  1,  1,  3,  5,  5,  5,  5,  7,  3,  3,  3,  6,  9,
         9, 11, 11,  0,  1,  3,  4,  1,  3,  3,  5,  5,  5, 13,  5,  5,  6,
         7,  7,  1,  5,  7,  6,  6,  1,  1,  2, 10,  4,  4,  3,  5,  6,  6,
         7,  1,  1,  1,  3,  3,  7,  7,  3,  3,  3,  3,  6,  9,  9, 11, 13,
         3,  3,  5,  5,  5,  5,  6,  6,  7,  7,  6,  6, 11, 11,  1,  1,  3,
         3,  7,  5,  7,  7,  3,  6, 11, 11, 13, 13,  0,  0,  1,  2,  1,  3,
        13,  7,  4,  1,  1,  3,  6,  4]),
 array([12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 

## transformers

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=0.1)
        
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)
        self.decoder = nn.Linear(d_model, len(possible_vals))
        
    def forward(self, src, tgt):
        src = self.embedding(src)
        src = self.pos_encoder(src)
        tgt = tgt.unsqueeze(1).unsqueeze(-1).expand(-1, src.size(1), src.size(2))
        output = self.transformer(src, tgt)
        output = self.decoder(output)
        return output

In [2]:
def train(model, criterion, optimizer, dataloader, device):
    model.train()
    total_loss = 0
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(inputs, labels.float())
        loss = criterion(output[:,-1,:], labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, criterion, dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            output = model(inputs, labels.float())
            loss = criterion(output[:,-1,:], labels)
            total_loss += loss.item()
    return total_loss / len(dataloader)

In [3]:
vocab_size = len(chars)  # Example vocab size, replace with actual vocab size
d_model = vocab_size # 32  # Dimension of the model
nhead = 4  # Number of attention heads
num_encoder_layers = 2  # Number of encoder layers
num_decoder_layers = 2  # Number of decoder layers
dim_feedforward = 128  # Dimension of the feedforward network
dropout = 0.1  # Dropout rate

model = TransformerModel(vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Step 3: Train and test the model
num_epochs = 5

for epoch in range(num_epochs):
    print("Epoch: ", epoch)
    train_loss = train(model, criterion, optimizer, train_loader, 'cpu')
    print("train loss = ", train_loss)
    test_loss = evaluate(model, criterion, test_loader, 'cpu')
    print("test loss = ", test_loss)

NameError: name 'chars' is not defined

In [None]:
def test_network(model, test_loader, mode):
    true, pred = [], []
    total = correct = 0
    with torch.no_grad():
        for inputs, labels  in test_loader:

            outputs = model(inputs, labels.float())
            predicted = np.argmax(outputs, axis=1) # get predicted class label for each test example.
            # predicted = outputs[:, -1, :]
            
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            true.append(labels)
            pred.append(predicted)
    acc = (100 * correct / total)
    print('%s accuracy: %0.3f' % (mode, acc))
    true = np.concatenate(true)
    pred = np.concatenate(pred)
    return acc, true, pred

In [None]:
acc, true, pred = test_network(model, test_loader, "Model")
acc, true, pred