# GloVe Classifier

This is a version of the News Classifier that uses GloVe vectors for classification. GloVe vectors were downloaded from [here](https://nlp.stanford.edu/projects/glove/). The vectors are stored in the `data` folder of this repo, which is not committed to source control.

We implement two models in this notebook: the first uses a simple averaging of word vectors while the other uses a TF-IDF weighting of word vectors.

The weighted / non-weighted averaging of word embeddings is then passed into a logistic regression model for class prediction.

**Note: There are some random processes within this notebook, so different runs of the notebook may result in different outcomes.**

**Note: This notebook assumes the data being loaded has already been randomly shuffled.**

In [9]:
import math
import os
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

import data_utils
from data_utils import WordTokenDataset, WordTokenDatasetSample


## Load and Setup the Data


In [10]:
# Dimensionality of the GloVe vectors. The file name and the loader argument
# must agree, so both are derived from this single constant.
EMBEDDING_DIM = 100
GLOVE_PATH = f'./data/glove.6B/glove.6B.{EMBEDDING_DIM}d.txt'

embeddings = data_utils.load_embeddings(GLOVE_PATH, embedding_dim=EMBEDDING_DIM)


In [11]:
# Load the training records. Per the note at the top of this notebook, the
# file is assumed to have been randomly shuffled already.
data = pd.read_json('./data/train_data.json', orient='records')


In [12]:
# Vocabulary covered by the loaded embeddings.
# NOTE(review): presumably WordTokenDataset uses this to drop tokens that have
# no embedding -- confirm against data_utils.
accepted_tokens = set(embeddings.index)
dataset = WordTokenDataset(data, accepted_tokens=accepted_tokens)


In [13]:
# Time the dataset preparation step and report the elapsed wall-clock time in
# minutes. What prepare() does is defined in data_utils and not visible here.
start_time = time()

dataset.prepare()

end_time = time()

print(f'Ran in {(end_time - start_time)/60:.02f}m.')


Ran in 0.30m.


## Hyperparameter Configurations

In [14]:
# Doing a basic hyper-parameter grid search.
#
# Each entry describes one training run:
#   'type'       - weighting scheme label ('uniform' or 'tf_idf'); used later to
#                  split results into the two model families.
#   'lr'         - learning rate handed to the optimizer.
#   'batch_size' - batch size handed to the DataLoader.
#
# Only one configuration is currently enabled; the others are kept commented
# out. With no 'tf_idf' entry enabled, the tf-idf result lists built later in
# the notebook will be empty.

hyperparams_list = [
#     { 'type': 'uniform', 'lr': 0.001,  'batch_size': 100 },
#     { 'type': 'uniform', 'lr': 0.01,   'batch_size': 100 },
#     { 'type': 'uniform', 'lr': 0.001,  'batch_size': 10  },
    { 'type': 'uniform', 'lr': 0.01,   'batch_size': 10  },
#     { 'type': 'tf_idf',  'lr': 0.001,  'batch_size': 100 },
#     { 'type': 'tf_idf',  'lr': 0.01,   'batch_size': 100 },
#     { 'type': 'tf_idf',  'lr': 0.001,  'batch_size': 10  },
#     { 'type': 'tf_idf',  'lr': 0.01,   'batch_size': 10  },
]


## Create the Model

In [15]:
class Model(torch.nn.Module):
    """Bag-of-embeddings classifier.

    Each document is reduced to a weighted sum of pretrained word vectors
    (via ``EmbeddingBag``), which is then fed to a single linear layer that
    produces one logit per class.

    Args:
        embeddings: Pretrained embedding matrix of shape (vocab, dim). Either a
            ``torch.Tensor`` or an array-like / pandas object convertible with
            ``np.asarray`` (a ``.values`` attribute is used when present).
        n_classes: Number of output classes.
        weighting: Name of the weighting scheme. It is stored on the instance
            as ``self.weighting``; note that ``forward`` currently always uses
            the uniform weights supplied by the samples object.
    """

    def __init__(self, embeddings, n_classes, weighting='uniform'):
        super().__init__()

        # Always work on a private copy so the caller's embeddings are untouched.
        if isinstance(embeddings, torch.Tensor):
            weight = embeddings.clone()
        else:
            # Non-tensor input (e.g. a DataFrame indexed by token) has no
            # .clone(); convert it to a float tensor instead.
            weight = torch.tensor(np.asarray(getattr(embeddings, 'values', embeddings)),
                                  dtype=torch.float)

        self.weighting = weighting

        # mode='sum' is required for EmbeddingBag to accept per_sample_weights.
        self.embedding_bag = torch.nn.EmbeddingBag.from_pretrained(weight, mode='sum')
        self.linear = torch.nn.Linear(self.embedding_bag.embedding_dim, n_classes)

    def forward(self, samples):
        """Return class logits for a batch of samples.

        ``samples`` must expose ``sequence``, ``offset`` and
        ``create_uniform_weights()``.
        """
        weights = samples.create_uniform_weights()
        x = self.embedding_bag(samples.sequence, samples.offset, per_sample_weights=weights)
        return self.linear(x)

    def predict(self, samples):
        """Return the index of the highest-scoring class for each sample."""
        with torch.no_grad():
            # Delegate to forward() with the same single-argument signature.
            outputs = self(samples)
            predictions = torch.argmax(outputs, dim=1)

        return predictions



## Training the Model

In [17]:
def _sample_labels(samples):
    """Return the target labels carried by a batch of samples.

    This notebook accesses sample fields both as attributes (in the model) and
    by key (in the logging code), so both conventions are accepted here.
    NOTE(review): the field name 'label' is taken from the original logging
    code -- confirm it matches data_utils.WordTokenDatasetSample.
    """
    labels = getattr(samples, 'label', None)
    if labels is None:
        labels = samples['label']
    return labels


def train(model, criterion, optimizer, dataset, data_loader, epochs, log=True,
          log_every=1, estimator_size=1000):
    """Train ``model`` and return the mean training loss of every epoch.

    Args:
        model: Callable taking a batch of samples and returning logits; must
            also provide ``predict(samples)`` when ``log`` is true.
        criterion: Loss function called as ``criterion(output, labels)``.
        optimizer: Optimizer over the model's parameters.
        dataset: Sliceable dataset, used only to estimate accuracy when logging.
        data_loader: Iterable yielding batches of samples.
        epochs: Number of passes over ``data_loader``.
        log: Whether to print accuracy and loss while training.
        log_every: Print every ``log_every`` epochs (default: every epoch).
        estimator_size: Number of consecutive dataset items used for the
            accuracy estimate.

    Returns:
        A list with one detached 0-dim tensor per epoch (mean batch loss).
    """
    train_losses = []

    for epoch in range(epochs):
        losses = []

        for samples in data_loader:
            optimizer.zero_grad()
            output = model(samples)
            loss = criterion(output, _sample_labels(samples))
            loss.backward()
            optimizer.step()

            # Detach so the stored value does not keep each batch's autograd
            # graph alive for the rest of training.
            losses.append(loss.detach())

        train_loss = torch.mean(torch.stack(losses))
        train_losses.append(train_loss)

        if log and (epoch + 1) % log_every == 0:
            # Estimate accuracy on a random contiguous window of the dataset.
            max_start = max(1, len(dataset) - estimator_size)
            random_start = torch.randint(0, max_start, (1,)).item()

            eval_samples = dataset[random_start:(random_start + estimator_size)]
            predictions = model.predict(eval_samples)
            labels = _sample_labels(eval_samples)

            total = len(labels)
            correct = torch.sum(labels == predictions)

            print(f'Epoch {epoch + 1}')
            print(f'Accuracy: {float(correct)/total*100:.02f}%.')
            print(f'Training Loss: {train_loss.item()}')
            print()

    return train_losses
    

In [18]:
# Non-Weighted Model Training.
#
# Trains one model per entry in hyperparams_list and collects, in parallel
# lists, the trained model, its per-epoch training losses and its validation
# loss.
#
# NOTE(review): the output recorded for this cell is an error
# (KeyError: 'weights') and reports "Model 1 / 8" although only one
# configuration is enabled in hyperparams_list -- the cell appears to be out of
# date with respect to the rest of the notebook.

epochs = 5

models = []
train_losses_list = []
valid_losses = []

# Same expression as in the data-setup cell above; recomputed here.
accepted_tokens = {t for t in embeddings.index}

for i, hyperparams in enumerate(hyperparams_list):
    print(f'Starting training Model {i+1} / {len(hyperparams_list)}...')

    start_time = time()

    batch_size = hyperparams['batch_size']
    lr = hyperparams['lr']
    # NOTE(review): hyperparams['type'] is never read here and no `weighting`
    # argument is passed to Model, whose forward() always uses uniform weights,
    # so a 'tf_idf' entry would currently train exactly like a 'uniform' one.

    # 1. Setup Data Loader

    # NOTE(review): this replaces the dataset built earlier, and prepare() is
    # not called on the new instance although the earlier cell does call it --
    # confirm whether prepare() is required before iterating.
    dataset = WordTokenDataset(data, accepted_tokens=accepted_tokens)

    # shuffle=False relies on the data having been shuffled beforehand (see the
    # note at the top of the notebook).
    data_loader = DataLoader(dataset=dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             collate_fn=data_utils.collate_samples)

    # 2. Create the Model

    # NOTE(review): `label_types` is not defined in any cell visible in this
    # notebook.
    model = Model(embeddings=embeddings, n_classes=len(label_types))

    # 3. Setup Criterion and Optimizer

    criterion = torch.nn.CrossEntropyLoss()
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # 4. Train the Model

    train_losses = train(model,
                         criterion,
                         optimizer,
                         dataset,
                         data_loader,
                         epochs)
    
    # 5. Calculate Validation Loss
    
    # NOTE(review): valid_sequence, valid_offsets, valid_labels and
    # valid_weights are not defined in the visible cells, and WordTokenDataset
    # is called here with four positional arguments whereas above it is called
    # as WordTokenDataset(data, accepted_tokens=...). A train/validation split
    # seems to be missing from the notebook.
    with torch.no_grad():
        valid_dataset = WordTokenDataset(valid_sequence, valid_offsets, valid_labels, valid_weights)
        valid_samples = valid_dataset.all_samples()

        outputs = model(valid_samples)

        valid_loss = criterion(outputs, valid_labels)
        valid_losses.append(valid_loss)

    end_time = time()

    models.append(model)
    train_losses_list.append(train_losses)

    print(f'Model completed in {(end_time - start_time)/60:.02f}m.')
    print()


Starting training Model 1 / 8...


KeyError: 'weights'

## Check Results

In [None]:
# Split the training results into the two model families. uniform_mask[k] is
# True when the k-th hyperparameter configuration used uniform weighting;
# everything else is treated as tf-idf.
uniform_mask = [config['type'] == 'uniform' for config in hyperparams_list]

uniform_models = [models[k] for k in range(len(models)) if uniform_mask[k]]
uniform_train_losses_list = [train_losses_list[k] for k in range(len(train_losses_list)) if uniform_mask[k]]
uniform_valid_losses = [valid_losses[k].item() for k in range(len(valid_losses)) if uniform_mask[k]]

tf_idf_models = [models[k] for k in range(len(models)) if not uniform_mask[k]]
tf_idf_train_losses_list = [train_losses_list[k] for k in range(len(train_losses_list)) if not uniform_mask[k]]
tf_idf_valid_losses = [valid_losses[k].item() for k in range(len(valid_losses)) if not uniform_mask[k]]


In [None]:
# Training loss of uniform models.

fig, ax = plt.subplots()

for model_number, epoch_losses in enumerate(uniform_train_losses_list, start=1):
    # The per-epoch losses may be 0-dim tensors; convert them to plain floats
    # so plotting does not depend on tensor-to-NumPy conversion.
    ax.plot([float(loss) for loss in epoch_losses], label=f'Model {model_number}')

ax.set_title('Training loss of uniform models')
ax.set_xlabel('Epoch')
ax.set_ylabel('Mean training loss')

# Only draw a legend when at least one curve was plotted.
if uniform_train_losses_list:
    ax.legend()

plt.show()


In [None]:
# Training loss of tf-idf models.

fig, ax = plt.subplots()

for model_number, epoch_losses in enumerate(tf_idf_train_losses_list, start=1):
    # The per-epoch losses may be 0-dim tensors; convert them to plain floats
    # so plotting does not depend on tensor-to-NumPy conversion.
    ax.plot([float(loss) for loss in epoch_losses], label=f'Model {model_number}')

ax.set_title('Training loss of tf-idf models')
ax.set_xlabel('Epoch')
ax.set_ylabel('Mean training loss')

# Only draw a legend when at least one curve was plotted (there may be no
# tf-idf configurations enabled).
if tf_idf_train_losses_list:
    ax.legend()

plt.show()


### Find the Best Models

We will pick the "best" model trained with uniform weights and the "best" model trained with tf-idf weights. In each group, the best model is the one with the lowest validation loss.

In [None]:
# Grab the best models.

def _pick_best(candidates, losses):
    """Return (index, model) for the lowest loss, or (None, None) if there are none."""
    if not losses:
        return None, None
    best_idx = losses.index(min(losses))
    return best_idx, candidates[best_idx]


# Either family may be empty depending on which configurations are enabled in
# hyperparams_list, so an empty list must not raise here.
best_uniform_model_idx, best_uniform_model = _pick_best(uniform_models, uniform_valid_losses)
best_tf_idf_model_idx, best_tf_idf_model = _pick_best(tf_idf_models, tf_idf_valid_losses)

if best_uniform_model is not None:
    print(f'Best Uniform Model: {best_uniform_model_idx+1}')
else:
    print('Best Uniform Model: none trained')

if best_tf_idf_model is not None:
    print(f'Best TF-IDF Model:  {best_tf_idf_model_idx+1}')
else:
    print('Best TF-IDF Model:  none trained')


### Accuracy

In [None]:
# Accuracy of the best uniform model on the validation data.
#
# NOTE(review): this cell looks out of date with the rest of the notebook:
#   - `DocDataset` is not imported in the import cell (which imports
#     WordTokenDataset instead);
#   - valid_sequence, valid_offsets, valid_labels and valid_uniform_weights are
#     not defined in the visible cells;
#   - predict() is called with three arguments, while Model.predict as defined
#     in this notebook takes a single `samples` argument;
#   - samples are indexed by key here but accessed by attribute inside Model.
uniform_dataset = DocDataset(valid_sequence, valid_offsets, valid_labels, valid_uniform_weights)

uniform_samples = uniform_dataset.all_samples()

uniform_predictions = best_uniform_model.predict(uniform_samples['data'], uniform_samples['offset'], uniform_samples['weights'])

# Fraction of samples whose predicted class equals the true label.
total = len(uniform_samples['label'])
correct = torch.sum(uniform_predictions == uniform_samples['label'])

print(f'Accuracy of Uniform Model: {(float(correct) / total)*100:.02f}%.')


In [None]:
# Accuracy of the best tf-idf model on the validation data.
#
# NOTE(review): this cell looks out of date with the rest of the notebook:
#   - `DocDataset` is not imported in the import cell (which imports
#     WordTokenDataset instead);
#   - valid_sequence, valid_offsets, valid_labels and valid_tf_idf_weights are
#     not defined in the visible cells;
#   - predict() is called with three arguments, while Model.predict as defined
#     in this notebook takes a single `samples` argument;
#   - samples are indexed by key here but accessed by attribute inside Model.
tf_idf_dataset = DocDataset(valid_sequence, valid_offsets, valid_labels, valid_tf_idf_weights)

tf_idf_samples = tf_idf_dataset.all_samples()

tf_idf_predictions = best_tf_idf_model.predict(tf_idf_samples['data'], tf_idf_samples['offset'], tf_idf_samples['weights'])

# Fraction of samples whose predicted class equals the true label.
total = len(tf_idf_samples['label'])
correct = torch.sum(tf_idf_predictions == tf_idf_samples['label'])

print(f'Accuracy of TF-IDF Model: {(float(correct) / total)*100:.02f}%.')


### Confusion Matrix

In [None]:
def create_confusion_matrix(labels, predictions):
    """Build a row-normalised confusion matrix.

    Args:
        labels: 1-D tensor of true class labels.
        predictions: 1-D tensor of predicted class labels, aligned with
            ``labels``.

    Returns:
        A ``(confusion_matrix, category_encoder)`` tuple. ``category_encoder``
        maps each distinct value in ``labels`` to its row/column index (in the
        order returned by ``labels.unique()``). ``confusion_matrix[r, c]`` is
        the fraction of samples whose true label is encoded as ``r`` that were
        predicted as the label encoded as ``c``. Predictions whose value never
        occurs in ``labels`` are not counted in any column.
    """
    categories = labels.unique()
    category_encoder = {c.item(): i for i, c in enumerate(categories)}

    n_categories = len(categories)
    confusion_matrix = np.zeros((n_categories, n_categories))

    for true_category, row_idx in category_encoder.items():
        # Predictions made for the samples that truly belong to this category.
        # Every category comes from `labels`, so this selection is never empty.
        cat_preds = predictions[labels == true_category]

        for pred_category, col_idx in category_encoder.items():
            confusion_matrix[row_idx, col_idx] = torch.sum(cat_preds == pred_category).item()

        # Normalise the row by the number of samples with this true label.
        confusion_matrix[row_idx, :] /= len(cat_preds)

    return confusion_matrix, category_encoder


In [None]:
def show_confusion_matrix(confusion_matrix, category_encoder=None, title=None):
    """Render a confusion matrix as a colour-mapped image with a colour bar.

    Args:
        confusion_matrix: 2-D array; rows are true labels, columns are
            predicted labels.
        category_encoder: Optional mapping from original label to row/column
            index. When given, the axis ticks show the original labels instead
            of the encoded indices.
        title: Optional figure title.
    """
    fig, ax = plt.subplots()

    cax = ax.matshow(confusion_matrix)
    fig.colorbar(cax)

    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')

    if category_encoder is not None:
        # Order the original labels by the index they were encoded to.
        tick_labels = sorted(category_encoder, key=category_encoder.get)
        tick_positions = [category_encoder[label] for label in tick_labels]
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(tick_labels)
        ax.set_yticks(tick_positions)
        ax.set_yticklabels(tick_labels)

    if title is not None:
        ax.set_title(title)
    

In [None]:
# Confusion Matrix for Uniform Model
#
# Uses the predictions computed in the uniform accuracy cell above.
# NOTE(review): `valid_labels` is not defined in the visible cells.

uniform_confusion_matrix, category_encoder = create_confusion_matrix(valid_labels, uniform_predictions)
show_confusion_matrix(uniform_confusion_matrix)


In [None]:
# Confusion Matrix for TF-IDF Model
#
# Uses the predictions computed in the tf-idf accuracy cell above. The name
# `category_encoder` is rebound here, replacing any earlier value.
# NOTE(review): `valid_labels` is not defined in the visible cells.

tf_idf_confusion_matrix, category_encoder = create_confusion_matrix(valid_labels, tf_idf_predictions)
show_confusion_matrix(tf_idf_confusion_matrix)

## Persist Models

In [None]:
# torch.save does not create missing directories, so make sure the target
# folder exists before writing.
os.makedirs('./models', exist_ok=True)

# Persist only the model families that were actually trained; either list can
# be empty depending on which configurations are enabled in hyperparams_list.
if uniform_models:
    torch.save(best_uniform_model.state_dict(), './models/uniform_glove_model.torch')

if tf_idf_models:
    torch.save(best_tf_idf_model.state_dict(), './models/tf_idf_model.torch')
