In [2]:
import torch
import json
import pandas as pd
from toolz import curry
import numpy as np

from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

# until I properly package this library, need to allow for relative imports
import os
import sys
# Put the parent of the current working directory on sys.path (once), so that
# packages living next to this notebook's folder can be imported before they
# are properly installed.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from bertron.train import input_dict, evaluate

# Random seed; passed as `random_state` to train_test_split further down.
seed = 666

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## load model and checkpoint

In [110]:
# Label mapping written by the preprocessing step; its size fixes the number
# of output classes of the classification head.
with open('../data/processed/labels.json', 'r') as f_in:
    label_dict = json.load(f_in)

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False
)
model.to(device)

# Load the weights fine-tuned on the Enron corpus.
# `map_location=device` remaps the stored tensors onto whatever device this
# session uses, so a checkpoint saved from a GPU run still loads on a
# CPU-only machine (the fallback chosen when CUDA is unavailable).
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source.
checkpoint = torch.load(
    '../bertron/models/finetuned_BERT_epoch_1.model',
    map_location=device,
)
model.load_state_dict(checkpoint)
# Switch to inference mode (disables dropout); as the last expression of the
# cell this also displays the model architecture.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

### Until there is a proper pipeline, regenerate the tokenizer and dataloaders manually

In [4]:
# Tokenizer for the lower-cased `bert-base-uncased` vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

max_len = 512

# Encoding options shared by every call: each text is truncated/padded to
# exactly `max_len` tokens and returned as PyTorch tensors together with its
# attention mask.
encoding_options = dict(
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    max_length=max_len,
    truncation=True,
    return_tensors='pt',
)

# Pre-bind the options so callers only need to pass the batch of texts.
tokenizer_cur = curry(tokenizer.batch_encode_plus)(**encoding_options)

In [5]:
# Work on a 5% subsample of the processed corpus to keep the demo fast.
# `random_state=seed` (seed is defined in the setup cell at the top of the
# notebook) makes the subsample -- and therefore the rows that end up in the
# "test" split below -- identical on every Restart & Run All.
X = pd.read_parquet('../data/processed/X.parquet').sample(frac=.05, random_state=seed)

# Stratified 85/15 split on the label column. The split is performed on the
# index values so the result can be written back onto X as a marker column.
X_train, X_test, y_train, y_test = train_test_split(
    X.index.values,
    X['label'].values,
    test_size=0.15,
    random_state=seed,
    stratify=X['label'].values
)

# Start from an object-dtype column: filling a float NaN column with strings
# is an incompatible-dtype assignment that newer pandas versions deprecate.
X['data_type'] = None
X.loc[X_train, 'data_type'] = 'train'
X.loc[X_test, 'data_type'] = 'test'

In [6]:
# Select the held-out rows and tokenize their email bodies.
mask_test = X['data_type'] == 'test'
test_rows = X[mask_test]
encoded_data_test = tokenizer_cur(test_rows['body'].values)

batch_size = 4

# Bundle token ids, attention masks and labels into one dataset, row-aligned
# with `test_rows`.
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(test_rows['label'].values)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

# Sequential sampling keeps batches in the same order as the rows of
# X[mask_test], so batch k can be matched back to the DataFrame.
sequential_sampler = SequentialSampler(dataset_test)
dataloader_test = DataLoader(
    dataset_test,
    batch_size=batch_size,
    sampler=sequential_sampler,
)

In [22]:
import torch.utils.data as data_utils

# First (up to) ten examples of the test set.
# Subset indexes into the object it wraps, so it must be given the Dataset:
# a DataLoader is an iterable of batches and does not support indexing.
# The min() guard keeps the indices valid when the test set is very small.
indices = list(range(min(10, len(dataset_test))))
tr_10 = data_utils.Subset(dataset_test, indices)

## sample predictions

In [150]:
loss_test_total = 0
predictions, true_tests = [], []

# Score just the first batch as a quick sanity check. Breaking at the end of
# the first iteration avoids pulling a second batch from the loader only to
# discard it.
for batch in dataloader_test:
    batch = tuple(b.to(device) for b in batch)
    # input_dict comes from bertron.train; the result is unpacked as keyword
    # arguments for the model and is read back below under the 'labels' key.
    inputs = input_dict(batch)

    print('making output')
    with torch.no_grad():  # inference only: no gradients needed
        outputs = model(**inputs)

    # The first two model outputs are used as the loss and the logits.
    loss, logits = outputs[0], outputs[1]
    loss_test_total += loss.item()

    # Move results to the CPU as numpy arrays for inspection in later cells.
    logits = logits.detach().cpu().numpy()
    label_ids = inputs['labels'].cpu().numpy()
    predictions.append(logits)
    true_tests.append(label_ids)
    print('finished one batch')
    break

starting
making output
finished one batch


In [143]:
# Show full email bodies instead of truncated cells.
pd.set_option('display.max_colwidth', None)
# First four test rows: with batch_size = 4 and sequential sampling these are
# the rows that make up the first batch of dataloader_test.
X.loc[mask_test].head(4)

Unnamed: 0,author,recip_primary,subject,body,label,data_type
31085,chris.germany,marde.driscoll,Re: Question,"I luv trucks. I was injured on Sunday, I use crutches now.\n\n\nTo: Chris Germany/HOU/ECT@ECT\ncc: \nSubject: Question\n\nHey, I'm going to have to start car shopping. I'm really thinking about a \ntruck. Are you interested in helping me look? I'm asking because I know how \nmuch you like to look at trucks! md",2,test
62600,john.lavorato,stanley.horton,RE:,Stan\n\nI met with El Paso yesterday. They seem very interested.\n\nThanks,30,test
77954,kay.mann,jeffrey.hodge,Gas supply for VEPCO project,"Hi Jeff,\n\nOzzie Pagan's team is working on a project for VEPCO. The power plant will \nbe located in Edgecombe County, North Carolina. I've been told that getting \ngas for the plant is a significant issue due to local gas supply conditions. \nOne alternative has been to buy gas from the City of Rocky Mount, which has a \ntake or pay contract with the local supplier, and is not using all of their \ngas, especially in the summer. \n\nWe have developed a preliminary draft of the LOI, which I am attaching for \nyour review. I would appreciate the opportunity to discuss it with you to \nget your input, etc. Of particular concern is that Rocky Mount wants ENA to \ndeal exclusively with them during the term of the LOI (this isn't reflected \nin the draft).\n\nThanks,\n\nKay",0,test
88012,debra.perlingiere,tammi.depaolis,Re: Lunch,"Thanks Monday is good for me.\n\n\nDebra Perlingiere\nEnron North America Corp.\nLegal Department\n1400 Smith Street, EB 3885\nHouston, Texas 77002\ndperlin@enron.com\nPhone 713-853-7658\nFax 713-646-3490",10,test


In [139]:
# True label ids and predicted label ids (arg-max over the logits of each
# row) for the single batch collected in `true_tests` / `predictions`.
true_pred = true_tests[0]
author_pred = list(predictions[0].argmax(axis=1))

# Invert the label mapping (label -> id becomes id -> label) to print names.
label_dict_inverse = dict((idx, name) for name, idx in label_dict.items())

true_names = [label_dict_inverse[x] for x in true_pred]
predicted_names = [label_dict_inverse[x] for x in author_pred]
print(f'true authors     : {true_names}')
print(f'predicted authors: {predicted_names}')

true authors     : ['chris.germany', 'john.lavorato', 'kay.mann', 'debra.perlingiere']
predicted authors: ['chris.germany', 'chris.germany', 'kay.mann', 'debra.perlingiere']


In [None]:
# so much more work to do to make this better...