# Text preprocessing

## Google's BERT

In [1]:
import pandas as pd
import pickle
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
from tqdm import tqdm_notebook as tqdm

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
# Load the pretrained uncased BERT-base checkpoint and its tokenizer.
# The model is moved to the GPU and switched to evaluation mode.
BERT_WEIGHTS = 'bert-base-uncased'
cuda = torch.device('cuda')

tokenizer = BertTokenizer.from_pretrained(BERT_WEIGHTS)
bert = BertModel.from_pretrained(BERT_WEIGHTS).to(cuda).eval()

In [3]:
# Load the per-movie metadata dictionary.
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
# The context manager closes the file handle as soon as loading finishes.
with open('../data/infos.p', 'rb') as f:
    infos = pickle.load(f)

In [4]:
# Display the record stored under key 1 to see which fields each entry has.
infos[1]

{'adult': False,
 'collection': 10194,
 'genres': [16, 35, 10751],
 'original_language': 'en',
 'overview': "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
 'popularity': 25.398,
 'production_companies': [3],
 'production_countries': ['us'],
 'release_year': 1995,
 'release_month': 10,
 'revenue': 373554033,
 'runtime': 81,
 'spoken_languages': ['en'],
 'tagline': '',
 'title': 'Toy Story',
 'vote_average': 7.9,
 'vote_count': 9787}

In [5]:
# Turn each movie's text (overview, tagline, title) into a tensor of BERT
# WordPiece ids, keyed by the same key as in `infos`.
MAX_CHARS = 512    # character budget for the raw text
MAX_TOKENS = 512   # BERT-base cannot embed more than 512 positions

infos_tensor = {}
for k, v in tqdm(infos.items()):
    # Join with spaces so the last word of one field does not fuse with the
    # first word of the next; empty fields are skipped.
    fields = (v['overview'], v['tagline'], v['title'])
    text = ' '.join(part for part in fields if part)[:MAX_CHARS]

    # Reserve two positions for the special tokens added below.
    tokens = tokenizer.tokenize(text)[:MAX_TOKENS - 2]
    # BERT's pooled output is computed from the first position, which the model
    # expects to hold [CLS]; [SEP] closes the single segment.
    tokens = ['[CLS]'] + tokens + ['[SEP]']

    ids = tokenizer.convert_tokens_to_ids(tokens)
    infos_tensor[k] = torch.tensor(ids, dtype=torch.long).to(cuda)

HBox(children=(IntProgress(value=0, max=27278), HTML(value='')))




In [6]:
from collections import OrderedDict

# Rebuild the mapping in ascending order of tensor size (shortest first).
infos_sorted = OrderedDict()
for movie_id in sorted(infos_tensor, key=lambda movie_id: infos_tensor[movie_id].size()):
    infos_sorted[movie_id] = infos_tensor[movie_id]

In [7]:
# Encode every token sequence with BERT and keep the pooled vector per movie.
infos_bert = {}
batch_size = 10

# Walk from the longest sequence to the shortest WITHOUT consuming
# `infos_sorted`, so this cell can be re-run. Starting with the longest batch
# also makes an out-of-memory error show up immediately.
pending = list(infos_sorted.items())[::-1]

with torch.no_grad():  # inference only: do not build the autograd graph
    for start in tqdm(range(0, len(pending), batch_size)):
        # Slicing naturally yields the last, shorter batch as well, so no
        # movie is left out when the total is not a multiple of batch_size.
        chunk = pending[start:start + batch_size]
        seq_lengths = [len(seq) for _, seq in chunk]
        # Keep at least one column so a batch of empty sequences is still valid.
        width = max(max(seq_lengths), 1)

        # Right-pad with 0 ([PAD] in the BERT vocabulary) and record which
        # positions hold real tokens.
        seq_tensor = torch.zeros((len(chunk), width), dtype=torch.long, device=cuda)
        attention_mask = torch.zeros_like(seq_tensor)
        for row, ((_, seq), seqlen) in enumerate(zip(chunk, seq_lengths)):
            seq_tensor[row, :seqlen] = seq.to(cuda).long()
            attention_mask[row, :seqlen] = 1

        # Without the mask the model would attend to the padding positions.
        _, output = bert(seq_tensor, attention_mask=attention_mask)

        output = output.cpu()
        for (movie_id, _), vector in zip(chunk, output):
            infos_bert[movie_id] = vector
    

HBox(children=(IntProgress(value=0, max=27278), HTML(value='')))






In [14]:
# Persist the BERT vectors as NumPy arrays rather than torch tensors.
texts_bert = {movie_id: vector.numpy() for movie_id, vector in infos_bert.items()}

# The context manager flushes and closes the file even if pickling fails.
with open('../data/texts_bert.p', 'wb') as f:
    pickle.dump(texts_bert, f)

## GPT-2

In [1]:
import pandas as pd
import pickle
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model
import torch
from tqdm import tqdm_notebook as tqdm

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
# Load the pretrained GPT-2 checkpoint and its tokenizer.
# The model is moved to the GPU and switched to evaluation mode.
GPT2_WEIGHTS = 'gpt2'
cuda = torch.device('cuda')

tokenizer = GPT2Tokenizer.from_pretrained(GPT2_WEIGHTS)
gpt2 = GPT2Model.from_pretrained(GPT2_WEIGHTS).to(cuda).eval()

In [3]:
# Load the per-movie metadata dictionary.
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
# The context manager closes the file handle as soon as loading finishes.
with open('../data/infos.p', 'rb') as f:
    infos = pickle.load(f)

In [4]:
# Turn each movie's text (overview, tagline, title) into a tensor of GPT-2
# BPE ids, keyed by the same key as in `infos`.
MAX_CHARS = 512     # character budget for the raw text
MAX_TOKENS = 1024   # GPT-2 cannot embed more than 1024 positions

infos_tensor = {}
for k, v in tqdm(infos.items()):
    # Join with spaces so the last word of one field does not fuse with the
    # first word of the next; empty fields are skipped.
    fields = (v['overview'], v['tagline'], v['title'])
    text = ' '.join(part for part in fields if part)[:MAX_CHARS]

    # Byte-level BPE can emit several tokens per character, so also cap the
    # number of ids at the model's position limit.
    ids = tokenizer.encode(text)[:MAX_TOKENS]
    infos_tensor[k] = torch.tensor(ids, dtype=torch.long).to(cuda)

HBox(children=(IntProgress(value=0, max=27278), HTML(value='')))




In [5]:
from collections import OrderedDict

# Rebuild the mapping in ascending order of tensor size (shortest first).
infos_sorted = OrderedDict()
for movie_id in sorted(infos_tensor, key=lambda movie_id: infos_tensor[movie_id].size()):
    infos_sorted[movie_id] = infos_tensor[movie_id]

In [6]:
# Encode every token sequence with GPT-2 and keep, per movie, the hidden state
# of its last real token.
infos_gpt2 = {}
batch_size = 5

# Walk from the longest sequence to the shortest WITHOUT consuming
# `infos_sorted`, so this cell can be re-run. Starting with the longest batch
# also makes an out-of-memory error show up immediately.
pending = list(infos_sorted.items())[::-1]

with torch.no_grad():  # inference only: do not build the autograd graph
    for start in tqdm(range(0, len(pending), batch_size)):
        # Slicing naturally yields the last, shorter batch as well, so no
        # movie is left out when the total is not a multiple of batch_size.
        chunk = pending[start:start + batch_size]
        seq_lengths = torch.tensor([len(seq) for _, seq in chunk],
                                   dtype=torch.long, device=cuda)
        # Keep at least one column so a batch of empty sequences is still valid.
        width = max(int(seq_lengths.max()), 1)

        # Right-pad every sequence with zeros up to the batch width.
        seq_tensor = torch.zeros((len(chunk), width), dtype=torch.long, device=cuda)
        for row, (_, seq) in enumerate(chunk):
            seq_tensor[row, :len(seq)] = seq.to(cuda).long()

        output, _ = gpt2(seq_tensor)

        # Read each row at its own last real token. `output[:, -1]` would return
        # the state of a padding token for every sequence shorter than the
        # batch width. GPT-2 attends left-to-right only, so the right padding
        # does not influence the selected positions.
        last_token = (seq_lengths - 1).clamp(min=0)
        rows = torch.arange(len(chunk), device=cuda)
        output = output[rows, last_token].cpu()

        for (movie_id, _), vector in zip(chunk, output):
            infos_gpt2[movie_id] = vector

HBox(children=(IntProgress(value=0, max=27278), HTML(value='')))






In [7]:
# Persist the GPT-2 vectors as NumPy arrays rather than torch tensors.
texts_gpt2 = {movie_id: vector.numpy() for movie_id, vector in infos_gpt2.items()}

# The context manager flushes and closes the file even if pickling fails.
with open('../data/texts_gpt2.p', 'wb') as f:
    pickle.dump(texts_gpt2, f)