In [11]:
from google.colab import drive
drive.mount('/content/drive')
%cd 'drive/My Drive/projet-info/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive/projet-info


In [1]:
# Download necessary additional libraries
!pip install unidecode
!pip install transformers



In [0]:
from multiprocessing import cpu_count

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

In [0]:
# Number of CPU cores reported by `multiprocessing.cpu_count()`; passed as
# `num_workers` to the DataLoader built in `compute_representations`
N_CORES = cpu_count()

In [0]:
class TokenizeForBERT(Dataset):
    """Convert corpus to tensors of token indices in a BERT model vocabulary.

    Each item is a pair ``(tokens_ids_tensor, attn_mask)`` of 1-D integer
    tensors. A document is wrapped in the tokenizer's CLS and SEP tokens, then
    padded with the tokenizer's padding token, or truncated, to ``maxlen``
    tokens.

    Parameters
    ----------
    corpus : sequence of str
        Documents to tokenize; must support ``len()`` and integer indexing.
    model_name : str
        Name handed to ``AutoTokenizer.from_pretrained``.
    maxlen : int or None
        Number of tokens per returned sequence, special tokens included.
        ``None`` selects 512.
    """
    def __init__(self, corpus, model_name='camembert-base', maxlen=None):
        self.corpus = corpus
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if maxlen is None:
            self.maxlen = 512 # Max doc length allowed by BERT models
        else:
            self.maxlen = maxlen

    def __len__(self):
        """Return the number of documents in the corpus."""
        return len(self.corpus)

    def __getitem__(self, index):
        """Return ``(tokens_ids_tensor, attn_mask)`` for document ``index``.

        ``attn_mask`` holds 1 for every token of the document (CLS and SEP
        included) and 0 for the padding appended by this method.
        """
        # Select instance
        sentence = self.corpus[index]

        cls_token = self.tokenizer.cls_token
        sep_token = self.tokenizer.sep_token
        pad_token = self.tokenizer.pad_token

        # Insert CLS and SEP tokens at beginning and end of sentence
        tokens = [cls_token] + self.tokenizer.tokenize(sentence) + [sep_token]
        if len(tokens) > self.maxlen:
            # Cut the sentence if it is longer than maxlen, keeping a final SEP
            tokens = tokens[:self.maxlen - 1] + [sep_token]

        # Pad up to maxlen with the special padding token
        n_real = len(tokens)
        n_pad = max(self.maxlen - n_real, 0)
        tokens = tokens + [pad_token] * n_pad

        # Convert tokens to tensor of indices in BERT model vocabulary
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(tokens_ids)

        # Build the attention mask from the number of real tokens rather than
        # by comparing ids with the pad id: a pad token that occurs inside the
        # document itself must stay attended
        attn_mask = torch.tensor([1] * n_real + [0] * n_pad, dtype=torch.long)

        return tokens_ids_tensor, attn_mask

In [0]:
def compute_representations(corpus, model_name='camembert-base', batch_size=256,
                            max_tokens=None, gpu=True):
    """Compute representations of employment offers using a BERT model.

    Parameters
    ----------
    corpus : sequence of str
        Documents to embed.
    model_name : str
        Name handed to both the tokenizer and ``AutoModel.from_pretrained``.
    batch_size : int
        Number of documents sent through the model at once.
    max_tokens : int or None
        Sequence length used by ``TokenizeForBERT`` (``None`` selects its
        default).
    gpu : bool
        If True, run the model and the batches on the "cuda" device.

    Returns
    -------
    numpy.ndarray
        The per-batch representations stacked vertically: one row per
        document, each row being the model's output at the first ([CLS])
        position.
    """

    # Tokenize documents as needed for BERT models
    tokenized_corpus = TokenizeForBERT(corpus=corpus, model_name=model_name,
                                       maxlen=max_tokens)
    # Prepare data in batches (for RAM issues)
    corpus_loader = DataLoader(tokenized_corpus, batch_size=batch_size,
                               num_workers=N_CORES)
    # Load BERT model; inference only, so make evaluation mode explicit
    model = AutoModel.from_pretrained(model_name)
    model.eval()

    # Send model to GPU for (much) faster computations
    device = torch.device("cuda" if gpu else "cpu")
    model = model.to(device)

    batches_rep = []
    for seq, attn_masks in corpus_loader:
        seq, attn_masks = seq.to(device), attn_masks.to(device)

        # Compute document representations without constructing the computing
        # graph (only needed for backprop)
        with torch.no_grad():
            outputs = model(seq, attention_mask=attn_masks)
        # Index the first output instead of unpacking a 2-tuple: this works
        # whether the model returns a plain tuple or a dict-like output object
        cont_reps = outputs[0]
        # Get representation of [CLS] head (document representation)
        cls_rep = cont_reps[:, 0]
        # Store representations of current batch as a numpy array
        batches_rep.append(cls_rep.cpu().numpy())

    return np.vstack(batches_rep)

In [6]:
# Download employment offers
!wget https://raw.githubusercontent.com/avouacr/3A-ENSAE-projet-info/master/API_test/df_max_pages.csv

--2020-04-05 17:25:15--  https://raw.githubusercontent.com/avouacr/3A-ENSAE-projet-info/master/API_test/df_max_pages.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2647301 (2.5M) [text/plain]
Saving to: ‘df_max_pages.csv.28’


2020-04-05 17:25:15 (60.8 MB/s) - ‘df_max_pages.csv.28’ saved [2647301/2647301]



In [0]:
# Load the downloaded offers table, then extract the 'description' column
# as an array of texts
df_offers = pd.read_csv('df_max_pages.csv')
descriptions = df_offers.loc[:, 'description'].values

In [8]:
%%time

# Compute BERT representations of employment offers
corpus_representations = compute_representations(corpus=descriptions, 
                                                 model_name='camembert-base', 
                                                 batch_size=256, max_tokens=None,
                                                 gpu=True)

CPU times: user 49.8 s, sys: 32.1 s, total: 1min 21s
Wall time: 1min 23s


In [0]:
# NOTE(review): `compute_representations` loads its own copy of the model and
# no later cell visible here uses `model` — confirm whether this cell is
# still needed
model = AutoModel.from_pretrained('camembert-base')

In [0]:
# Sanity checks on the embedding matrix: one row per offer description,
# and 768 columns
assert len(descriptions) == corpus_representations.shape[0]
assert 768 == corpus_representations.shape[1]

In [0]:
# Write the embedding matrix to disk as plain text
np.savetxt(fname='camembert_representations.txt', X=corpus_representations)