In [1]:
import torch
import matplotlib.pyplot as plt
from transformers import RobertaForMaskedLM, AutoTokenizer
from os import listdir
from scipy.io import savemat, loadmat
import numpy as np
import process_textGrid as tg


  from .autonotebook import tqdm as notebook_tqdm


In [2]:


#load RoBERTa model
modelname = 'roberta-base' 

roberta= RobertaForMaskedLM.from_pretrained(modelname)
tokenizer= AutoTokenizer.from_pretrained(modelname)
roberta.eval()




RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [141]:
tokenizer.mask_token_id

50264

In [4]:
#%% main processing function, get predicted probabilities for each word
#compare the predicted vector for final token at the end of the sentence, with actual encoded vector
# sentences: list of sentences. each sentence should be a list of words
# sliding: if 0, attempts to use every previous word as context for word i. if not, use only the indicated number of previous words
def processSentence(sentences,sliding=0):
    #print(sentence)
    word_probs = []
    b=np.repeat(a,2)
    tokens = torch.tensor(tokenizer.encode(sentences)).unsqueeze(0) # get tokens from CamemBERT
    print(tokens.size())

In [148]:
word_markings

array([None, None, None, None, None, None, None, None, None, None, None,
       None, None, None], dtype=object)

In [3]:
def mask_array(sentence):
    mask = "<mask>"
    rWords = range(len(sentence))
    a=[np.array(sentence) for i in rWords]
    b=[ [a[i][j] if j<i else "" for j in rWords ] for i in rWords]
    square =  [[mask if j==i else b[i][j] for j in rWords ] for i in rWords]
    masked_array = [" ".join(square[i]).strip() for i in rWords]
    return(masked_array)


In [373]:
a=mask_array(text_markings[0:10])
tokens = tokenizer(a,padding=True)['input_ids']
tokens_tensor=torch.tensor(tokens)

In [374]:
a


['<mask>',
 'increase <mask>',
 'increase its <mask>',
 'increase its size <mask>',
 'increase its size fivefold <mask>',
 'increase its size fivefold or <mask>',
 'increase its size fivefold or tenfold <mask>',
 'increase its size fivefold or tenfold give <mask>',
 'increase its size fivefold or tenfold give it <mask>',
 'increase its size fivefold or tenfold give it strength <mask>']

In [4]:
#process_textGrid.py
textGrid_folder = "/home/perezoso/Dropbox/projects/jaco/Project ECOSud/JulesVerne/jv-pln/data/wav/revised/"
files = listdir(textGrid_folder)


filenames = np.array(list((filter(lambda x: x.endswith(".TextGrid") , files))),dtype=object)
phonemes = np.empty(filenames.shape,dtype="object")
words = np.empty(filenames.shape,dtype="object")
word_markings=np.empty(filenames.shape,dtype="object")
words=np.empty(filenames.shape,dtype="object")
phon_markings=np.empty(filenames.shape,dtype="object")
nWords=np.empty(filenames.shape,dtype="object")


sliding=10 #sliding parameter. if 0, will use ALL previous context... don't think it can handle more than 100... see CamemBERT documentation for max sentence size...


In [5]:
for i,file in enumerate(filenames):
    all_markings = tg.parseTextGrid(textGrid_folder+file)
    xmin,xmax,text_markings = zip(*all_markings[0]) #[0] text markings, [1] phoneme markings
    #probs = processSentence(text_markings,sliding)
    xminf = [float(x) for x in xmin]
    xmaxf = [float(x) for x in xmax]
    #probsf= [float(x) for x in probs]
    word_markings[i]=np.array([xminf,xmaxf,np.repeat(0,len(xminf))]).transpose()
    words[i]=np.array(text_markings)
    nWords[i] = len(text_markings)
    xmin,xmax,p_markings = zip(*all_markings[1]) #[0] text markings, [1] phoneme markings
    xminf = [float(x) for x in xmin]
    xmaxf = [float(x) for x in xmax]
    phonemes[i]=p_markings
    phon_markings[i]=np.array([xminf,xmaxf]).transpose()

In [272]:
words

array([array(['certainly', 'might', 'possess', 'such', 'a', 'destructive',
              'machine', 'and', 'in', 'these', 'disastrous', 'times', 'when',
              'the', 'ingenuity', 'of', 'man', 'has', 'multiplied', 'the',
              'power', 'of', 'weapons', 'of', 'war', 'it', 'was', 'possible',
              'that', 'without', 'the', 'knowledge', 'of', 'others', 'a',
              'state', 'to', 'work', 'such', 'a', 'formidable', 'engine', 'the',
              'idea', 'of', 'a', 'war', 'machine', 'fell', 'before', 'the',
              'declaration', 'of', 'governments', 'as', 'public', 'interest',
              'was', 'in', 'question', 'and', 'transatlantic', 'communications',
              'their', 'veracity', 'could', 'not', 'be', 'doubted', 'but', 'how',
              'admit', 'that', 'the', 'construction', 'of', 'this', 'submarine',
              'boat', 'had', 'escaped', 'the', 'public', 'eye', 'for', 'a',
              'private', 'gentleman', 'to', 'keep', 'the', 'secre

In [54]:
whole_text=[w for text in words for w in text]

In [61]:
words = [["the","dog","is","a","man's","best","friend"],["but","the","cat","is","a","woman's","."]]
sliding=7

In [62]:
nWindows = max(len(whole_text)-sliding + 1,1)

In [63]:
batch = np.empty((nWindows))
for i in range(nWindows):
    i=0
    window_text = whole_text[i:(i+sliding)]
    window_text_joined = " ".join(window_text)
    masked_window = mask_array(window_text)
    

In [64]:
masked_window

['<mask>',
 'the <mask>',
 'the dog <mask>',
 'the dog is <mask>',
 'the dog is a <mask>',
 "the dog is a man's <mask>",
 "the dog is a man's best <mask>"]

In [85]:
window_text_joined

"the dog is a man's best friend"

In [86]:
window_text


['the', 'dog', 'is', 'a', "man's", 'best', 'friend']

In [9]:
tokenizer.decode(target_tokens[mask_index[0]])

NameError: name 'target_tokens' is not defined

In [10]:
def getRobertaProbabilities(batch,truth):
    #tokenize the masked batch
    tokens = torch.tensor(tokenizer(batch,padding=True)['input_ids'])
    #tokenize the ground truth
    target_tokens = torch.tensor(tokenizer(" ".join(truth))['input_ids'])
    
    #where in each token batch is each mask?
    row,mask_index = (tokens == tokenizer.mask_token_id).nonzero(as_tuple=True)
    #get list of ground truth token values for each mask position
    target_tokens_in_mask = target_tokens[mask_index]
    
    #predict tokens
    predicted_tokens = roberta(tokens)[0]
    #keep only the prediction at mask
    predicted_vectors = predicted_tokens[row,mask_index,:]
    #apply softmax
    probs = predicted_vectors.softmax(dim=1)
    maxProb = torch.max(probs)
    probs = probs[:,target_tokens_in_mask]
    probs_norm = probs/maxProb
    return(probs)

In [94]:
#tokenize the masked batch
tokens = torch.tensor(tokenizer(batch,padding=True)['input_ids'])
#tokenize the ground truth
target_tokens = torch.tensor(tokenizer(truth)['input_ids'])

#where in each token batch is each mask?
row,mask_index = (tokens == tokenizer.mask_token_id).nonzero(as_tuple=True)
#get list of ground truth token values for each mask position
target_tokens_in_mask = target_tokens[mask_index]

#predict tokens
predicted_tokens = roberta(tokens)[0]
#keep only the prediction at mask
predicted_vectors = predicted_tokens[row,mask_index,:]
#apply softmax
probs = predicted_vectors.softmax(dim=0)
maxProb = torch.max(probs)
probs_targets = [ probs[i,j] for i,j in zip(row,target_tokens_in_mask)]
probs_norm = probs_targets/maxProb
return(probs_targets,probs_norm)


In [113]:
torch.max(probs,)


tensor(1., grad_fn=<MaxBackward1>)

In [126]:
batch = ["he dog is a man's best <mask>"]
truth = ["he dog is a man's best friend"]
tokens = torch.tensor(tokenizer(batch)['input_ids'])
#tokenize the ground truth
target_tokens = torch.tensor(tokenizer(truth)['input_ids'])

#where in each token batch is each mask?
row,mask_index = (tokens == tokenizer.mask_token_id).nonzero(as_tuple=True)
#get list of ground truth token values for each mask position
target_tokens_in_mask = target_tokens[0,mask_index]

#predict tokens
predicted_tokens = roberta(tokens)[0]
#keep only the prediction at mask
predicted_vectors = predicted_tokens[row,mask_index,:]
#apply softmax
probs = predicted_vectors.softmax(dim=1)
maxProb = torch.max(probs)
probs_targets = torch.tensor([ probs[i,j] for i,j in zip(row,target_tokens_in_mask)])
probs_norm = probs_targets/maxProb

TypeError: unsupported operand type(s) for /: 'list' and 'Tensor'

In [168]:
probs_targets = torch.tensor([ probs[i,j] for i,j in zip(row,target_tokens_in_mask)])


In [176]:
maxProb = torch.max(probs)
probs_targets = torch.tensor([ probs[i,j] for i,j in zip(row,target_tokens_in_mask)])
probs_norm = probs_targets/maxProb

In [178]:
probs_norm.item()

1.0

In [172]:
maxProb

tensor(1., grad_fn=<MaxBackward1>)

In [105]:
target_tokens_in_mask

tensor([ 627, 2335,   16,   10,  313,  275, 1441])

In [106]:
tokens


tensor([[    0, 50264,     2,     1,     1,     1,     1,     1,     1,     1],
        [    0,   627, 50264,     2,     1,     1,     1,     1,     1,     1],
        [    0,   627,  2335, 50264,     2,     1,     1,     1,     1,     1],
        [    0,   627,  2335,    16, 50264,     2,     1,     1,     1,     1],
        [    0,   627,  2335,    16,    10, 50264,     2,     1,     1,     1],
        [    0,   627,  2335,    16,    10,   313,    18, 50264,     2,     1],
        [    0,   627,  2335,    16,    10,   313,    18,   275, 50264,     2]])

In [109]:
batch


['<mask>',
 'the <mask>',
 'the dog <mask>',
 'the dog is <mask>',
 'the dog is a <mask>',
 "the dog is a man's <mask>",
 "the dog is a man's best <mask>"]

In [363]:
probs[row,]

tensor([1.4337e-08, 8.0403e-05, 7.5938e-08, 1.1414e-03, 1.0345e-04, 7.1672e-03,
        8.8760e-04, 1.0130e-02, 1.9647e-03, 5.3256e-04, 4.6949e-04, 5.3233e-03,
        6.3513e-04, 1.9747e-02, 1.8745e-04, 1.6798e-04, 2.7663e-05, 4.7144e-04,
        5.5107e-03, 8.2309e-06, 2.3819e-03, 5.1805e-04, 8.8909e-06, 3.0148e-05,
        2.3770e-05, 2.3330e-03, 1.1841e-06, 3.2732e-05, 7.9461e-06, 4.5775e-02,
        2.4344e-04, 2.6501e-05, 6.2112e-06, 5.4425e-06, 2.6775e-05, 1.2970e-05,
        1.4905e-04, 3.5645e-04, 4.5467e-06, 1.1616e-05, 2.9712e-06, 1.3558e-05,
        2.2583e-08, 3.7958e-04, 6.6369e-05, 5.7872e-07, 1.7677e-07, 2.2536e-06,
        1.6873e-06, 1.4904e-05], grad_fn=<IndexBackward0>)

In [213]:
torch.max(predicted_vectors.softmax(dim=1))

tensor(0.3129, grad_fn=<MaxBackward1>)

In [235]:
a[7]


'l'

In [21]:
' '.join(text_markings)

'certainly might possess such a destructive machine and in these disastrous times when the ingenuity of man has multiplied the power of weapons of war it was possible that without the knowledge of others a state to work such a formidable engine the idea of a war machine fell before the declaration of governments as public interest was in question and transatlantic communications their veracity could not be doubted but how admit that the construction of this submarine boat had escaped the public eye for a private gentleman to keep the secret under such circumstances would be very difficult and for a state whose every act is persistently watched by powerful rivals certainly impossible upon my arrival in new york several persons did me the honor of consulting the phenomenon in question i had published in france a work in two volumes entitled mysteries of the great submarine grounds'

In [14]:
words

array([None, None, None, None, None, None, None, None, None, None, None,
       None, None, None], dtype=object)