In [1]:
import torch
import numpy as np
import pandas as pd
import sys
from mangoes.modeling import PretrainedTransformerModelForFeatureExtraction as PreTrained

In [2]:
# Load the pretrained "bert-base-uncased" feature extractor through mangoes.
# (The two identical arguments are presumably the model name and the tokenizer name -- TODO confirm.)
bert = PreTrained.load("bert-base-uncased", "bert-base-uncased")
# The model loaded here is BERT; the previous message wrongly announced an ELMo embedder.
print("[INFO] successfully constructed BERT embedder")

[INFO] successfully constructed ELMo embedder


In [3]:
# Report the first CUDA device when one is available. On a CPU-only machine
# torch.cuda.get_device_name(0) raises, which would stop "Run All" at this cell,
# so fall back to a plain label instead.
device_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU (no CUDA device available)'
print('Device : ',device_name)

Device :  GeForce GTX 1060 6GB


In [4]:
# Example sentence used by the exploratory cells below; it contains "article" twice.
text = "I read an article last week : turns out it was a fake article !"

In [5]:
# Run the example sentence through the model, asking for the hidden states of
# every layer (attentions are not requested).
outputs = bert.generate_outputs(
    text,
    word_embeddings=True,
    output_attentions=False,
    output_hidden_states=True,
)

In [6]:
# List which entries the model returned for the example sentence.
outputs.keys()

dict_keys(['last_hidden_state', 'pooler_output', 'hidden_states', 'offset_mappings'])

In [7]:
# Shape of the last entry of 'hidden_states' (recorded output: 1 x 15 x 768;
# the example sentence has 15 whitespace-separated words).
outputs['hidden_states'][-1].shape

torch.Size([1, 15, 768])

In [8]:
# Average, over the last four entries of 'hidden_states', the vector found at
# position `ind` of the (single) sequence in the batch.
ind = 0
tensors = [layer[0, ind] for layer in outputs['hidden_states'][-4:]]
avg = torch.stack(tensors).mean(dim=0)

In [9]:
def _align_with_locations(sentences, locList, noun):
    '''
    Greedily match sentences, in order, against successive entries of locList.

    A sentence is kept (and the next location entry becomes current) when every
    index of the current entry points at `noun` in the whitespace-split sentence.
    Otherwise the sentence is dropped and the current entry is tried again with
    the next sentence. Sentences left over once every entry has been matched are
    dropped as well.

    Returns (kept_sentences, number_of_dropped_sentences).
    '''
    keep = []
    errors = 0
    i = 0
    for sent in sentences:
        words = sent.split()  # split once per sentence instead of once per index
        matches = i < len(locList) and all(
            n < len(words) and words[n] == noun for n in locList[i]
        )
        if matches:
            keep.append(sent)
            i += 1
        else:
            errors += 1
    return keep, errors


def recleaner(noun):
    '''
    For a given noun, clean its 'not fake' file so that it lines up with the locations file.

    Reads 'cleaned_data/not_fake_<noun>.txt' (one sentence per line) and
    'locationsFiles/loc_not_fake_<noun>.txt' (one line per sentence, word indices
    separated by '|'). Sentences that do not carry `noun` at the indices of the
    current location line are discarded, and the survivors are written to
    'cleaned_data/not_fake_<noun>_recleaned.txt'.

    Raises ValueError when, after cleaning, the number of kept sentences still
    differs from the number of location lines (no file is written in that case).
    '''
    print('Start for {}'.format(noun))
    with open('cleaned_data/not_fake_'+noun+'.txt','r',encoding='utf-8') as dataTxt , \
         open('locationsFiles/loc_not_fake_'+noun+'.txt','r',encoding='utf-8') as locTxt :
        data = dataTxt.read().split('\n')
        locations = locTxt.read().split('\n')
    # Drop the '' produced by a trailing newline -- but only if it is really there,
    # so a file without a final newline does not lose its last line.
    for lines in (data, locations):
        if lines and lines[-1] == '':
            lines.pop()
    # Each location line may hold several occurrences for the same sentence.
    locList = [[int(x) for x in loc.split('|')] for loc in locations]
    #cleaning
    keep, errors = _align_with_locations(data, locList, noun)
    #Put in file
    if len(locList) != len(keep):
        raise ValueError('Unknow error for '+noun)
    with open('cleaned_data/not_fake_'+noun+'_recleaned.txt','w',encoding='utf-8') as dataTxt:
        dataTxt.writelines(sentence+'\n' for sentence in keep)
    print('Finished for {} with {} deleted sentences'.format(noun,str(errors)))

In [14]:
def _read_sentences_and_locations(dataPath, locPath):
    """Read a sentence file and its locations file.

    Returns (sentences, locList): `sentences` holds one string per line of
    dataPath, `locList` one list of ints per line of locPath ('|'-separated).
    """
    with open(dataPath,'r',encoding='utf-8') as dataTxt , \
         open(locPath,'r',encoding='utf-8') as locTxt :
        sentences = dataTxt.read().split('\n')
        locations = locTxt.read().split('\n')
    # Drop the '' left by a trailing newline, but never a real last line.
    for lines in (sentences, locations):
        if lines and lines[-1] == '':
            lines.pop()
    # Several occurrences may be listed for the same sentence.
    locList = [[int(x) for x in loc.split('|')] for loc in locations]
    return sentences, locList


def _last_four_hidden_states(sent):
    """Return the last four 'hidden_states' entries for `sent`, or None when the
    sentence is skipped (more than 512 whitespace words) or the model raises
    RuntimeError."""
    if len(sent.split())>512: #too long sequence
        return None
    try:
        return bert.generate_outputs(sent, output_hidden_states=True, output_attentions=False, word_embeddings=True)['hidden_states'][-4:]
    except RuntimeError:
        # Best effort: the caller counts the occurrences of this sentence as lost.
        return None


def _word_embedding(hidden_states, n, sent, loc):
    """Average position `n` over the given hidden states and return a numpy array.

    Raises IndexError (after printing the offending sentence) when `n` lies
    beyond the sequence length of the hidden states.
    """
    if n>=hidden_states[-1].shape[1]:
        print('[ALERT ERROR] with')
        print(sent)
        print(loc,n)
        print(hidden_states[-1].shape)
        raise IndexError('Something wrong happened')
    return torch.mean(torch.stack([layer[0,n] for layer in hidden_states]),dim=0).numpy()


def embeddings(noun):
    """Build and save embeddings for `noun` in 'fake' and 'not fake' sentences.

    Fake part: every sentence of 'cleaned_data/fake_<noun>.txt' is embedded and,
    for each index listed in 'locationsFiles/loc_fake_<noun>.txt', the mean of the
    last four hidden states at that index is stored. The result is written to both
    'BERTfiles/FAKE<noun>FAKE.npy' and 'BERTfiles/<noun>FAKE.npy'.

    Not-fake part: a random 10% sample (without repetition) of
    'cleaned_data/not_fake_<noun>_recleaned.txt' is embedded the same way, using
    'locationsFiles/loc_not_fake_<noun>.txt', and saved to 'BERTfiles/<noun>.npy'.

    Sentences that are skipped or make the model raise RuntimeError are counted
    as "lost embeddings" (one per listed occurrence). The function prints an
    error and returns early when a sentence file and its locations file have
    different lengths. Raises IndexError when a listed index is out of range.
    The sample is drawn from numpy's global RNG; no seed is set here.
    """
    #============================Embeddings of FAKE and (fake) NOUN ============================
    print("[START] starting work on " + noun + "...")
    data, locList = _read_sentences_and_locations(
        'cleaned_data/fake_'+noun+'.txt',
        'locationsFiles/loc_fake_'+noun+'.txt')
    if len(locList)!=len(data):
        print("[ERROR] non matching files for (fake)" + noun + "...")
        return
    print("[FLW] data for (fake) "+noun+" successfully prepared. ")
    #Embeddings creation -------------------------------------------------------------------
    fakes = []
    nouns = []
    N = len(data)
    fails = 0

    for i, (sent, loc) in enumerate(zip(data, locList)):
        hidden_states = _last_four_hidden_states(sent)
        if hidden_states is None:
            fails+=len(loc)
        else:
            for n in loc:
                # NOTE(review): the FAKE embedding and the noun embedding have always
                # been read from the same index `n`, so both saved files are identical.
                # One of them presumably needs the index of the other word of the
                # "fake <noun>" pair -- confirm what loc_fake_<noun>.txt points at
                # before changing this. Behaviour kept as is.
                emb = _word_embedding(hidden_states, n, sent, loc)
                fakes.append(emb)
                nouns.append(emb)
        # The rounding precision belongs to np.round, not to str.format.
        print("\r[FLW] Embeddings construction (FAKE and fake NOUN) for {} : {}%".format(noun,np.round((i+1)/N*100,4)), end="")
        sys.stdout.flush()

    print("\r[FLW] Embeddings (FAKE and fake NOUN) constructed for {}  (lost embeddings : {})    ".format(noun,fails))

    with open('BERTfiles/FAKE'+ noun+'FAKE.npy','wb') as f:
        np.save(file=f,arr=fakes)
    with open('BERTfiles/'+noun+'FAKE.npy','wb') as f:
        np.save(file=f,arr=nouns)

    #============================ Embeddings of (not fake) NOUN ============================
    data, locList = _read_sentences_and_locations(
        'cleaned_data/not_fake_'+noun+'_recleaned.txt',
        'locationsFiles/loc_not_fake_'+noun+'.txt')
    if len(locList)!=len(data):
        print("[ERROR] non matching files for (not fake)" + noun + "...")
        return
    #Sampling: 10% of the sentences, each sentence at most once
    N = len(data)
    sample = np.random.choice(N,N//10,replace=False)
    print("[FLW] data for (not fake) "+noun+" successfully prepared. ")

    nouns = []
    fails = 0

    for j, i in enumerate(sample):
        sent = data[i]
        loc = locList[i]
        hidden_states = _last_four_hidden_states(sent)
        if hidden_states is None:
            # Count every listed occurrence as lost, as in the fake part.
            fails+=len(loc)
        else:
            for n in loc:
                nouns.append(_word_embedding(hidden_states, n, sent, loc))
        print("\r[FLW] Embeddings construction (not fake NOUN) for {} : {}%".format(noun,np.round((j+1)/len(sample)*100,4)), end="")
        sys.stdout.flush()

    print("\r[FLW] Embeddings (not fake NOUN) constructed for {}  (lost embeddings : {})    ".format(noun,fails))

    with open('BERTfiles/'+noun+'.npy','wb') as f:
        np.save(file=f,arr=nouns)
    print("[SUCCESS] successfully created embeddings for " + noun +"\n")

In [11]:
# Nouns handed to recleaner() and embeddings() in the cells below.
listNouns = [
    'article',
    'beard',
    'blood',
    'company',
    'death',
    'gun',
    'id',
    'interview',
    'passport',
]

In [12]:
# Re-align the 'not fake' sentence file of every noun with its locations file
# (writes one '*_recleaned.txt' file per noun).
for noun in listNouns:
    recleaner(noun)

Start for article
Finished for article with 567 deleted sentences
Start for beard
Finished for beard with 0 deleted sentences
Start for blood
Finished for blood with 0 deleted sentences
Start for company
Finished for company with 10 deleted sentences
Start for death
Finished for death with 2 deleted sentences
Start for gun
Finished for gun with 0 deleted sentences
Start for id
Finished for id with 0 deleted sentences
Start for interview
Finished for interview with 0 deleted sentences
Start for passport
Finished for passport with 0 deleted sentences


In [15]:
# Build and save the embedding files of every noun; this reads the
# '*_recleaned.txt' files written by recleaner(), so that cell must run first.
for noun in listNouns:
    embeddings(noun)

[START] starting work on article...
[FLW] data for (fake) article successfully prepared. 
[FLW] Embeddings (FAKE and fake NOUN) constructed for article  (lost embeddings : 0)    
[FLW] data for (not fake) article successfully prepared. 
[FLW] Embeddings (not fake NOUN) constructed for article  (lost embeddings : 28)    
[SUCCESS] successfully created embeddings for article

[START] starting work on beard...
[FLW] data for (fake) beard successfully prepared. 
[FLW] Embeddings (FAKE and fake NOUN) constructed for beard  (lost embeddings : 0)    
[FLW] data for (not fake) beard successfully prepared. 
[FLW] Embeddings (not fake NOUN) constructed for beard  (lost embeddings : 0)    
[SUCCESS] successfully created embeddings for beard

[START] starting work on blood...
[FLW] data for (fake) blood successfully prepared. 
[FLW] Embeddings (FAKE and fake NOUN) constructed for blood  (lost embeddings : 0)    
[FLW] data for (not fake) blood successfully prepared. 
[FLW] Embeddings (not fake NOU