In [30]:
from transformers import BertTokenizer, BertModel
import torch

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [32]:
import pandas as pd

In [33]:
df = pd.read_csv('../articles_raw_data/news_1980.csv')

In [34]:
df = df['text']

In [35]:
bag = [item for sentence in df for item in sentence.split('.') if item != '']
bag_size = len(bag)

In [36]:
bag_size

1426406

In [37]:
targets = ['state', 'right', 'around', 'black', 'force', 'interest', 'support', 'charge', 'please']

In [38]:
len(targets)

9

In [39]:
sentences = []


for i in targets:
    print(i)
    for j in range(bag_size):
        sentence = bag[j].split()

        if len(sentence) > 512:
            sentence = sentence[:512]


        if i in sentence:
            sentences.append(bag[j])
        else:
            continue
print(len(sentences))

state
right
around
black
force
interest
support
charge
please
92096


In [40]:
file = open('../articles_raw_data/1980_sentences.txt','w')

for item in sentences:
	file.write(item+"\n")
file.close()

In [41]:
def infer_vector(doc:str):

    marked_text = "[CLS] " + doc + " [SEP]"
    tokens = bert_tokenizer.tokenize(marked_text)[:512]
    idx = bert_tokenizer.convert_tokens_to_ids(tokens)
    segment_id = [1] * len(tokens)


    tokens_tensor = torch.tensor([idx])
    segments_tensors = torch.tensor([segment_id])

    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]

    hidden_states = hidden_states

    return hidden_states[-2][0], tokens

In [42]:
len(sentences)

92096

In [45]:
results = {k: {'sentence_number_index': [] , 'embeddings': []} for k in targets}

for i in range(len(sentences)): #len(sentences)
    if i%10000 == 0:
        print(i)

    sentence = sentences[i].split()

    for word in targets:

        if word in sentence:
            embeddings, tokens = infer_vector(sentences[i])

            index = tokens.index(word)
            embedding = embeddings[index].tolist()

            results[word]['sentence_number_index'].append([i, index])
            results[word]['embeddings'].append(embedding)




0
10000
20000
30000
40000


In [None]:
import json

with open('../embeddings/embeddings_1990.json', 'w') as fp:
    json.dump(results, fp)