In [1]:
from transformers import BertTokenizer, BertModel
import torch

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
model.eval();

In [3]:
import pandas as pd

In [4]:
year = 1995

In [5]:
df = pd.read_csv('../articles_raw_data/news_' + str(year) + '.csv')

In [6]:
df = df['text']

In [7]:
bag = [item for sentence in df for item in sentence.split('.') if item != '']
bag_size = len(bag)

In [8]:
bag_size

1519311

In [9]:
targets = ['state', 'right', 'around', 'black', 'force', 'interest', 'support', 'charge', 'please']

In [10]:
len(targets)

9

In [11]:
sentences = []


for i in targets:
    print(i)
    for j in range(bag_size):
        sentence = bag[j].split()

        if len(sentence) > 512:
            sentence = sentence[:512]


        if i in sentence:
            sentences.append(bag[j])
        else:
            continue
print(len(sentences))

state
right
around
black
force
interest
support
charge
please
91203


In [12]:
file = open('../articles_raw_data/' + str(year) + '_sentences.txt','w') 

for item in sentences:
	file.write(item+"\n")
file.close()

In [13]:
def infer_vector(doc:str):

    marked_text = "[CLS] " + doc + " [SEP]"
    tokens = bert_tokenizer.tokenize(marked_text)[:512]
    idx = bert_tokenizer.convert_tokens_to_ids(tokens)
    segment_id = [1] * len(tokens)


    tokens_tensor = torch.tensor([idx])
    segments_tensors = torch.tensor([segment_id])

    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]

    hidden_states = hidden_states

    return hidden_states[-2][0], tokens

In [14]:
len(sentences)

91203

In [15]:
results = {k: {'sentence_number_index': [] , 'embeddings': []} for k in targets}

for i in range(len(sentences)): #len(sentences)
    if i%10000 == 0:
        print(i)

    sentence = sentences[i].split()[:512]

    for word in targets:

        if word in sentence:
            embeddings, tokens = infer_vector(sentences[i])

            index = tokens.index(word)
            embedding = embeddings[index].tolist()

            results[word]['sentence_number_index'].append([i, index])
            results[word]['embeddings'].append(embedding)
        
        else:
            continue




0
10000
20000
30000
40000


ValueError: 'right' is not in list

In [17]:
print(tokens)

['[CLS]', "'", 's', 'green', 'onions', 'boston', 'more', 'than', 'feeling', 'david', 'bowie', 'fame', 'david', 'bowie', 'space', 'odd', '##ity', 'david', 'bowie', 'z', '##ig', '##gy', 'star', '##dus', 'the', 'box', 'tops', 'the', 'letter', 'charles', 'brown', 'drift', '##in', "'", 'blues', 'james', 'brown', 'got', 'you', 'i', 'feel', 'good', 'james', 'brown', 'please', 'please', 'please', 'james', 'brown', 'say', 'it', 'loud', 'i', "'", 'm', 'black', 'and', 'i', "'", 'm', 'proud', 'ruth', 'brown', 'mama', 'he', 'treats', 'your', 'daughter', 'mean', 'jackson', 'browne', 'late', 'for', 'the', 'sky', 'buffalo', 'springfield', 'for', 'what', 'it', "'", 's', 'worth', 'solomon', 'burke', 'everybody', 'needs', 'somebody', 'to', 'love', 'johnny', 'burnett', '##e', 'trio', 'train', 'kept', 'a', 'roll', '##in', "'", 'the', 'byrd', '##s', 'eight', 'miles', 'high', 'the', 'byrd', '##s', 'hickory', 'wind', 'the', 'byrd', '##s', 'mr', 'tam', '##bour', '##ine', 'man', 'johnny', 'cash', 'f', '##ols', 

In [20]:
len(sentences[i].split())

906

In [16]:
import json

with open('../embeddings/embeddings_' + str(year) + '.json', 'w') as fp:
    json.dump(results, fp)

In [17]:
print('../embeddings/embeddings_' + str(year) + '.json')

../embeddings/embeddings_1990.json
