In [1]:
from transformers import BertTokenizer, BertModel
import torch

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [3]:
import pandas as pd

In [5]:
df = pd.read_csv('../data/news_1980.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/news_1980.csv'

In [5]:
df = df['text']

In [6]:
df

0        william simon  former secretary of the treasur...
1        voters in five states carter and kennedy to me...
2        lieut. gen. chon too hwan  the army strongman ...
3        major increase in new york city subway and bus...
4        air service possible by fall by bernard gwertz...
                               ...                        
45582    the larry bird show played madison square gard...
45583    bob mackinnon had several reasons to expect hi...
45584    brian buckley of harvard and rick casko of dra...
45585    st john's will seek its second straight holida...
45586    returns are in on the eclipse awards for the b...
Name: text, Length: 45587, dtype: object

In [7]:
bag = [item for sentence in df for item in sentence.split('.') if item != '']
bag_size = len(bag)

In [8]:
bag

["william simon  former secretary of the treasury and now policy chairman of ronald reagan's presidential campaign  has volunteered to organize republican citizens for koch committee",
 ' it would aid an expected re election bid by mayor koch in ',
 ' officials aware of the idea said yesterday that the committee would also attempt to help mr koch get the endorsement of the republican party in new york city',
 ' the mayor and his political advisers have expressed interest in getting him on both the republican and democratic party lines',
 'mr',
 ' simon confirmed the reports yesterday  saying that the idea had come up some months ago and that he had made the offer to mr koch',
 ' he said the mayor accepted gladly',
 " he's the best thing to happen to this city since fiorello la guardia  mr simon said",
 ' spokesman for the mayor confirmed that mr simon had in the past talked with mr koch about republican committee to back him  and that mr koch had been receptive',
 " but the mayor's adv

In [9]:
bag_size

1426406

In [10]:
filename = 'target words/target.txt'

with open(filename) as file:
    targets = [line.rstrip() for line in file]

In [11]:
len(targets)

137

In [17]:
sentences = []


for i in targets:
    print(i)
    for j in range(bag_size):
        sentence = bag[j].split()

        if len(sentence) > 512:
            sentence = sentence[:512]


        if i in sentence:
            sentences.append(bag[j])
        else:
            continue
print(len(sentences))

state
people
company
percent
million
american
president
united
official
school
government
house
street
group
right
point
since
world
second


KeyboardInterrupt: 

In [18]:
file = open('data/1980_sentences.txt','w')

for item in sentences:
	file.write(item+"\n")
file.close()

In [14]:
def infer_vector(doc:str):

    marked_text = "[CLS] " + doc + " [SEP]"
    tokens = bert_tokenizer.tokenize(marked_text)[:512]
    idx = bert_tokenizer.convert_tokens_to_ids(tokens)
    segment_id = [1] * len(tokens)


    tokens_tensor = torch.tensor([idx])
    segments_tensors = torch.tensor([segment_id])

    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]

    hidden_states = hidden_states

    return hidden_states[-2][0], tokens

In [16]:
sentences

[' mr koch ha publicly opposed democratic party leaders by endorsing and helping to raise funds for the re election two years ago of state senator john marchi  the staten island republican  and this year again he endorsed the senator for re election',
 ' four percent did not state their intentions',
 "major increase in new york city subway and bus fares became virtual certainty tonight when the majority leader of the republican controlled state senate  warren anderson  told governor carey that he would not accept any of the governor's plan to subsidize the metropolitan transportation authority's deficit",
 "'s financing from existing state revenues and forgiving   million the authority owes the state",
 ' million for the current state fiscal year',
 ' mr caemmerer said the republican plan would involve instead the enrichment of the formula that provides state funds for the m',
 ' among the steps contemplated by state department officials is the opening of chinese consulates in new york

In [15]:
for i in range(len(sentences)):
    embedding = infer_vector(sentences[i])
print(embedding.size())

KeyboardInterrupt: 