In [1]:
import json

import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

# Load the pretrained uncased BERT vocabulary and encoder once for the whole notebook.
# output_hidden_states=True makes the model return the per-layer hidden states,
# which infer_vector later reads from position 2 of the model output.
PRETRAINED_NAME = 'bert-base-uncased'

bert_tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
model = BertModel.from_pretrained(PRETRAINED_NAME, output_hidden_states=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# Put the model in evaluation mode before inference; the trailing ';' hides the cell's printed repr.
model.eval();

In [3]:
def get_sentences(year, targets, max_per_target=500, data_dir='../articles_raw_data'):
    """Collect, per target word, the first sentence fragments of one year that contain it.

    NOTE: a later cell of this notebook defines ``get_sentences`` again with a
    cap of 1000 fragments per target; once that cell runs it shadows this one.

    Parameters
    ----------
    year : int or str
        Selects the input file ``<data_dir>/news_<year>.csv``; the file must
        have a ``text`` column.
    targets : iterable of str
        Words to search for. Matching is an exact, case-sensitive comparison
        against whitespace-separated tokens.
    max_per_target : int, default 500
        Maximum number of fragments kept for each target.
    data_dir : str, default '../articles_raw_data'
        Directory that holds the yearly CSV files.

    Returns
    -------
    list of str
        Fragments (article text split on '.') grouped by target in the order of
        ``targets`` and, inside each group, in corpus order. A fragment that
        contains several targets appears once for each of them.
    """
    # Rows with a missing text value are skipped; calling .split on NaN used to raise.
    articles = pd.read_csv(data_dir + '/news_' + str(year) + '.csv')['text'].dropna()

    targets = list(targets)
    hits = {word: [] for word in targets}

    # Single pass over the corpus: each fragment is split once instead of once
    # per target word.
    for article in articles:
        for fragment in article.split('.'):
            if fragment == '':
                continue
            # Only the first 512 whitespace tokens of a fragment are searched.
            for token in set(fragment.split()[:512]):
                bucket = hits.get(token)
                if bucket is not None and len(bucket) < max_per_target:
                    bucket.append(fragment)

    sentences = []
    for word in targets:
        sentences.extend(hits[word])
    return sentences


In [4]:
def get_targets(path='../data/target_words/polysemous.txt'):
    """Read the target-word list, one word per line.

    Parameters
    ----------
    path : str, default '../data/target_words/polysemous.txt'
        Text file to read.

    Returns
    -------
    list of str
        The file content split on '\\n'. A file that ends with a newline
        therefore yields a final empty-string entry.
    """
    # The context manager closes the handle, which the original left open.
    with open(path, 'r') as words_file:
        return words_file.read().split('\n')

def get_sentences(year, targets, max_per_target=1000, data_dir='../articles_raw_data'):
    """Collect, per target word, the first sentence fragments of one year that contain it.

    Parameters
    ----------
    year : int or str
        Selects the input file ``<data_dir>/news_<year>.csv``; the file must
        have a ``text`` column.
    targets : iterable of str
        Words to search for. Matching is an exact, case-sensitive comparison
        against whitespace-separated tokens.
    max_per_target : int, default 1000
        Maximum number of fragments kept for each target.
    data_dir : str, default '../articles_raw_data'
        Directory that holds the yearly CSV files.

    Returns
    -------
    list of str
        Fragments (article text split on '.') grouped by target in the order of
        ``targets`` and, inside each group, in corpus order. A fragment that
        contains several targets appears once for each of them.
    """
    # Rows with a missing text value are skipped; calling .split on NaN used to raise.
    articles = pd.read_csv(data_dir + '/news_' + str(year) + '.csv')['text'].dropna()

    targets = list(targets)
    hits = {word: [] for word in targets}

    # Single pass over the corpus: each fragment is split once instead of once
    # per target word.
    for article in articles:
        for fragment in article.split('.'):
            if fragment == '':
                continue
            # Only the first 512 whitespace tokens of a fragment are searched.
            for token in set(fragment.split()[:512]):
                bucket = hits.get(token)
                if bucket is not None and len(bucket) < max_per_target:
                    bucket.append(fragment)

    sentences = []
    for word in targets:
        sentences.extend(hits[word])
    return sentences


def infer_vector(doc:str):
    """Run one text through BERT and return its per-token vectors.

    The text is wrapped in ``[CLS]`` / ``[SEP]`` markers, tokenized with the
    notebook-level ``bert_tokenizer`` and passed to the notebook-level ``model``.

    Parameters
    ----------
    doc : str
        Raw text of a single sentence.

    Returns
    -------
    (embeddings, tokens)
        ``embeddings`` is ``hidden_states[-2][0]``: the second-to-last entry of
        the model's hidden-state tuple for the only sequence in the batch.
        ``tokens`` is the list of at most 512 tokens that was fed to the model.
    """
    max_len = 512

    tokens = bert_tokenizer.tokenize("[CLS] " + doc + " [SEP]")
    if len(tokens) > max_len:
        # A plain [:512] slice cut off the closing marker; keep it as the last token.
        tokens = tokens[:max_len - 1] + tokens[-1:]

    token_ids = bert_tokenizer.convert_tokens_to_ids(tokens)
    tokens_tensor = torch.tensor([token_ids])

    # The original passed an all-ones "segment id" tensor as the second
    # positional argument, which BertModel.forward binds to attention_mask.
    # Passing it by keyword keeps the same computation and makes it explicit.
    attention_mask = torch.ones_like(tokens_tensor)

    with torch.no_grad():
        outputs = model(input_ids=tokens_tensor, attention_mask=attention_mask)

    hidden_states = outputs[2]
    return hidden_states[-2][0], tokens


def get_embed(sentences, targets):
    """Extract one contextual embedding per (sentence, target word) match.

    A target is first looked up, case-sensitively, among the first 250
    whitespace tokens of the sentence. If found, the sentence is embedded with
    ``infer_vector`` and the target is looked up again in the returned token
    list; targets that do not appear there as a whole token are skipped. Only
    the first occurrence of a target in a sentence is recorded.

    Parameters
    ----------
    sentences : list of str
        Sentences to embed.
    targets : list of str
        Target words.

    Returns
    -------
    dict
        Maps each target to ``{'word', 'sentence_number_index', 'embeddings'}``
        where ``sentence_number_index`` holds ``[sentence_idx, token_idx]``
        pairs and ``embeddings`` holds the matching vectors as plain lists.
    """
    results = {k: {'word': k, 'sentence_number_index': [], 'embeddings': []} for k in targets}

    for i, text in enumerate(sentences):
        if i % 1000 == 0:
            print(i)  # coarse progress indicator

        words = set(text.split()[:250])

        # Embed each sentence at most once, and only if some target matches;
        # the original re-ran the model for every matching target.
        embeddings = tokens = None

        for word in targets:
            if word not in words:
                continue

            if tokens is None:
                embeddings, tokens = infer_vector(text)

            if word in tokens:
                index = tokens.index(word)
                results[word]['sentence_number_index'].append([i, index])
                results[word]['embeddings'].append(embeddings[index].tolist())

    return results


In [5]:
def get_all(year):
    """Run the whole pipeline for one year and write its outputs to disk.

    Steps: read the target words, collect matching sentences, save them to
    ``../articles_raw_data/<year>_sentences.txt`` (one per line), embed them,
    and dump the per-target records to ``../embeddings/embeddings_<year>.json``.

    Parameters
    ----------
    year : int or str
        Year passed to ``get_sentences`` and used in both output file names.

    Returns
    -------
    None
    """
    print('getting targets ..............................')
    targets = get_targets()
    print('getting sentences ............................')
    sentences = get_sentences(year=year, targets=targets)

    print('saving sentences..............................')
    # Context manager so the handle is closed even if a write fails.
    with open('../articles_raw_data/' + str(year) + '_sentences.txt', 'w') as sentence_file:
        for item in sentences:
            sentence_file.write(item + "\n")

    print('getting embeddings for setences ..............')
    results = get_embed(sentences=sentences, targets=targets)
    print('got embeddings  ..............................')

    # One record per target, in the order of the target list.
    records = [results[word] for word in targets]

    print('saving embeddings ............................')
    with open('../embeddings/embeddings_' + str(year) + '.json', "w") as final:
        json.dump(records, final, indent=4)

    print(year, 'done ............................')


In [6]:
# Years to process; get_all is called once per year, in this order.
periods = [
    1980, 1982, 1985, 1987, 1989, 1990, 1992, 1995,
    2000, 2001, 2002, 2003, 2005, 2008, 2009, 2010,
    2012, 2013, 2015, 2016, 2017, 2018, 2019,
]

for period in periods:
    print(period)
    get_all(period)

1980
getting targets ..............................
getting sentences ............................
saving sentences..............................
getting embeddings for setences ..............
0


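# Old Code
Please ignore the cells below: they are earlier scratch versions of the functions above, were run out of order (see the execution counts), and are kept only for reference.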

In [23]:
# Scratch cell: load one year's articles. `year` is not assigned in any cell shown
# in this notebook, so it must already exist in the kernel for this line to run.
df = pd.read_csv('../articles_raw_data/news_' + str(year) + '.csv')

In [24]:
# Keep only the 'text' column. This rebinds `df`, so re-running the cell on its own
# would index the resulting Series instead of the original DataFrame.
df = df['text']

In [25]:
# Split every article on '.' and flatten the pieces into one list, dropping empty strings.
bag = [item for sentence in df for item in sentence.split('.') if item != '']
bag_size = len(bag)

In [26]:
# Display how many fragments the bag holds.
bag_size

660027

In [6]:
# Read the target words, one per line. The context manager closes the handle,
# which the original open() call left open for the rest of the session.
with open('../data/target_words/polysemous.txt', 'r') as words_file:
    targets = words_file.read().split('\n')
targets

['abandon',
 'abandoned',
 'ability',
 'able',
 'about',
 'above',
 'abroad',
 'abruptly',
 'absent',
 'absolute',
 'absolutely',
 'absorb',
 'abstract',
 'absurd',
 'abundance',
 'abuse',
 'academic',
 'academy',
 'accent',
 'accept',
 'acceptable',
 'acceptance',
 'access',
 'accident',
 'accommodate',
 'accompany',
 'accord',
 'accordingly',
 'account',
 'accurate',
 'achievement',
 'acid',
 'acknowledge',
 'acquaintance',
 'acquire',
 'across',
 'act',
 'acting',
 'action',
 'active',
 'actively',
 'activity',
 'actor',
 'actual',
 'actually',
 'acute',
 'add',
 'addition',
 'address',
 'adjust',
 'administration',
 'administrator',
 'admiral',
 'admission',
 'admit',
 'adopt',
 'adult',
 'advance',
 'advantage',
 'adventure',
 'advice',
 'advocate',
 'affair',
 'affect',
 'affected',
 'affection',
 'afford',
 'after',
 'afternoon',
 'age',
 'aged',
 'agency',
 'agent',
 'agree',
 'agricultural',
 'ahead',
 'aid',
 'aim',
 'air',
 'airline',
 'alarm',
 'alert',
 'alien',
 'alike',


In [7]:
# Number of entries read from the target-word file.
len(targets)

3220

In [8]:
# Scratch version of get_sentences: for every target, keep the first 1000
# fragments of `bag` whose first 512 whitespace tokens contain the target.
sentences = []

for target in targets:
    print(target)
    found = 0

    for position in range(bag_size):
        # Stop scanning the bag once this target has reached its quota.
        if found >= 1000:
            break

        words = bag[position].split()[:512]
        if target in words:
            sentences.append(bag[position])
            found += 1

print(len(sentences))

abandon


NameError: name 'bag_size' is not defined

In [30]:
# Save the collected sentences, one per line. The context manager closes the
# handle even if a write fails.
with open('../articles_raw_data/' + str(year) + '_sentences.txt', 'w') as file:
    for item in sentences:
        file.write(item + "\n")

In [31]:
def infer_vector(doc:str):
    """Run one text through BERT and return its per-token vectors.

    The text is wrapped in ``[CLS]`` / ``[SEP]`` markers, tokenized with the
    notebook-level ``bert_tokenizer`` and passed to the notebook-level ``model``.

    Parameters
    ----------
    doc : str
        Raw text of a single sentence.

    Returns
    -------
    (embeddings, tokens)
        ``embeddings`` is ``hidden_states[-2][0]``: the second-to-last entry of
        the model's hidden-state tuple for the only sequence in the batch.
        ``tokens`` is the list of at most 512 tokens that was fed to the model.
    """
    max_len = 512

    tokens = bert_tokenizer.tokenize("[CLS] " + doc + " [SEP]")
    if len(tokens) > max_len:
        # A plain [:512] slice cut off the closing marker; keep it as the last token.
        tokens = tokens[:max_len - 1] + tokens[-1:]

    token_ids = bert_tokenizer.convert_tokens_to_ids(tokens)
    tokens_tensor = torch.tensor([token_ids])

    # The original passed an all-ones "segment id" tensor as the second
    # positional argument, which BertModel.forward binds to attention_mask.
    # Passing it by keyword keeps the same computation and makes it explicit.
    attention_mask = torch.ones_like(tokens_tensor)

    with torch.no_grad():
        outputs = model(input_ids=tokens_tensor, attention_mask=attention_mask)

    hidden_states = outputs[2]
    return hidden_states[-2][0], tokens

In [33]:
# Scratch version of get_embed: one record per target word, filled with the
# position and vector of the word's first occurrence in each matching sentence.
results = {k: {'word': k, 'sentence_number_index': [] , 'embeddings': []} for k in targets}

for i in range(len(sentences)): #len(sentences)
    if i%1000 == 0:
        print(i)

    sentence = sentences[i].split()[:250]

    # Embed each sentence at most once, and only if some target matches.
    embeddings = tokens = None

    for word in targets:

        if word not in sentence:
            continue

        if tokens is None:
            embeddings, tokens = infer_vector(sentences[i])

        # tokens.index(word) raises ValueError when the word is absent from the
        # BERT token list, so skip those cases as get_embed does.
        if word not in tokens:
            continue

        index = tokens.index(word)
        embedding = embeddings[index].tolist()

        results[word]['sentence_number_index'].append([i, index])
        results[word]['embeddings'].append(embedding)




0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000


In [34]:
# Flatten the per-word results into a list ordered like `targets`,
# making sure every record carries its own word.
file = []

for word in targets:
    record = results[word]
    record['word'] = word
    file.append(record)

In [None]:
import json

# Write the collected records for this year as indented JSON.
embeddings_path = '../embeddings/embeddings_' + str(year) + '.json'
with open(embeddings_path, "w") as final:
    json.dump(file, final, indent=4)

In [36]:
# Show the JSON output path for the current value of `year`.
print('../embeddings/embeddings_' + str(year) + '.json')

../embeddings/embeddings_2015.json


# Done