In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from transformers import BertModel, BertTokenizer, T5Model, T5Tokenizer
import torch
import itertools
from gensim.corpora.dictionary import Dictionary

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_parquet('data/combined_clean.parquet')

In [4]:
filter_vec = CountVectorizer(min_df=0.01, max_df=0.9).fit(df['clean_text'])

In [5]:
## save down the documents, filtered for words in the cv list
def filter_text(text):
    return ' '.join([x for x in text.split() if x in filter_vec.vocabulary_])

df['filtered_text'] = df['clean_text'].apply(filter_text)

In [6]:
vectorizer = CountVectorizer().fit(df['filtered_text'])
counts = vectorizer.transform(df['clean_text'])
counts.shape[1]

4938

In [7]:
train_tokens = []
train_counts = []
for c in tqdm(counts):
    train_tokens.append(np.nonzero(c)[1])
    train_counts.append(np.squeeze(np.array(c[c > 0])))
    
train_tokens = np.array(train_tokens, dtype='object')
train_counts = np.array(train_counts, dtype='object')

1650it [00:00, 4011.55it/s]


In [8]:
df.loc[df['date'] <= '2006-01-31', 'slice'] = 0  # Greenspan
df.loc[('2006-02-01' <= df['date']) & (df['date'] <= '2014-01-31'), 'slice'] = 1  # Bernanke
df.loc[('2014-02-03' <= df['date']) & (df['date'] < '2018-02-03'), 'slice'] = 2  # Yellen
df.loc['2018-02-05' <= df['date'], 'slice'] = 3  # Powell

df['slice'] = df['slice'].astype(int)

train_times = df['slice'].values

In [9]:
df.to_parquet('data/combined_clean.parquet')

In [9]:
df.groupby('slice').date.count()

slice
0    716
1    546
2    201
3    187
Name: date, dtype: int64

In [10]:
vocab = list(vectorizer.vocabulary_.keys())

## prep embeddings

### try to take avg token-level embedding across all of the text

### bert

In [11]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
df['len'] = df['filtered_text'].apply(lambda x: len(x.split()))

In [13]:
sample = df.loc[df['len'] > 500].sort_values('len').iloc[:200]['filtered_text'].tolist()
len(set(' '.join(sample).split()))

4394

In [14]:
## split them all to be no more than 400 words
short_sample = []
for x in sample:
    if len(x) > 400:
        for i in range(len(x) // 400 + 1):
            short_sample.append(x[i*400:(i+1)*400])
    else:
        short_sample.append(x)
len(short_sample)

2805

In [15]:
unique_words = set(' '.join(short_sample).split())
full_tokens = {}
for word in unique_words:
    full_tokens[word] = tokenizer.encode(word)[1:-1]

In [16]:
unique_tokens = set(itertools.chain.from_iterable(full_tokens.values()))
len(unique_tokens)

5126

In [17]:
embs = dict(zip(unique_tokens, [torch.zeros(768)]*len(unique_tokens)))
for text in tqdm(short_sample):
    tokens = tokenizer.encode(text)  # get list of all tokens in the sentence    
    emb = model(torch.tensor(tokens).unsqueeze(0)).last_hidden_state.squeeze(axis=0)
    for i, tok in enumerate(tokens[1:-1]):
        embs[tok] += emb[i]

100%|██████████| 2805/2805 [24:05<00:00,  1.94it/s]


In [19]:
# get the counts for each token
token_counts = dict(zip(unique_tokens, [0]*len(unique_tokens)))
for text in tqdm(short_sample):
    tokens = tokenizer.encode(text)[1:-1]
    for tok in tokens:
        token_counts[tok] += 1

100%|██████████| 2805/2805 [00:02<00:00, 1019.13it/s]


In [20]:
avg_embs = {}
for tok in tqdm(embs):
    avg_embs[tok] = embs[tok] / token_counts[tok]

100%|██████████| 5126/5126 [00:00<00:00, 137377.57it/s]


In [25]:
tokenizer.encode('interest')

[101, 3037, 102]

In [29]:
torch.stack([avg_embs[4610], avg_embs[3037]]).mean(axis=0).squeeze().shape

torch.Size([768])

In [30]:
word_embeddings = {}
for word in tqdm(full_tokens):  # see all the words
    tmp = []
    for tok in full_tokens[word]:  # see all the tokens in the word
        tmp.append(avg_embs[tok])
    word_embeddings[word] = torch.stack(tmp).mean(axis=0).squeeze()  # take the avg

100%|██████████| 6156/6156 [00:00<00:00, 41177.09it/s]


In [34]:
embeddings = np.zeros((len(vectorizer.vocabulary_), 768))

for w in tqdm(vectorizer.vocabulary_):
    if w in word_embeddings:
        idx = vectorizer.vocabulary_[w]
        embeddings[idx] = word_embeddings[w].detach().numpy()

100%|██████████| 4938/4938 [00:00<00:00, 160617.56it/s]


In [43]:
embeddings[embeddings.sum(axis=1) == 0].shape

(545, 768)

In [44]:
## save down everything
np.savez_compressed(
    'test_data_bert_context.npz', 
    train_tokens=train_tokens,
    train_counts=train_counts,
    train_times=train_times,
    vocab=np.array(vocab),
    embeddings=embeddings
)