In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from transformers import BertModel, BertTokenizer, T5Model, T5Tokenizer
import torch
from gensim.corpora.dictionary import Dictionary

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the combined corpus; later cells read its 'clean_text' and 'date' columns.
df = pd.read_parquet('data/combined_clean.parquet')

In [3]:
# Vocabulary filter: min_df=0.01 / max_df=0.9 drop terms that appear in fewer than 1%
# or in more than 90% of the documents.
filter_vec = CountVectorizer(min_df=0.01, max_df=0.9)
filter_vec = filter_vec.fit(df['clean_text'])

In [4]:
## keep a copy of every document restricted to the words in the filter vocabulary
def filter_text(text):
    """Return `text` without the whitespace-separated tokens missing from `filter_vec.vocabulary_`."""
    kept_tokens = []
    for token in text.split():
        if token in filter_vec.vocabulary_:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

df['filtered_text'] = df['clean_text'].apply(filter_text)

In [5]:
# Fit the final vocabulary on the filtered documents, then build the document-term counts.
# NOTE(review): the vocabulary is fit on 'filtered_text' but the counts are computed from
# 'clean_text' — confirm this is intended.
vectorizer = CountVectorizer()
vectorizer = vectorizer.fit(df['filtered_text'])
counts = vectorizer.transform(df['clean_text'])
counts.shape[1]

4938

In [6]:
# For each row of `counts`: the column indices of its non-zero entries and the
# corresponding count values.
train_tokens, train_counts = [], []
for doc_row in tqdm(counts):
    word_ids = np.nonzero(doc_row)[1]
    word_counts = np.squeeze(np.array(doc_row[doc_row > 0]))
    train_tokens.append(word_ids)
    train_counts.append(word_counts)

# dtype='object' stores one array per document instead of a rectangular matrix.
train_tokens = np.array(train_tokens, dtype='object')
train_counts = np.array(train_counts, dtype='object')

1650it [00:00, 4039.26it/s]


In [8]:
# Assign every document to a time slice (one per Fed chair) based on its date.
# The intervals are contiguous and half-open, so no date can fall between two
# slices; an unassigned row would stay NaN and make the astype(int) below fail.
df.loc[df['date'] < '2006-02-01', 'slice'] = 0  # Greenspan
df.loc[(df['date'] >= '2006-02-01') & (df['date'] < '2014-02-03'), 'slice'] = 1  # Bernanke
df.loc[(df['date'] >= '2014-02-03') & (df['date'] < '2018-02-05'), 'slice'] = 2  # Yellen
df.loc[df['date'] >= '2018-02-05', 'slice'] = 3  # Powell

# Rows with a missing date match none of the masks above and still raise here,
# which is preferable to silently mislabelling them.
df['slice'] = df['slice'].astype(int)

train_times = df['slice'].values

In [9]:
# NOTE(review): this writes back to the same path `df` was loaded from, so the source
# file now also carries the 'filtered_text' and 'slice' columns added above.
df.to_parquet('data/combined_clean.parquet')

In [10]:
# Number of documents (non-null dates) in each time slice.
df.groupby('slice')['date'].count()

slice
0    716
1    546
2    201
3    187
Name: date, dtype: int64

In [None]:
# Order the vocabulary by CountVectorizer column index so that vocab[i] is the word behind
# column i of `counts` (and therefore behind the ids stored in `train_tokens`).
# `vocabulary_` maps term -> column index, but its key order does not follow those indices,
# so list(vocabulary_.keys()) would be misaligned with the token ids.
vocab = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

## prep embeddings

### BERT

In [14]:
def get_word_vec(word, tokenizer, embeddings):
    """Embed `word` as the average of its sub-token vectors.

    `tokenizer.encode` is called without special tokens, the resulting ids are
    looked up through the `embeddings` callable, and the looked-up rows are
    averaged along the first axis.
    """
    token_ids = tokenizer.encode(word, add_special_tokens=False)
    token_vectors = embeddings(torch.tensor(token_ids))
    return token_vectors.mean(dim=0)

In [155]:
# Pretrained BERT tokenizer and model; the embedding loop below only uses the model's
# input word-embedding table (model.embeddings.word_embeddings).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [157]:
# One 768-wide row per vocabulary word, stored at the word's CountVectorizer column index.
word_embedding_layer = model.embeddings.word_embeddings
embeddings = np.zeros((len(vocab), 768))

for word in tqdm(vocab):
    word_vec = get_word_vec(word, tokenizer, word_embedding_layer)
    embeddings[vectorizer.vocabulary_[word]] = word_vec.squeeze().detach().numpy()

100%|██████████| 4938/4938 [00:00<00:00, 5156.63it/s]


In [158]:
# True would mean at least one row sums to exactly zero, i.e. was probably never written.
(embeddings.sum(axis=1) == 0).max()  # check that all were filled in

False

In [178]:
## save down everything
# Bundle the training arrays together with the BERT embedding matrix into one archive.
arrays_to_save = {
    'train_tokens': train_tokens,
    'train_counts': train_counts,
    'train_times': train_times,
    'vocab': np.array(vocab),
    'embeddings': embeddings,
}
np.savez_compressed('test_data.npz', **arrays_to_save)

In [129]:
# Second copy of BERT, kept under separate names and configured to also return the
# hidden states of every layer; used for the contextual-embedding experiment below.
berttokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bertmodel = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [177]:
# Try the full model on a single word. encode() is called with its defaults here, so the
# 3 positions in the output below are presumably [CLS], the word, and [SEP].
hello_ids = torch.tensor(berttokenizer.encode('hello'))
output = bertmodel(hello_ids.unsqueeze(0))
output.last_hidden_state.shape

torch.Size([1, 3, 768])

In [182]:
# Averaging over axis 1 and squeezing leaves a single 768-element vector (see output below).
output.last_hidden_state.mean(axis=1).squeeze().shape

torch.Size([768])

In [184]:
# Contextual BERT embedding per vocabulary word: run each word through the full model and
# average the last hidden state over every position returned for it.
embeddings = np.zeros((len(vocab), 768))

# Inference only: no_grad() avoids building an autograd graph for each forward pass,
# which costs time and memory for results that are immediately converted to numpy.
with torch.no_grad():
    for w in tqdm(vocab):
        idx = vectorizer.vocabulary_[w]
        token_ids = torch.tensor(berttokenizer.encode(w)).unsqueeze(0)
        hidden = bertmodel(token_ids).last_hidden_state
        embeddings[idx] = hidden.mean(dim=1).squeeze().numpy()

  1%|          | 49/4938 [00:24<40:56,  1.99it/s]  


KeyboardInterrupt: 

In [None]:
## save down everything
# Same training arrays, paired with the contextual-BERT embedding matrix.
arrays_to_save = {
    'train_tokens': train_tokens,
    'train_counts': train_counts,
    'train_times': train_times,
    'vocab': np.array(vocab),
    'embeddings': embeddings,
}
np.savez_compressed('test_data_bert_alt.npz', **arrays_to_save)

### GloVe

In [7]:
## reload the training arrays that were saved to test_data.npz
# allow_pickle=True is required for the object-dtype arrays in the archive; only use it
# on files you trust, because unpickling can execute arbitrary code.
# The context manager closes the archive once the arrays have been read.
with np.load('test_data.npz', allow_pickle=True) as orig:
    train_tokens = orig['train_tokens']
    train_counts = orig['train_counts']
    train_times = orig['train_times']
    vocab = orig['vocab']

In [9]:
# Collect the GloVe vector of every vocabulary word that has one.
# A set gives O(1) membership tests instead of scanning `vocab` for each GloVe line.
vocab_set = set(vocab)
glove_embeddings = dict()
# Each line is a word followed by its vector components, separated by whitespace.
# Stream the file line by line rather than loading all of it into memory first.
with open('embeddings/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        parts = line.split()
        if not parts:
            continue  # skip blank lines
        w = parts[0].strip()
        if w in vocab_set:
            glove_embeddings[w] = np.array([float(x) for x in parts[1:]])

100%|██████████| 400000/400000 [00:09<00:00, 40875.09it/s]


In [10]:
# Row i holds the GloVe vector of the word whose CountVectorizer column index is i;
# words without a GloVe vector keep an all-zero row.
# The dimensionality is taken from any loaded vector rather than assuming that one
# particular word is present in the vocabulary.
glove_dim = next(iter(glove_embeddings.values())).shape[0]
embeddings = np.zeros((len(vocab), glove_dim))

for w in tqdm(vocab):
    idx = vectorizer.vocabulary_[w]
    if w in glove_embeddings:
        embeddings[idx] = glove_embeddings[w]

100%|██████████| 4938/4938 [00:00<00:00, 307437.85it/s]


In [11]:
# Expect (number of vocabulary words, GloVe vector length).
embeddings.shape

(4938, 300)

In [12]:
## save down everything
# Same training arrays, paired with the GloVe embedding matrix.
arrays_to_save = {
    'train_tokens': train_tokens,
    'train_counts': train_counts,
    'train_times': train_times,
    'vocab': vocab,
    'embeddings': embeddings,
}
np.savez_compressed('test_data_glove.npz', **arrays_to_save)

In [2]:
## reload the saved GloVe embedding matrix to inspect it
# The context manager closes the archive once the array has been read.
with np.load('test_data_glove.npz') as glove_archive:
    tmp = glove_archive['embeddings']

In [4]:
# Rows that sum to exactly zero — presumably the vocabulary words with no GloVe vector.
tmp[tmp.sum(axis=1) == 0].shape

(49, 300)

### T5

In [215]:
## reload the training arrays that were saved to test_data.npz
# allow_pickle=True is required for the object-dtype arrays in the archive; only use it
# on files you trust, because unpickling can execute arbitrary code.
# The context manager closes the archive once the arrays have been read.
with np.load('test_data.npz', allow_pickle=True) as orig:
    train_tokens = orig['train_tokens']
    train_counts = orig['train_counts']
    train_times = orig['train_times']
    vocab = orig['vocab']

In [None]:
# Pretrained T5 (base) tokenizer and model. These rebind `tokenizer` and `model`,
# which previously held the BERT objects.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5Model.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [53]:
def get_t5_word_vec(word, tokenizer, model):
    """Embed `word` with the model's encoder, averaged over its sub-tokens.

    Args:
        word: the string to embed.
        tokenizer: callable returning a mapping with 'input_ids' and 'attention_mask'
            (called with return_tensors='pt' and add_special_tokens=False).
        model: object exposing an `encoder` callable whose result has `last_hidden_state`.

    Returns:
        `last_hidden_state` averaged over axis 1 (the token axis for a
        (batch, tokens, hidden) output), computed without gradient tracking.
    """
    tok = tokenizer(word, return_tensors='pt', add_special_tokens=False)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        emb = model.encoder(
            input_ids=tok['input_ids'],
            attention_mask=tok['attention_mask'],
            return_dict=True
        )

    return emb.last_hidden_state.mean(dim=1)

In [54]:
# One 768-wide row per vocabulary word, stored at the word's CountVectorizer column index.
# NOTE(review): `vocab` was reloaded from test_data.npz above, but `vectorizer` must still
# be alive in the kernel from the CountVectorizer cell.
embeddings = np.zeros((len(vocab), 768))

for word in tqdm(vocab):
    word_vec = get_t5_word_vec(word, tokenizer, model)
    embeddings[vectorizer.vocabulary_[word]] = word_vec.squeeze().detach().numpy()

100%|██████████| 4938/4938 [10:32<00:00,  7.81it/s]


In [55]:
# True would mean at least one row sums to exactly zero, i.e. was probably never written.
(embeddings.sum(axis=1) == 0).max()  # check that all were filled in

False

In [56]:
## save down everything
# Same training arrays, paired with the T5 encoder embedding matrix.
arrays_to_save = {
    'train_tokens': train_tokens,
    'train_counts': train_counts,
    'train_times': train_times,
    'vocab': vocab,
    'embeddings': embeddings,
}
np.savez_compressed('test_data_t5.npz', **arrays_to_save)

In [101]:
# Exploratory: inspect what the model reports as its output-embedding layer.
model.get_output_embeddings()

In [159]:
# Spot check: encode a single word and look at the first 10 components of its
# token-averaged last hidden state.
tok = tokenizer('hello', return_tensors='pt', add_special_tokens=False)
encoder_kwargs = {
    'input_ids': tok['input_ids'],
    'attention_mask': tok['attention_mask'],
    'return_dict': True,
}
emb = model.encoder(**encoder_kwargs)
emb.last_hidden_state.mean(axis=1).squeeze()[:10]

tensor([ 0.1429,  0.2725, -0.0193,  0.2019, -0.0759,  0.4104,  0.0384, -0.2304,
        -0.2852, -0.3313], grad_fn=<SliceBackward0>)

In [168]:
def get_t5_word_vec_alt(word, tokenizer, model):
    """Embed `word` from the encoder's token-embedding table only.

    The word is tokenized without special tokens, its ids are looked up through
    `model.encoder.embed_tokens`, and the result is averaged over axis 1 and
    squeezed.
    """
    token_ids = tokenizer(word, return_tensors='pt', add_special_tokens=False)['input_ids']
    static_vectors = model.encoder.embed_tokens(token_ids)
    return static_vectors.mean(dim=1).squeeze()

In [170]:
# One 768-wide row per vocabulary word, stored at the word's CountVectorizer column index.
embeddings_alt = np.zeros((len(vocab), 768))

for word in tqdm(vocab):
    word_vec = get_t5_word_vec_alt(word, tokenizer, model)
    embeddings_alt[vectorizer.vocabulary_[word]] = word_vec.squeeze().detach().numpy()

100%|██████████| 4938/4938 [00:00<00:00, 6422.10it/s]


In [171]:
## save down everything
# Same training arrays, paired with the T5 token-embedding-table matrix.
arrays_to_save = {
    'train_tokens': train_tokens,
    'train_counts': train_counts,
    'train_times': train_times,
    'vocab': vocab,
    'embeddings': embeddings_alt,
}
np.savez_compressed('test_data_t5_alt.npz', **arrays_to_save)

In [212]:
def get_t5_word_vec_pooled(word, tokenizer, model):
    """Embed `word` by pooling the encoder's hidden states over layers and tokens.

    Args:
        word: the string to embed.
        tokenizer: callable returning a mapping with 'input_ids' and 'attention_mask'
            (called with return_tensors='pt' and add_special_tokens=False).
        model: object exposing an `encoder` callable whose result has `hidden_states`,
            a sequence with one tensor per layer.

    Returns:
        A 1-D tensor: the hidden states averaged over all layers and then over the
        token axis, assuming each layer's tensor is (1, tokens, hidden).
    """
    # Tokenize the word that was passed in, not a fixed string, so every word
    # gets its own vector.
    tok = tokenizer(word, return_tensors='pt', add_special_tokens=False)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        emb = model.encoder(
            input_ids=tok['input_ids'],
            attention_mask=tok['attention_mask'],
            return_dict=True,
            output_hidden_states=True
        )

    # Stack to (layers, batch, tokens, hidden) and average the layers away.
    layer_mean = torch.stack(emb.hidden_states).mean(dim=0)
    # Average over tokens as well so multi-token words also yield a single vector,
    # then drop the batch axis.
    return layer_mean.mean(dim=1).squeeze(0)

In [213]:
# One 768-wide row per vocabulary word, stored at the word's CountVectorizer column index.
embeddings_pooled = np.zeros((len(vocab), 768))

for word in tqdm(vocab):
    pooled_vec = get_t5_word_vec_pooled(word, tokenizer, model)
    embeddings_pooled[vectorizer.vocabulary_[word]] = pooled_vec.detach().numpy()

100%|██████████| 4938/4938 [06:41<00:00, 12.30it/s]


In [214]:
## save down everything
# Same training arrays, paired with the layer-pooled T5 embedding matrix.
arrays_to_save = {
    'train_tokens': train_tokens,
    'train_counts': train_counts,
    'train_times': train_times,
    'vocab': vocab,
    'embeddings': embeddings_pooled,
}
np.savez_compressed('test_data_t5_pooled.npz', **arrays_to_save)