In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

In [3]:
# Load the pre-cleaned corpus; later cells read its `clean_text` column.
df = pd.read_parquet(path='data/combined_clean.parquet')

In [4]:
# Display the loaded frame (the rendered output elides the middle rows).
df

Unnamed: 0,date,text,type,clean_text
0,1994-02-04,Chairman Alan Greenspan announced today that t...,fomc_statement,announce today federal open market committee d...
1,1994-03-22,Chairman Alan Greenspan announced today that t...,fomc_statement,announce today federal open market committee d...
2,1994-04-18,Chairman Alan Greenspan announced today that t...,fomc_statement,announce today increase slightly degree pressu...
3,1994-05-17,The Federal Reserve today announced two action...,fomc_statement,reserve today announce action design maintain ...
4,1994-08-16,The Federal Reserve announced today the follow...,fomc_statement,reserve announce today follow monetary policy ...
...,...,...,...,...
1645,2022-07-27,"The Federal Reserve, the central bank of the U...",fomc_statement,central bank provide nation safe flexible sta...
1646,2022-09-21,"The Federal Reserve, the central bank of the U...",fomc_statement,central bank provide nation safe flexible sta...
1647,2022-11-02,"The Federal Reserve, the central bank of the U...",fomc_statement,central bank provide nation safe flexible sta...
1648,2022-12-14,"The Federal Reserve, the central bank of the U...",fomc_statement,central bank provide nation safe flexible sta...


In [90]:
# Restricted vocabulary: drop terms found in fewer than 10 documents or in more
# than 90% of them, then count the surviving terms per document.
alt_vectorizer = CountVectorizer(min_df=10, max_df=0.9)
alt_counts = alt_vectorizer.fit_transform(df['clean_text'])
# Column count == number of terms that survived the frequency filters.
alt_counts.shape[1]

6423

In [91]:
# Corpus-wide frequency of every term: sum each column of the document-term
# matrix, then drop the leftover length-1 row axis.
total_counts = np.asarray(alt_counts.sum(axis=0)).squeeze()

In [92]:
# Spot check: which term lives in column 2294 of the restricted vocabulary?
alt_feature_names = alt_vectorizer.get_feature_names_out()
alt_feature_names[2294]

'financial'

In [93]:
# Total number of occurrences of 'inflation' across all documents.
inflation_col = alt_vectorizer.vocabulary_['inflation']
alt_counts[:, inflation_col].sum()

15733

In [94]:
# The ten most frequent terms. argpartition only guarantees that the last ten
# positions hold the ten largest totals; it does not order them among themselves.
top10_cols = np.argpartition(total_counts, -10)[-10:]
alt_vectorizer.get_feature_names_out()[top10_cols].tolist()

['economy',
 'monetary',
 'year',
 'price',
 'bank',
 'inflation',
 'rate',
 'policy',
 'risk',
 'financial']

In [5]:
# Unfiltered bag-of-words model: learn the full vocabulary and build the
# document-term count matrix in a single pass.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(df['clean_text'])

In [6]:
# Convert the sparse document-term matrix into per-document ragged arrays:
#   train_tokens[d] -> column indices (vocabulary ids) of the terms present in document d
#   train_counts[d] -> how often each of those terms occurs, in the same order
train_tokens = []
train_counts = []
# Sparse matrices do not report a usable len(), so give tqdm the row count explicitly.
for row in tqdm(counts, total=counts.shape[0]):
    entries = row.tocoo()
    present = entries.data > 0
    # Read ids and counts from the same COO entries so they stay aligned and are
    # always 1-D. (np.squeeze would collapse a document with exactly one distinct
    # term into a 0-d scalar, no longer matching its (1,)-shaped token array.)
    train_tokens.append(entries.col[present])
    train_counts.append(entries.data[present])

# dtype='object' because documents hold different numbers of distinct terms.
train_tokens = np.array(train_tokens, dtype='object')
train_counts = np.array(train_counts, dtype='object')

1650it [00:00, 5079.69it/s]


In [7]:
## arbitrarily split it into 5 even chunks
n_docs = df.shape[0]
n_chunks = 5
# Row index at which each chunk starts: 0, n/5, 2n/5, ...
slices = np.array([int(i * n_docs / n_chunks) for i in range(n_chunks)])
# Chunk id of a row = position of the last chunk start that is <= the row index.
# One vectorised binary search replaces rebuilding list(slices) and a boolean
# mask for every single row.
train_times = np.searchsorted(slices, np.arange(n_docs), side='right') - 1

In [8]:
# vocabulary_ is a term -> column-index dict whose key order is NOT the column
# order (its first key, 'announce', maps to column 1508). Listing its keys would
# therefore misalign `vocab` with the token ids in train_tokens and with the rows
# of `embeddings`. get_feature_names_out() returns the terms ordered by column
# index, so vocab[i] is the term behind token id i.
vocab = list(vectorizer.get_feature_names_out())

In [9]:
# One row per vocabulary term, 96 columns wide; starts as all zeros and is
# filled row by row from the spaCy vectors later in the notebook.
embeddings = np.zeros(shape=(len(vocab), 96))

In [12]:
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
import spacy
# Load spaCy's 'en_core_web_sm' pipeline; `model` is later called on each vocabulary term.
model = spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Run each vocabulary term through the spaCy pipeline on its own and store the
# resulting vector in the row matching the term's CountVectorizer column index.
# NOTE(review): the `sm` pipelines are documented as shipping without static
# word vectors - confirm `.vector` is the representation wanted here.
for term in tqdm(vocab):
    embeddings[vectorizer.vocabulary_[term]] = model(term).vector

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25297/25297 [01:05<00:00, 385.51it/s]


In [23]:
# True if any row is still entirely zero, i.e. was never filled in; False means
# every term received a vector. Testing the elements themselves (rather than the
# row sum) cannot misreport a filled row whose values happen to cancel to zero,
# and does not raise on an empty array.
(~embeddings.any(axis=1)).any()

False

In [27]:
## save down everything
# Each key becomes the name of one array inside the .npz archive.
# train_tokens / train_counts are object arrays, which NumPy stores via pickle,
# so reading them back needs np.load(..., allow_pickle=True).
bundle = {
    'train_tokens': train_tokens,
    'train_counts': train_counts,
    'train_times': train_times,
    'vocab': np.array(vocab),
    'embeddings': embeddings,
}
np.savez_compressed('test_data.npz', **bundle)

In [34]:
# Reverse lookup: print the term(s) that CountVectorizer assigned to column 4273.
matching_terms = [term for term, col in vectorizer.vocabulary_.items() if col == 4273]
for term in matching_terms:
    print(term)

coax


In [33]:
# Inspect the full embedding vector stored in row 16647.
embeddings[16647, :]

array([ 8.04942966e-01,  2.82066727e+00, -2.45993696e-02,  3.71220708e-03,
        3.15880626e-02,  8.19706023e-01,  5.88477373e-01,  9.99101698e-01,
       -7.47424901e-01, -6.34844840e-01, -1.01221287e+00, -7.57269800e-01,
       -1.44968343e+00, -6.64046705e-01,  7.97759295e-01, -5.73712945e-01,
        1.13579583e+00, -1.33477771e+00, -4.35051203e-01, -1.92907721e-01,
        9.12154078e-01, -7.18848288e-01, -1.73660919e-01,  2.30327535e+00,
       -7.06190169e-02,  1.19800162e+00,  1.38592720e-03,  2.60853916e-01,
        1.33614397e+00, -1.77492058e+00, -1.23783541e+00, -2.07798705e-02,
        1.40995812e+00, -2.40868449e-01, -8.48563790e-01,  5.94857931e-01,
       -1.47608325e-01,  3.34931135e-01, -1.26378238e-02,  7.23120749e-01,
       -3.38393062e-01,  5.97459733e-01,  8.26892436e-01, -1.06518865e-01,
       -9.42761421e-01,  4.34865892e-01,  6.35469675e-01, -2.15899676e-01,
        7.72261858e-01, -8.60323548e-01, -2.22877577e-01, -1.18963599e+00,
       -5.50807834e-01,  

In [35]:
# Column index of one specific term.
# NOTE(review): 'operationsthe' looks like two words fused together - confirm
# whether the upstream text cleaning should have separated them.
probe_term = 'operationsthe'
vectorizer.vocabulary_[probe_term]

15681

In [28]:
# Show the fitted term -> column-index mapping in full (one entry per vocabulary term).
vectorizer.vocabulary_

{'announce': 1508,
 'today': 22892,
 'federal': 8700,
 'open': 15656,
 'market': 13688,
 'committee': 4458,
 'decide': 5886,
 'increase': 11390,
 'slightly': 20859,
 'degree': 6048,
 'pressure': 17424,
 'reserve': 19169,
 'position': 17134,
 'action': 784,
 'expect': 8293,
 'associate': 1958,
 'small': 20902,
 'short': 20585,
 'term': 22557,
 'money': 14488,
 'interest': 11972,
 'rate': 18279,
 'decision': 5894,
 'take': 22310,
 'accommodative': 695,
 'stance': 21366,
 'monetary': 14482,
 'policy': 16982,
 'order': 15741,
 'sustain': 22117,
 'enhance': 7749,
 'economic': 7306,
 'expansion': 8285,
 'chairman': 3826,
 'immediately': 11113,
 'avoid': 2200,
 'misunderstanding': 14367,
 'purpose': 18011,
 'give': 9809,
 'fact': 8501,
 'firming': 8926,
 'condition': 4748,
 'early': 7230,
 'design': 6259,
 'maintain': 13519,
 'favorable': 8660,
 'trend': 23192,
 'inflation': 11576,
 'board': 2909,
 'approve': 1707,
 'discount': 6632,
 'percent': 16523,
 'effective': 7410,
 'agree': 1125,
 'al