In [60]:
from sklearn.manifold import TSNE
from collections import Counter
from six.moves import cPickle
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing
import os
import sys
import io
import logging
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk
import json

from gensim.models import word2vec as w2v
# Make sure NLTK's 'punkt' data package is available locally.
# NOTE(review): presumably needed by the nltk.word_tokenize call in the main cell - confirm.
nltk.download('punkt') 

[nltk_data] Downloading package punkt to /Users/bray/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [61]:
def tokenize_sentences(sentences):
    """Split each sentence on whitespace into lower-cased tokens.

    Args:
        sentences: sequence of strings.

    Returns:
        A list with one token list per input sentence; sentences that
        yield no tokens are left out.

    Progress is reported through ``print_progress`` every 50 sentences.
    """
    total = len(sentences)
    print("Got " + str(total) + " sentences.")
    tokenized = []
    for index, sentence in enumerate(sentences):
        # The capturing group makes re.split return the whitespace runs too;
        # stripping turns those into empty strings, which are discarded.
        pieces = (piece.strip().lower() for piece in re.split(r'(\s+)', sentence))
        words = [piece for piece in pieces if piece]
        if words:
            tokenized.append(words)
        if index % 50 == 0:
            print_progress(index, total)
    return tokenized

In [62]:
def clean_sentences(tokens):
    """Remove stopwords and purely numeric tokens from tokenized sentences.

    Stopwords are the "fi" entry of ``stopwords-iso.json`` plus a hard-coded
    list of extra noise tokens. When that JSON file cannot be loaded
    (``load_json`` returns None) no stopword filtering is applied at all,
    not even the extra list.

    Args:
        tokens: list of sentences, each a list of string tokens.

    Returns:
        A list of cleaned sentences, one entry per input sentence that still
        has at least one token after filtering.
    """
    all_stopwords = load_json("stopwords-iso.json")
    extra_stopwords = ["to", "lle", "h.", "oo", "on", "muk", "kov", "km", "ia", "täm", "sy", "but", ":sta", "hi", "py", "xd", "rr", "x:", "smg", "kum", "uut", "kho", "k", "04n", "vtt", "htt", "the", "kin", "#8", "van", "tii", "lt3", "g", "ko", "ett", "mys", "tnn", "hyv", "tm", "mit", "tss", "siit", "pit", "viel", "of", "n", "saa", "tll", "eik", "nin", "nii", "t", "tmn", "lsn", "j", "miss", "pivn", "yhn", "mik", "tn", "tt", "sek", "lis", "mist", "tehd", "sai", "l", "thn", "mm", "k", "ku", "s", "hn", "nit", "s", "no", "m", "ky", "tst", "mut", "nm", "y", "lpi", "siin", "a", "in", "ehk", "h", "e", "piv", "oy", "p", "yh", "sill", "min", "o", "va", "el", "tyn", "na", "the", "tit", "to", "iti", "tehdn", "tlt", "ois", ":", "v", "?", "!", "&","//","href","\\","``"]
    stopwords = None
    if all_stopwords is not None:
        # A set gives constant-time membership tests instead of scanning the
        # whole stopword list for every token.
        stopwords = set(all_stopwords["fi"])
        stopwords.update(extra_stopwords)
    # Tokens made only of digits, dots, dashes, whitespace and slashes.
    numeric_only = re.compile(r"^[0-9\.\-\s\/]+$")
    ret = []
    max_s = len(tokens)
    for count, sentence in enumerate(tokens):
        if count % 50 == 0:
            print_progress(count, max_s)
        cleaned = [
            token for token in sentence
            if len(token) > 0
            and (stopwords is None or token not in stopwords)
            and not numeric_only.search(token)
        ]
        # Append once per sentence. The previous version did this inside the
        # token loop and so added the same sentence once per token.
        if len(cleaned) > 0:
            ret.append(cleaned)
    return ret

In [63]:
def get_word_frequencies(corpus):
    """Count how often each word occurs across all sentences of ``corpus``.

    Args:
        corpus: iterable of sentences, each an iterable of hashable words.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common()``.
    """
    counts = Counter(word for sentence in corpus for word in sentence)
    return counts.most_common()


In [64]:
def try_load_or_process(filename, processor_fn, function_arg):
    """Return the cached result stored in ``filename``, computing it if absent.

    File names ending in "json" are read and written with ``load_json`` /
    ``save_json``; everything else goes through ``load_bin`` / ``save_bin``.

    Args:
        filename: path of the cache file.
        processor_fn: one-argument callable that produces the result when no
            cache file exists.
        function_arg: the argument passed to ``processor_fn``.

    Returns:
        Whatever the loader returns when the file exists, otherwise the fresh
        result of ``processor_fn(function_arg)`` (which is also saved).
    """
    if filename.endswith("json"):
        load_fn, save_fn = load_json, save_json
    else:
        load_fn, save_fn = load_bin, save_bin

    if not os.path.exists(filename):
        result = processor_fn(function_arg)
        save_fn(result, filename)
        return result
    return load_fn(filename)

def print_progress(current, maximum):
    """Rewrite the current stdout line as ``current/maximum``.

    A carriage return is written first so successive calls overwrite each
    other instead of producing one line per call; stdout is flushed after
    each write.
    """
    stream = sys.stdout
    stream.write("\r")
    stream.flush()
    stream.write("/".join((str(current), str(maximum))))
    stream.flush()

def save_bin(item, filename):
    """Pickle ``item`` into ``filename``, opened in binary write mode."""
    with open(filename, "wb") as handle:
        cPickle.dump(item, handle)

def load_bin(filename):
    """Unpickle and return the object stored in ``filename``.

    Returns None when the file does not exist.
    """
    if not os.path.exists(filename):
        return None
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook (or another trusted source) produced.
    with open(filename, "rb") as handle:
        return cPickle.load(handle)

def save_json(variable, filename):
    """Write ``variable`` to ``filename`` as indented, UTF-8 encoded JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).

    Args:
        variable: any JSON-serialisable object.
        filename: destination path; an existing file is overwritten.
    """
    # Serialise before opening the file so a failing dump does not leave a
    # truncated file behind.
    text = json.dumps(variable, indent=4, ensure_ascii=False)
    # `unicode` does not exist on Python 3. Decode only when json.dumps
    # returned a byte string, which can happen on Python 2.
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    with io.open(filename, "w", encoding="utf-8") as f:
        f.write(text)

def load_json(filename):
    """Load and return the JSON document stored in ``filename``.

    Returns:
        The decoded object, or None when the file does not exist, cannot be
        read, or does not contain valid UTF-8 JSON.
    """
    if not os.path.exists(filename):
        return None
    try:
        with io.open(filename, "r", encoding="utf-8") as f:
            return json.load(f)
    except (IOError, OSError, ValueError) as err:
        # Still best-effort (callers treat None as "not available"), but only
        # I/O and decode/parse errors are caught, so KeyboardInterrupt and
        # programming errors are no longer swallowed. The failure is logged.
        logging.warning("Could not load JSON from %s: %s", filename, err)
        return None

In [65]:
def process_raw_data(input_file):
    """Read a UTF-8 text file and return its cleaned, de-duplicated lines.

    For every line, in this order: a URL and everything after it on the line
    is removed, then ``@name`` mentions, ``&amp`` entities, trailing runs of
    ':' / '.', and a leading "RT" marker. After that every character outside
    the ``valid`` whitelist is dropped and the result is stripped. Only lines
    with more than five whitespace-separated words are kept, and each
    distinct line is kept once, in first-seen order.

    Args:
        input_file: path of the raw text file. A missing file yields [].

    Returns:
        List of cleaned line strings.
    """
    valid = frozenset(u"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ#@.:/ äöåÄÖÅ")
    # Compiled once, not once per line. Raw strings keep the patterns
    # byte-identical while avoiding invalid string-escape warnings.
    removals = [
        re.compile(r"(https?:\/\/[0-9a-zA-Z\-\_]+\.[\-\_0-9a-zA-Z]+\.?[0-9a-zA-Z\-\_]*\/?.*)"),
        re.compile(r"\@[\_0-9a-zA-Z]+\:?"),
        re.compile(r"\&amp\;?"),
        re.compile(r"[\:\.]{1,}$"),
        re.compile(r"^RT\:?"),
    ]
    lines = []
    print("Loading raw data from: " + input_file)
    if os.path.exists(input_file):
        with io.open(input_file, 'r', encoding="utf-8") as f:
            lines = f.readlines()
    num_lines = len(lines)
    ret = []
    # Tracks lines already emitted so the duplicate check is a set lookup
    # instead of a scan over `ret` for every line.
    seen = set()
    for count, text in enumerate(lines):
        if count % 50 == 0:
            print_progress(count, num_lines)
        for pattern in removals:
            text = pattern.sub(u"", text)
        text = u''.join(x for x in text if x in valid)
        text = text.strip()
        if len(text.split()) > 5 and text not in seen:
            seen.add(text)
            ret.append(text)
    return ret

In [66]:
def get_word2vec(sentences):
    """Load the cached Word2Vec model, or train and cache a new one.

    The model file is ``word_vectors.w2v`` inside the module-level
    ``save_dir``. If that file exists it is loaded and returned as-is, and
    ``sentences`` is ignored; delete the file to force retraining on new data.

    Args:
        sentences: list of token lists used for vocabulary building and
            training when no cached model exists.

    Returns:
        A gensim ``Word2Vec`` model.
    """
    w2v_file = os.path.join(save_dir, "word_vectors.w2v")
    if os.path.exists(w2v_file):
        print("w2v model loaded from " + w2v_file)
        return w2v.Word2Vec.load(w2v_file)

    # Make sure the output directory exists up front; otherwise the save at
    # the end fails on a fresh checkout after all the training work is done.
    if save_dir and not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    num_workers = multiprocessing.cpu_count()
    num_features = 20
    epoch_count = 10
    sentence_count = len(sentences)
    # NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spellings, kept
    # here to match the rest of this notebook.
    word2vec = w2v.Word2Vec(sg=1,
                            seed=1,
                            workers=num_workers,
                            size=num_features,
                            min_count=6,
                            window=5,
                            sample=0)
    print("Building vocab...")
    word2vec.build_vocab(sentences)
    print("Word2Vec vocabulary length:", len(word2vec.wv.vocab))
    print("Training...")
    word2vec.train(sentences, total_examples=sentence_count, epochs=epoch_count)
    print("Saving model...")
    word2vec.save(w2v_file)
    return word2vec

In [67]:
if __name__ == '__main__':

    # Directory for the cached word2vec model; get_word2vec reads this global.
    save_dir = 'analysis'
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    fname = 'FOMC.json'
    # Read with an explicit encoding, matching every other file access in this
    # notebook, so the result does not depend on the platform default.
    # NOTE(review): assumes the input file is UTF-8 - confirm.
    with io.open(fname, 'r', encoding='utf-8') as f:
        content = f.readlines()

    content = [x.strip() for x in content]
    # One token list per input line.
    sentence = [nltk.word_tokenize(x) for x in content]

    print("Cleaning tokens")
    cleaned = clean_sentences(sentence)

    print("Getting word frequencies")
    frq = get_word_frequencies(cleaned)
    ss = len(frq)
    print("Unique words: " + str(ss))

    print("Instantiating word2vec model")
    # NOTE(review): the model is built from the raw `sentence` tokens, not
    # from `cleaned`, so stopwords and punctuation end up in the vocabulary.
    # Confirm whether `cleaned` was intended here.
    word2vec = get_word2vec(sentence)
    vocab = list(word2vec.wv.vocab.keys())
    vocab_len = len(vocab)
    print("word2vec vocab contains " + str(vocab_len) + " items.")
    dim01 = word2vec.wv[vocab[0]].shape[0]
    print("word2vec items have " + str(dim01) + " features.")

Cleaning tokens
500/528Getting word frequencies
Unique words: 1063
Instantiating word2vec model
w2v model loaded from analysis/word_vectors.w2v
word2vec vocab contains 555 items.
word2vec items have 20 features.


In [68]:
import pprint as pp

In [69]:
# Pretty-print the ten (word, similarity) pairs most similar to 'rates'.
pp.pprint(word2vec.wv.most_similar('rates', topn=10))

[('lower', 0.7875243425369263),
 ('interest', 0.7857251167297363),
 ('federal', 0.7706447839736938),
 ('rate', 0.763451099395752),
 ('funds', 0.7574094533920288),
 ('paid', 0.7512528896331787),
 ('primary', 0.7429357767105103),
 ('establishment', 0.7386176586151123),
 ('raise', 0.7290594577789307),
 ('unemployment', 0.7193112373352051)]


In [70]:
from gensim.models.word2vec import Word2Vec   
import pickle
# Reload the saved model from disk; relies on `save_dir` set in the main cell.
w2v_file = os.path.join(save_dir, "word_vectors.w2v")
model = Word2Vec.load(w2v_file)  # WORD_vector.w2v is the model I have already generated
 
print(model.wv.index2word)    # get all the vocabulary words

['the', ',', 'of', 'to', 'and', '.', '``', 'in', 'Committee', "''", 'Federal', 'a', 'rate', ';', 'inflation', 'percent', 'that', 'Reserve', 'will', 'at', 'The', 'for', 'securities', '<', '>', 'its', 'market', 'policy', 'Open', 'Market', 'federal', 'funds', "'s", 'on', 'economic', 'agency', 'conditions', '2', 'range', 'labor', 'has', 'as', 'operations', 'monetary', 'by', ':', 'mortgage-backed', 'target', 'with', 'are', 'Desk', 'from', 'expected', '/a', 'href=\\', 'is', 'In', 'directs', 'System', 'information', 'Treasury', 'measures', 'necessary', 'Board', 'over', 'payments', 'be', '[', ']', 'holdings', '$', 'billion', 'principal', 'maintain', 'action', 'This', 'developments', 'stance', 'continue', 'employment', 'Governors', 'activity', 'have', 'been', 'remain', 'expectations', 'maximum', 'objective', 'including', 'financial', 'further', 'implement', 'outlook', 'repurchase', 'regarding', 'received', 'recent', 'or', 'New', 'York', 'voted', 'reverse', 'adjustments', 'open', 'day', 'indicat

In [71]:
# Below: the individual word vector of every vocabulary word.
for word in model.wv.index2word:
    # Index through model.wv: indexing the model directly (model[word]) is
    # deprecated in gensim 3.x and emits a DeprecationWarning.
    print(word, model.wv[word])

  This is separate from the ipykernel package so we can avoid doing imports until


the [ 0.81828713 -0.09419951 -0.8976326   0.6634239  -0.27133456 -0.12169682
 -0.7159239  -0.6627891   0.7825207   0.5706934   1.1251359  -0.12269952
 -0.23721167 -0.2968914  -0.789592    0.7159784  -0.7445598   0.3158887
 -0.6129253   0.1503234 ]
, [-0.4862149  -0.9763807  -0.32862112  0.08487158 -0.8246975  -0.65790576
  0.8180908   0.32483017 -0.55432343  0.42611858  0.66027623 -0.67931694
 -0.01999711  0.64894253 -0.15430415 -0.04898752 -0.34702092  0.47467718
 -0.06276298 -0.36680526]
of [-0.6288902  -0.05440478 -0.71819186 -0.04372467  1.1358664  -0.21301681
 -0.61157     0.03899534 -0.1222536   1.0892292   0.45347765  0.13858536
 -0.33791515  0.98079586  0.7404729   1.1760286  -0.3503008   0.3801416
 -0.8291922  -0.51516783]
to [-0.08624625 -0.2044463  -0.73356354 -0.19311193 -0.37301818  0.6907422
 -1.083521    1.3263118   0.44584274  0.6003299   1.028213   -0.29746798
 -0.2775775   0.12037443  0.03686144 -0.3575258   1.1715859   0.5148448
 -1.0705296  -0.22964223]
and [ 1.0968

 -0.01733248 -0.15409812]
reinvestments [ 0.89385533  0.33055678 -0.8382946  -0.40403074  1.4896908   0.5406582
  0.38881803  0.2826348   0.59312916  0.29725477  0.33174762 -1.1588213
 -0.29996154  0.63955444 -1.1731559  -0.54492867 -0.63498896  1.7419782
 -1.0245998   0.1972554 ]
risks [ 1.0980535  -1.1304654  -0.5035617   0.0398721  -0.3520432  -0.4270346
 -0.99878514  0.11215083  0.62109864  1.3248193   0.21275368  0.8767447
 -0.5743278   0.36165196 -0.27735037 -0.6869263   0.04090898  0.9929528
 -0.641256   -0.70387787]
appear [ 0.9530995  -1.4043404   0.09576844  0.68538964 -0.4017522   0.1742632
 -0.8488268  -0.10598463  0.4076869   1.6586508   0.2765183   0.56738263
 -0.8280853   0.30897626 -0.684704   -0.87513787  0.12892692  1.4153239
 -0.47806546 -0.14375219]
2016 [-0.72085536 -0.033063   -0.11617226  1.3805299  -0.5201974   1.0607972
  0.5020159  -0.8143531  -0.4570458   0.89784443  0.18552116 -1.1570941
  0.38568678  0.43802658 -0.7492457  -0.07751353 -0.21274601  0.9016816