In [3]:
# import fasttext
import pandas as pd
import re
from collections import defaultdict

# Produce `vocab.txt` from `book.txt`

In [4]:
# Count how often each purely alphabetic token (lower-cased) occurs in
# `book.txt`, and write a lower-cased, punctuation-free copy of the text to
# `book_preprocess.txt`, one output line per non-blank input line.
vocab = defaultdict(int)
with open('book.txt') as f, open('book_preprocess.txt', 'w') as out:
    for line in f:
        # re.split leaves empty strings at the edges (e.g. for the trailing
        # newline), so drop them before counting or joining.
        words = [word for word in re.split(r'\W+', line) if word]
        if not words:
            # Blank or punctuation-only line: nothing to count or write.
            # (`if line:` could never catch this, since every line read from
            # the file still carries its newline and is therefore truthy.)
            continue
        for word in words:
            # Only alphabetic tokens enter the vocabulary; tokens containing
            # digits or underscores stay in the preprocessed text only.
            if word.isalpha():
                vocab[word.lower()] += 1
        # Terminate every line so the output does not collapse into a single
        # line padded with stray spaces.
        out.write(" ".join(words).lower() + "\n")

In [10]:
# Drop rare words: keep only words that occur more than MIN_COUNT times.
MIN_COUNT = 2
newV = {word: count for word, count in vocab.items() if count > MIN_COUNT}
# The labels state the actual rule (frequency <= MIN_COUNT is removed).
print("Vocabulary size before pruning: ", len(vocab))
print("Vocabulary size after pruning words with frequency <= %d: " % MIN_COUNT, len(newV))

Before pruning words with frequency:  5701
After pruning low frequency words (f==2):  2347


In [11]:
# Rebind `vocab` to the pruned words as a sorted list
# (from here on `vocab` is a list, no longer the frequency dict).
vocab = sorted(newV)

In [13]:
# Write the vocabulary to vocab.txt, one word per line.
with open('./edu-out/vocab.txt', 'w') as vocab_file:
    vocab_file.writelines(word + '\n' for word in vocab)

# Load the wiki-simple model and vectorize `vocab.txt`


- `./fasttext print-word-vectors ./wiki-simple/wiki.simple.bin < edu-out/vocab.txt > edu-out/vectors.txt`

# Compute word similarity for each pair of words in `vocab.txt`

In [14]:
# Load the word vectors into the `word2vec` dictionary, mapping each word to
# its vector as a list of floats. Every non-empty line is read as a word
# followed by its whitespace-separated vector components.
word2vec = {}
with open('./edu-out/vectors.txt') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing empty line) instead of
            # failing with IndexError on the first field.
            continue
        word2vec[fields[0]] = [float(component) for component in fields[1:]]

In [15]:
# compute word similarities using scipy 
from scipy import spatial

def getSim(w1, w2):
    """Return the absolute cosine similarity of two words.

    Both words are looked up in the module-level ``word2vec`` mapping (a
    missing word raises ``KeyError``); the result is
    ``abs(1 - cosine_distance)`` of their vectors.
    """
    first_vector = word2vec[w1]
    second_vector = word2vec[w2]
    cosine_distance = spatial.distance.cosine(first_vector, second_vector)
    return abs(1 - cosine_distance)

# One [word, other_word, similarity] row per unordered pair of distinct words;
# the `first < second` filter keeps each pair once and drops self-pairs.
data = [
    [first, second, getSim(first, second)]
    for first in vocab
    for second in vocab
    if first < second
]
            

# Save to links.csv and nodes.csv

In [17]:
# Edge table: one row per word pair with its similarity score.
link_columns = ['from', 'to', 'sim']
links = pd.DataFrame(data=data, columns=link_columns)
links.to_csv("./edu-out/links.csv", index=False)

In [18]:
# Preview the first rows of the pairwise-similarity table.
links.head()

Unnamed: 0,from,to,sim
0,a,aaaaaahhh,0.243774
1,a,able,0.287469
2,a,aboard,0.141293
3,a,about,0.341711
4,a,above,0.212498


In [21]:
# Node table: every vocabulary word becomes a row in a single `id` column.
nodes = vocab
df = pd.DataFrame(nodes, columns=['id'])
df.to_csv("./edu-out/nodes.csv", index=False)
# Preview the first rows of the node table.
df.head()

Unnamed: 0,id
0,a
1,aaaaaahhh
2,able
3,aboard
4,about
