In [None]:
import pandas as pd
import numpy as np
import os.path
import gensim.downloader as api
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
import nltk
from scipy.spatial.distance import cosine
from gensim.models.keyedvectors import KeyedVectors
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import time
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import random
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import utils
import os

## Load the CCN Text File

In [None]:
# Read the whole CCN textbook into a single string. The `with` block closes
# the file on exit, so no explicit close() is needed.
# NOTE(review): no encoding is given, so the platform default is used - confirm
# the file's encoding if this notebook is run on another OS.
with open('ccnbook_ed4.txt', 'r') as f:
    data = f.read()

## Test Words

In [None]:
# Paired test phrases: w1_list[i] is compared against w2_list[i] by the
# "Test Words" cells below (the two lists are zipped together, so they must
# stay the same length and in the same order).
w1_list = ['dyslexia', 'dyslexia', 'dyslexia', 'dyslexia', 'dyslexia', 'dyslexia', 'added', 'summary', 'introducing', 'continuous', 'receptive']
w2_list = ['reading problem', 'speech problem', 'speaking problem', 'reading', 'speech', 'speaking', 'adding', 'summarize', 'introduction', 'continuum', 'receptors']

## Test Quiz

In [None]:
# Multiple-choice quiz: ques[i] is scored against each of the three candidate
# phrases in answers[i] by the "Test Quiz" cells below (zipped together).
# NOTE(review): the first candidate of each row looks like the intended correct
# answer, so a printed argmax of 0 would mean "correct" - confirm.
ques = ['neural activation function', 'transformation', 'bidirectional connectivity', 'cortex learning', 'object recognition','attention','weight based priming','hippocampus learning', 'dyslexia', 'past tense']
answers = [['spiking rate code membrane potential point', 'interactive bidirectional feedforward', 'language generalization nonwords'], 
           ['emphasizing distinctions collapsing differences', 'error driven hebbian task model based', 'spiking rate code membrane potential point'],
           ['amplification pattern completion','competition inhibition selection binding','language generalization nonwords'],
           ['error driven task based hebbian model','error driven task based','gradual feature conjunction spatial invariance'],
           ['gradual feature conjunction spatial invariance','error driven task based hebbian model','amplification pattern completion'],
           ['competition inhibition selection binding','gradual feature conjunction spatial invariance','spiking rate code membrane potential point'],
           ['long term changes learning','active maintenance short term residual','fast arbitrary details conjunctive'],
           ['fast arbitrary details conjunctive','slow integration general structure','error driven hebbian task model based'],
           ['surface deep phonological reading problem damage', 'speech output hearing language nonwords', 'competition inhibition selection binding'],
           ['overregularization shaped curve', 'speech output hearing language nonwords', 'fast arbitrary details conjunctive']]

## 1. Co-Occurrence Matrix

In [None]:
from collections import defaultdict

def co_occurrence(sentences, window_size):
    """Build a symmetric word-by-word co-occurrence matrix.

    Each sentence is lower-cased and split on whitespace. Every token is
    paired with the `window_size` tokens that follow it, and each unordered
    pair is counted once per occurrence.

    Parameters
    ----------
    sentences : iterable of str
        Raw text lines.
    window_size : int
        Number of following tokens considered as context for each token.

    Returns
    -------
    df : pandas.DataFrame
        Square count matrix indexed and labelled by the sorted vocabulary.
        Cells are int16 when every count fits, otherwise int32.
    vocab : list of str
        Sorted list of the distinct tokens.
    """
    pair_counts = defaultdict(int)
    vocab = set()
    for text in sentences:
        tokens = text.lower().split()
        for i, token in enumerate(tokens):
            vocab.add(token)
            for neighbour in tokens[i + 1 : i + 1 + window_size]:
                # Sort so (a, b) and (b, a) share one counter.
                pair_counts[tuple(sorted((neighbour, token)))] += 1

    vocab = sorted(vocab)
    position = {token: i for i, token in enumerate(vocab)}

    # Keep the compact int16 layout when possible, but widen instead of
    # overflowing when a frequent pair exceeds the int16 range.
    max_count = max(pair_counts.values(), default=0)
    dtype = np.int16 if max_count <= np.iinfo(np.int16).max else np.int32

    # Fill a plain ndarray by integer position; this avoids one label lookup
    # per cell through DataFrame.at.
    matrix = np.zeros((len(vocab), len(vocab)), dtype=dtype)
    for (first, second), count in pair_counts.items():
        row, col = position[first], position[second]
        matrix[row, col] = count
        matrix[col, row] = count

    df = pd.DataFrame(data=matrix, index=vocab, columns=vocab)
    return df, vocab

In [None]:
# Treat every line of the book as one "sentence" and count co-occurrences
# between each token and the 5 tokens that follow it.
sentences = data.split('\n')
df, vocab = co_occurrence(sentences, window_size=5)

In [None]:
def transform_to_vec(sentence, len_cutoff=1):
    """Embed a phrase as the mean co-occurrence row of its known words.

    Parameters
    ----------
    sentence : str
        Whitespace-separated phrase.
    len_cutoff : int, default 1
        Words whose length is not greater than this are ignored.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the `df` rows of the words found in `df`'s index.
        If no word is found, the mean of an empty array (NaN) is returned.
    """
    tokens = [word.lower() for word in sentence.split() if len(word) > len_cutoff]
    # Membership is checked against the matrix's own index. The previous check
    # used `words`, which is not defined until the word2vec section, so it
    # failed on a fresh kernel and could let through words missing from `df`.
    rows = [df.loc[token].values for token in tokens if token in df.index]
    return np.mean(np.array(rows), 0)

In [None]:
# Test Words: print the cosine similarity of each (w1_list[i], w2_list[i]) pair.
for word1, word2 in zip(w1_list, w2_list):
    first, second = word1.lower(), word2.lower()
    w1_vec = transform_to_vec(first)
    w2_vec = transform_to_vec(second)
    sim = 1 - cosine(w1_vec, w2_vec)
    print(' vs '.join((first, second)) + ': ' + str(sim))

In [None]:
# Test Quiz: score each candidate answer against its question, then print the
# index of the highest-scoring candidate.
for q, ans in zip(ques, answers):
    ques_vec = transform_to_vec(q)
    print('Question: ', q)
    print('Answers: ')
    sims = []
    for a in ans:
        ans_vec = transform_to_vec(a)
        sim = 1 - cosine(ques_vec, ans_vec)
        print(a, sim)
        sims.append(sim)
    best = np.argmax(sims)
    print(best)
    print('\n')

## 2. Bag of Words

In [None]:
# Bag-of-words counts: fit_transform yields a (lines x terms) sparse matrix,
# which is transposed and densified so that row i holds the per-line counts of
# term i. `vocab` is rebound here to the vectorizer's term -> row-index dict.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sentences).transpose().toarray()
vocab = vectorizer.vocabulary_

In [None]:
def transform_to_vec(sentence, len_cutoff=1):
    """Embed a phrase as the mean bag-of-words row of its known words.

    Parameters
    ----------
    sentence : str
        Whitespace-separated phrase.
    len_cutoff : int, default 1
        Words whose length is not greater than this are ignored.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the rows of `X` selected through `vocab` for the
        words present in `vocab`. If no word is found, the mean of an empty
        array (NaN) is returned.
    """
    tokens = [word.lower() for word in sentence.split() if len(word) > len_cutoff]
    # Membership is checked against `vocab`, the dict that is actually indexed.
    # The previous check used `words`, which is not defined until the word2vec
    # section, so it failed on a fresh kernel and could let through words
    # missing from `vocab`.
    rows = [X[vocab[token]] for token in tokens if token in vocab]
    return np.mean(np.array(rows), 0)

In [None]:
# Test Words: print the cosine similarity of each (w1_list[i], w2_list[i]) pair.
for word1, word2 in zip(w1_list, w2_list):
    first, second = word1.lower(), word2.lower()
    w1_vec = transform_to_vec(first)
    w2_vec = transform_to_vec(second)
    sim = 1 - cosine(w1_vec, w2_vec)
    print(' vs '.join((first, second)) + ': ' + str(sim))

In [None]:
# Test Quiz: score each candidate answer against its question, then print the
# index of the highest-scoring candidate.
for q, ans in zip(ques, answers):
    ques_vec = transform_to_vec(q)
    print('Question: ', q)
    print('Answers: ')
    sims = []
    for a in ans:
        ans_vec = transform_to_vec(a)
        sim = 1 - cosine(ques_vec, ans_vec)
        print(a, sim)
        sims.append(sim)
    best = np.argmax(sims)
    print(best)
    print('\n')

## 3. word2vec general 

In [None]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's
# downloader API; `wv` is used below for `word in wv` and `wv[word]` lookups.
wv = api.load('word2vec-google-news-300')

In [None]:
def transform_to_vec(sentence, len_cutoff=1):
    """Embed a phrase as the mean pretrained word2vec vector of its known words.

    Parameters
    ----------
    sentence : str
        Whitespace-separated phrase.
    len_cutoff : int, default 1
        Words whose length is not greater than this are ignored.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of `wv[word]` over the words present in `wv`. If no
        word is found, the mean of an empty array (NaN) is returned.
    """
    # The previous version wrapped this in `for doc in sentence`, which walks
    # the string character by character and recomputed the same mean once per
    # character (and left the result undefined for an empty string).
    tokens = [word.lower() for word in sentence.split() if len(word) > len_cutoff]
    vecs = [wv[token] for token in tokens if token in wv]
    return np.mean(np.array(vecs), 0)

In [None]:
# Test Words: print the cosine similarity of each (w1_list[i], w2_list[i]) pair.
for word1, word2 in zip(w1_list, w2_list):
    first, second = word1.lower(), word2.lower()
    w1_vec = transform_to_vec(first)
    w2_vec = transform_to_vec(second)
    sim = 1 - cosine(w1_vec, w2_vec)
    print(' vs '.join((first, second)) + ': ' + str(sim))

In [None]:
# Test Quiz: score each candidate answer against its question, then print the
# index of the highest-scoring candidate.
for q, ans in zip(ques, answers):
    ques_vec = transform_to_vec(q)
    print('Question: ', q)
    print('Answers: ')
    sims = []
    for a in ans:
        ans_vec = transform_to_vec(a)
        sim = 1 - cosine(ques_vec, ans_vec)
        print(a, sim)
        sims.append(sim)
    best = np.argmax(sims)
    print(best)
    print('\n')

## 4. word2vec custom training

In [None]:
# Tokenise the book with the project's `utils` helpers, report corpus size,
# and encode every token as its integer id.
words = utils.preprocess(data)
n_total, n_unique = len(words), len(set(words))
print("Total words: {}".format(n_total))
print("Unique words: {}".format(n_unique))
vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)
int_words = [vocab_to_int[token] for token in words]

In [None]:
# Subsample frequent words: a word with relative frequency f is dropped with
# probability 1 - sqrt(threshold / f), so each occurrence is kept when a
# uniform draw falls below 1 - p_drop.
threshold = 1e-5
word_counts = Counter(int_words)
total_count = len(int_words)

freqs = {}
p_drop = {}
for word_id, count in word_counts.items():
    freqs[word_id] = count/total_count
    p_drop[word_id] = 1 - np.sqrt(threshold/freqs[word_id])

train_words = [word_id for word_id in int_words
               if random.random() < (1 - p_drop[word_id])]

In [None]:
def get_target(words, idx, window_size=5):
    ''' Return the distinct words found in a randomly sized window around `idx`.

    A radius R is drawn uniformly from 1..window_size; the result contains the
    unique items of the (up to) R words before and R words after position
    `idx`, excluding the word at `idx` itself.
    '''
    R = np.random.randint(1, window_size+1)

    # Clamp the left edge so a window near the start does not wrap around.
    left_edge = max(idx - R, 0)
    before = words[left_edge:idx]
    after = words[idx+1:idx+R+1]

    return list(set(before + after))

In [None]:
def get_batches(words, batch_size, window_size=5):
    ''' Yield (inputs, targets) lists, one pair per full batch of `words`.

    Trailing words that do not fill a whole batch are discarded. Within a
    batch, each word is repeated once for every context word that
    get_target() returns for it, so both lists have equal length.
    '''
    # Keep only as many words as fit into complete batches.
    usable = (len(words)//batch_size) * batch_size
    words = words[:usable]

    for offset in range(0, len(words), batch_size):
        batch = words[offset:offset+batch_size]
        centre_ids, context_ids = [], []
        for pos, centre in enumerate(batch):
            context = get_target(batch, pos, window_size)
            context_ids.extend(context)
            centre_ids.extend([centre]*len(context))
        yield centre_ids, context_ids

In [None]:
# Dedicated graph for the skip-gram model. `inputs` is a 1-D batch of int32
# word ids and `labels` a 2-D int32 tensor of target ids.
train_graph = tf.Graph()
with train_graph.as_default():
    inputs = tf.placeholder(dtype=tf.int32, shape=[None], name='inputs')
    labels = tf.placeholder(dtype=tf.int32, shape=[None, None], name='labels')

In [None]:
# Embedding table: one n_embedding-dimensional row per vocabulary entry,
# initialised uniformly in [-1, 1); `embed` selects the rows for the batch.
n_embedding = 300 # Number of embedding features
n_vocab = len(int_to_vocab)
with train_graph.as_default():
    init_embedding = tf.random_uniform(shape=(n_vocab, n_embedding), minval=-1, maxval=1)
    embedding = tf.Variable(init_embedding)
    embed = tf.nn.embedding_lookup(embedding, inputs)

In [None]:
# Number of negative labels to sample
n_sampled = 1000
with train_graph.as_default():
    # Output-layer weights and biases, one row / entry per vocabulary word.
    softmax_w = tf.Variable(tf.truncated_normal((n_vocab, n_embedding), stddev=0.1))
    softmax_b = tf.Variable(tf.zeros(n_vocab))

    # Sampled softmax: each true label is scored against n_sampled sampled
    # classes instead of the full vocabulary.
    loss = tf.nn.sampled_softmax_loss(weights=softmax_w,
                                      biases=softmax_b,
                                      labels=labels,
                                      inputs=embed,
                                      num_sampled=n_sampled,
                                      num_classes=n_vocab)

    # Mean loss over the batch, minimised with Adam at its default settings.
    cost = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer().minimize(cost)

In [None]:
with train_graph.as_default():
    ## From Thushan Ganegedara's implementation
    valid_size = 64 # Random set of words to evaluate similarity on.
    valid_window = 100
    # Pick valid_size//2 (= 32) ids from [0, 100) and another 32 from
    # [1000, 1100). Presumably a lower id means a more frequent word - this
    # depends on how utils.create_lookup_tables orders ids.
    # NOTE(review): the second range assumes the vocabulary has at least 1100
    # entries; the embedding lookup below is out of range otherwise.
    valid_examples = np.array(random.sample(range(valid_window), valid_size//2))
    valid_examples = np.append(valid_examples, 
                               random.sample(range(1000,1000+valid_window), valid_size//2))

    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    # Cosine similarity: L2-normalise every embedding row, then take dot
    # products between the validation rows and all rows.
    # `keepdims` replaces the deprecated `keep_dims` alias.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
    normalized_embedding = embedding / norm
    valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_dataset)
    similarity = tf.matmul(valid_embedding, tf.transpose(normalized_embedding))

In [None]:
# Create the checkpoint directory if needed. Unlike `!mkdir`, this does not
# complain when the directory already exists and does not depend on the shell.
os.makedirs('checkpoints', exist_ok=True)
window_size = 10

In [None]:
epochs = 10000
batch_size = 1024
window_size = 10

# True: train from scratch and write checkpoints.
# False: restore the most recent checkpoint from 'checkpoints'.
train = False

# Both branches need a Saver bound to the training graph.
with train_graph.as_default():
    saver = tf.train.Saver()

if train:
    with tf.Session(graph=train_graph) as sess:
        iteration = 1
        # Accumulate under a dedicated name: reusing `loss` here overwrote the
        # module-level sampled-softmax tensor of the same name.
        running_loss = 0
        batches_since_report = 0
        sess.run(tf.global_variables_initializer())

        # Timer spans everything since the previous report (including
        # checkpoint writes), matching the batch counter used as divisor.
        start = time.time()
        for epoch in range(1, epochs+1):
            batches = get_batches(train_words, batch_size, window_size)
            for x, y in batches:

                # labels placeholder is 2-D, so targets become a column vector.
                feed = {inputs: x,
                        labels: np.array(y)[:, None]}
                train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)

                running_loss += train_loss
                batches_since_report += 1

                # Report on every batch of each 1000th epoch.
                if epoch % 1000 == 0:
                    end = time.time()
                    # Average over the batches actually accumulated rather than
                    # a hard-coded 100.
                    print("Epoch {}/{}".format(epoch, epochs),
                          "Iteration: {}".format(iteration),
                          "Avg. Training loss: {:.4f}".format(running_loss/batches_since_report),
                          "{:.4f} sec/batch".format((end-start)/batches_since_report))
                    running_loss = 0
                    batches_since_report = 0
                    start = time.time()

                    # Nearest-neighbour dump for the validation words
                    # (expensive: evaluates similarity against the whole vocabulary).
                    sim = similarity.eval()
                    for i in range(valid_size):
                        valid_word = int_to_vocab[valid_examples[i]]
                        top_k = 8 # number of nearest neighbors
                        # Index 0 is skipped: it is the word itself.
                        nearest = (-sim[i, :]).argsort()[1:top_k+1]
                        log = 'Nearest to %s:' % valid_word
                        for k in range(top_k):
                            close_word = int_to_vocab[nearest[k]]
                            log = '%s %s,' % (log, close_word)
                        print(log)

                iteration += 1
            # Checkpoint after every epoch so an interrupted run can be resumed.
            save_path = saver.save(sess, "checkpoints/ccn.ckpt")

        # Fetch the embedding once, after training, instead of once per epoch.
        embed_mat = sess.run(normalized_embedding)
else:
    with tf.Session(graph=train_graph) as sess:
        checkpoint_path = tf.train.latest_checkpoint('checkpoints')
        if checkpoint_path is None:
            raise ValueError("No checkpoint found in 'checkpoints'; "
                             "set train = True to train the model first.")
        saver.restore(sess, checkpoint_path)
        # NOTE(review): this branch returns the raw embedding while the training
        # branch returns the L2-normalised one. Averages of several word
        # vectors differ between the two - confirm which is intended.
        embed_mat = sess.run(embedding)

In [None]:
def transform_to_vec(sentence, len_cutoff=1):
    """Embed a phrase as the mean custom-trained embedding of its known words.

    Parameters
    ----------
    sentence : str
        Whitespace-separated phrase.
    len_cutoff : int, default 1
        Words whose length is not greater than this are ignored.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the `embed_mat` rows selected through
        `vocab_to_int` for the words present in it. If no word is found, the
        mean of an empty array (NaN) is returned.
    """
    # The previous version wrapped this in `for doc in sentence`, which walks
    # the string character by character and recomputed the same mean once per
    # character (and left the result undefined for an empty string).
    tokens = [word.lower() for word in sentence.split() if len(word) > len_cutoff]
    vecs = [embed_mat[vocab_to_int[token]] for token in tokens if token in vocab_to_int]
    return np.mean(np.array(vecs), 0)

In [None]:
# Test words: print the cosine similarity of each (w1_list[i], w2_list[i]) pair.
for word1, word2 in zip(w1_list, w2_list):
    first, second = word1.lower(), word2.lower()
    w1_vec = transform_to_vec(first)
    w2_vec = transform_to_vec(second)
    sim = 1 - cosine(w1_vec, w2_vec)
    print(' vs '.join((first, second)) + ': ' + str(sim))

In [None]:
# Test Quiz: score each candidate answer against its question, then print the
# index of the highest-scoring candidate.
for q, ans in zip(ques, answers):
    ques_vec = transform_to_vec(q)
    print('Question: ', q)
    print('Answers: ')
    sims = []
    for a in ans:
        ans_vec = transform_to_vec(a)
        sim = 1 - cosine(ques_vec, ans_vec)
        print(a, sim)
        sims.append(sim)
    best = np.argmax(sims)
    print(best)
    print('\n')

## 5. BERT

In [None]:
!pip install transformers
import torch
from transformers import BertTokenizer, BertModel

import logging

import matplotlib.pyplot as plt
% matplotlib inline

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
_BERT_MODEL = None  # loaded on first use by _get_bert_model()


def _get_bert_model():
    """Return a shared, eval-mode 'bert-base-uncased' model, loading it once."""
    global _BERT_MODEL
    if _BERT_MODEL is None:
        _BERT_MODEL = BertModel.from_pretrained('bert-base-uncased',
                                                output_hidden_states = True, # Whether the model returns all hidden-states.
                                                )
        _BERT_MODEL.eval()
    return _BERT_MODEL


def sentence_embedding(text):
    """Embed `text` as the mean second-to-last-layer BERT token vector.

    The text is wrapped in "[CLS] ... [SEP]", tokenised with the module-level
    `tokenizer`, and run through 'bert-base-uncased'. The returned vector is
    the average over all tokens (including [CLS] and [SEP]) of the
    second-to-last hidden layer.

    Parameters
    ----------
    text : str
        Phrase or sentence to embed.

    Returns
    -------
    torch.Tensor
        1-D tensor with one entry per hidden unit.
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    tokens_tensor = torch.tensor([indexed_tokens])
    # The previous code passed a tensor of ones as the second positional
    # argument. In transformers' BertModel.forward that slot is
    # `attention_mask` (not token_type_ids), so it is named explicitly here;
    # the numerical result is unchanged.
    attention_mask = torch.ones_like(tokens_tensor)

    # Reuse one model instance: reloading the weights on every call dominated
    # the runtime of the test loops.
    model = _get_bert_model()
    with torch.no_grad():
        outputs = model(tokens_tensor, attention_mask=attention_mask)
        # With output_hidden_states=True, element 2 holds the per-layer states.
        hidden_states = outputs[2]

    # Second-to-last layer, first (only) batch item -> (tokens, hidden).
    token_vecs = hidden_states[-2][0]
    return torch.mean(token_vecs, dim=0)

In [None]:
# Test words: print the cosine similarity of each (w1_list[i], w2_list[i]) pair.
for word1, word2 in zip(w1_list, w2_list):
    first, second = word1.lower(), word2.lower()
    w1_vec = sentence_embedding(first)
    w2_vec = sentence_embedding(second)
    sim = 1 - cosine(w1_vec, w2_vec)
    print(' vs '.join((first, second)) + ': ' + str(sim))

In [None]:
# Test Quiz: score each candidate answer against its question, then print the
# index of the highest-scoring candidate.
for q, ans in zip(ques, answers):
    ques_vec = sentence_embedding(q)
    print('Question: ', q)
    print('Answers: ')
    sims = []
    for a in ans:
        ans_vec = sentence_embedding(a)
        sim = 1 - cosine(ques_vec, ans_vec)
        print(a, sim)
        sims.append(sim)
    best = np.argmax(sims)
    print(best)
    print('\n')

## Visualize

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [None]:
# Project the first `viz_words` embedding rows to 2-D with t-SNE.
# Clamp to the number of available rows so later loops over `viz_words` stay
# in range for small vocabularies, and fix the seed so the layout is
# reproducible across runs.
viz_words = min(500, len(embed_mat))
tsne = TSNE(random_state=0)
embed_tsne = tsne.fit_transform(embed_mat[:viz_words, :])

In [None]:
# Scatter the 2-D t-SNE coordinates and label each point with its word.
fig, ax = plt.subplots(figsize=(14, 14))
# One vectorised scatter call instead of one call (and one artist) per point.
ax.scatter(embed_tsne[:, 0], embed_tsne[:, 1], color='steelblue')
# Iterate over the rows actually present so a vocabulary smaller than
# `viz_words` cannot index past the end of `embed_tsne`.
for idx, (x_pos, y_pos) in enumerate(embed_tsne):
    ax.annotate(int_to_vocab[idx], (x_pos, y_pos), alpha=0.7)