### Data Download
The [Hindi Text Short Summarization Corpus](https://www.kaggle.com/disisbig/hindi-text-short-summarization-corpus#) is used for this experiment.
This cell will download and extract the data.

In [None]:
!mkdir -p dataset
!wget "https://storage.googleapis.com/kaggle-data-sets/465845/875288/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20200920%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20200920T140724Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=65a3f8130fe23114ad7957d0be8b3df7237254eb40655fb6d203eeff28fd7e54a41e84a54c17b19d71404c5192af3c2f9ab8eea429e203ed8d04f4d2f3a9ceadc80082705c10fef97034a4e5cb1d1e3325f58f7de2c40e2d7a60443e467098c306e0cbafd91dc60f1e7636c7cbcd901f46f6f860893bb7b7cb6728acc62ed323ab5c1346f23216c932a0c04d34cde001eb7b8030e21f76bd7c246c94114f3f6897ef28f55a9d998e17f56e4148a833c8f3fe1128709b07f9cc9902c9e686301ad6db222a86e30b0a5a9522fc005842361f4288736571a095f038b2d57ed2ecfcb9077d368f52289bdb19eff1b7261b5d76b487108ab2deee5df843d659b88cf9" -O dataset/dataset-archive.zip
!unzip dataset-archive.zip -d dataset

### Data Preprocessing
We have a huge collection of news articles and their corresponding headlines. Since this is a word-meaning modelling task, we don't need the correspondences.

In [None]:
# Imports!
import re
import string
import pandas as pd
import pickle
from tqdm.auto import tqdm
import random
from math import sqrt
import numpy as np
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the training CSV, keep handles on the two text columns and preview the frame.
df1 = pd.read_csv('dataset/train.csv', sep=',')
df1.dataframeName = 'train.csv'
train_articles, train_headlines = df1['article'], df1['headline']
nRow = df1.shape[0]
nCol = df1.shape[1]
print(f'There are {nRow} rows and {nCol} columns')
print(df1.head(5))


In [None]:
# Prepare the string, remove URLs, emoji, numeral-numbers, multiple spaces and non-string entities.
# The patterns are compiled once here instead of on every call to prep().
_URL_RE = re.compile(
    r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
)
_EMOJI_RE = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002702-\U000027B0"
                       u"\U000024C2-\U0001F251"  # wide range, kept exactly as before
                       "]+", flags=re.UNICODE)
_LATIN_RE = re.compile(r'[a-zA-Z]+')
_DIGITS_RE = re.compile(r"\d+")
_WHITESPACE_RE = re.compile(r"\s+")
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def prep(str):
    """Clean one piece of raw text so it can be stored as a single line of tokens.

    Steps, in order: drop ASCII letters, drop characters matched by the emoji
    pattern, drop URLs, drop ASCII punctuation, drop digits, then collapse every
    run of whitespace (spaces, tabs, ``\\n``, ``\\r``) into a single space and trim.

    Line breaks are turned into a space rather than deleted, so the words on
    either side of a line break stay separate tokens, and no stray ``\\r`` can
    leak into the one-line-per-document files written afterwards.

    Args:
        str: the raw text. The parameter name shadows the builtin; it is kept
            unchanged so existing callers keep working.

    Returns:
        The cleaned text, or ``None`` when the input is not a string
        (for example a NaN read from a CSV).
    """
    text = str  # work on a local alias so the builtin-shadowing name is not used further
    try:
        text = _LATIN_RE.sub('', text)
        text = _EMOJI_RE.sub('', text)
        text = _URL_RE.sub("", text)
        text = text.translate(_PUNCT_TABLE)
        text = _DIGITS_RE.sub("", text)
        text = _WHITESPACE_RE.sub(' ', text)
        return text.strip()
    except TypeError:
        # Non-string input (None, float NaN, bytes, ...) is reported as None.
        return None

In [None]:
# Create a directory to store intermediate data.
# NOTE: the directory name is spelled "intermedieates"; other cells read and write files under that exact path.
!mkdir -p intermedieates

In [None]:
# Clean every (article, headline) pair, keep the ones that are short enough,
# and write them to two parallel one-line-per-document text files.
final_articles, final_headlines = [], []
for art, head in zip(train_articles, train_headlines):
    if art is None or head is None:
        continue
    prep_head = prep(head)
    prep_art = prep(art)
    if prep_head is None or prep_art is None:
        continue
    # Length limits are counted in whitespace-separated tokens.
    if len(prep_art.split()) < 800 and len(prep_head.split()) < 15:
        final_articles.append(prep_art)
        final_headlines.append(prep_head)
print(f'selected {len(final_articles)} articles out of {nRow}, {100*len(final_articles)/nRow}% selected')
with open("intermedieates/headlines.txt", "w", encoding="utf-8") as headlines_file, \
        open("intermedieates/articles.txt", "w", encoding="utf-8") as articles_file:
    for head, article in zip(final_headlines, final_articles):
        headlines_file.write(head + "\n")
        articles_file.write(article + "\n")
print("Written!")

### Selecting Words
It is not possible to account for every single word in the vocabulary, so we keep 16384 entries (five special tokens plus the most frequent words) and make vectors out of those.

In [None]:
# Count how often each word occurs across all cleaned articles and headlines.
# The files were written as UTF-8, so they must be read back as UTF-8 explicitly;
# the platform default encoding is not guaranteed to match.
with open("intermedieates/articles.txt", "r", encoding="utf-8") as articles_file:
    articles = articles_file.readlines()
with open("intermedieates/headlines.txt", "r", encoding="utf-8") as headlines_file:
    headlines = headlines_file.readlines()

words_and_freq = {}
for corpus in (articles, headlines):
    for line in corpus:
        for word in line.split():
            words_and_freq[word] = words_and_freq.get(word, 0) + 1

# Persist the counts so the next cell can be re-run without recounting.
with open('intermedieates/words_and_freq.pkl', 'wb') as freq_file:
    pickle.dump(words_and_freq, freq_file)

In [None]:
# Build the vocabulary (special tokens + most frequent words) and keep only the
# article/headline pairs in which fewer than 10% of the distinct words are out of vocabulary.
# The text files were written as UTF-8, so read them back as UTF-8 explicitly.
with open("intermedieates/articles.txt", "r", encoding="utf-8") as articles_file:
    articles = articles_file.readlines()
with open("intermedieates/headlines.txt", "r", encoding="utf-8") as headlines_file:
    headlines = headlines_file.readlines()
# The pickle is produced by this notebook; do not point this at untrusted files.
with open("intermedieates/words_and_freq.pkl", 'rb') as freq_file:
    words_and_freq = pickle.load(freq_file)

# Five special tokens come first, then words by descending frequency,
# up to a total of 16384 vocabulary entries.
words_new = ["<m>", "<s>", "<e>", "<d>", "<u>"]
words_by_freq = sorted(words_and_freq, key=words_and_freq.get, reverse=True)
words_new.extend(words_by_freq[:16384 - len(words_new)])

# A set makes the per-line out-of-vocabulary check independent of vocabulary size.
vocab_lookup = set(words_new)

final_articles = []
final_headlines = []
for article, headline in tqdm(zip(articles, headlines), total=min(len(articles), len(headlines))):
    article_set = set(article.split())
    headline_set = set(headline.split())
    if len(article_set) > 0 and len(headline_set) > 0:
        # Ratios are computed over distinct words, not over token counts.
        unk_ratio_article = len(article_set - vocab_lookup) / len(article_set)
        unk_ratio_headline = len(headline_set - vocab_lookup) / len(headline_set)
        if unk_ratio_article < 0.1 and unk_ratio_headline < 0.1:
            final_articles.append(article)
            final_headlines.append(headline)

vocab_head_art = {
    "vocabulary": words_new,
    "articles": final_articles,
    "headlines": final_headlines
}
with open("intermedieates/vocab_head_art.pkl", "wb") as vocab_file:
    pickle.dump(vocab_head_art, vocab_file)



In [None]:
# Turn every kept article and headline into a list of vocabulary ids wrapped in
# start/end markers, counting how often each id is emitted along the way.
# The pickle is produced by this notebook; do not point this at untrusted files.
with open("intermedieates/vocab_head_art.pkl", "rb") as vocab_file:
    vocab_head_art = pickle.load(vocab_file)
vocab = vocab_head_art["vocabulary"]
articles = vocab_head_art["articles"]
headlines = vocab_head_art["headlines"]

# Two-way mapping between words and their position in the vocabulary list.
word2id = {word: index for index, word in enumerate(vocab)}
id2word = {index: word for index, word in enumerate(vocab)}

unk_id = word2id["<u>"]
start_id = word2id["<s>"]
end_id = word2id["<e>"]
id2freq = [0] * len(vocab)


def get_id(word):
    """Return the vocabulary id of `word` (the "<u>" id if unknown) and count it in id2freq."""
    word_id = word2id.get(word, unk_id)
    id2freq[word_id] += 1
    return word_id


def _tokenize_line(line):
    """Convert one text line to [start_id, ...word ids..., end_id], updating id2freq."""
    id2freq[start_id] += 1
    id2freq[end_id] += 1
    return [start_id] + [get_id(token) for token in line.split()] + [end_id]


articles_tokenized = [_tokenize_line(article) for article in articles]
headlines_tokenized = [_tokenize_line(headline) for headline in headlines]

final_data = {
    "word2id": word2id,
    "id2word": id2word,
    "id2freq": id2freq,
    "headlines_tokenized": headlines_tokenized,
    "articles_tokenized": articles_tokenized
}
with open("intermedieates/final_data.pkl", "wb") as final_data_file:
    pickle.dump(final_data, final_data_file)


In [None]:
# Create the folders used further down: saved-tf-models for the Keras checkpoint, outputs for the exported TSV files.
!mkdir -p saved-tf-models outputs

In [None]:
# This class supplies pairs of words to our algorithm on demand
class WordPairSupplier:
    """Samples (center id, context id) training pairs and negative samples.

    The tokenized corpus and frequency table are read from
    "intermedieates/final_data.pkl". Pairs are drawn from the articles only;
    the headlines are loaded but not used for sampling by this class.
    """

    # Width of the one-hot vectors produced by get_batch.
    VOCAB_SIZE = 16384

    def __init__(self, CONTEXT_SIZE=2):
        """Load the tokenized data and precompute the sampling tables.

        Args:
            CONTEXT_SIZE: maximum distance (in tokens) between a center word
                and the context word paired with it.
        """
        # The pickle is produced by this notebook; do not point this at untrusted files.
        with open("intermedieates/final_data.pkl", "rb") as data_file:
            data = pickle.load(data_file)
        self.articles = data["articles_tokenized"]
        self.headlines = data["headlines_tokenized"]
        # Allowed offsets around the center word: -CONTEXT_SIZE..-1 and 1..CONTEXT_SIZE.
        self.vals = list(range(-CONTEXT_SIZE, 0)) + list(range(1, CONTEXT_SIZE + 1))
        self.id2freq = np.asarray(data['id2freq'])
        # Negative samples are drawn proportionally to frequency ** (3/4).
        self.neg_probs = np.power(self.id2freq, 3 / 4)
        self.neg_probs /= np.sum(self.neg_probs)
        self.ids = np.arange(self.neg_probs.shape[0])
        self.word2id = data['word2id']
        self.id2word = data['id2word']
        # Not read anywhere in this class; kept so the attribute still exists.
        self.freqs_three_fourths = [0] * self.VOCAB_SIZE

    def get_skip_prob(self, id):
        """Return the probability of discarding token `id` (subsampling of frequent words).

        The value is 1 - sqrt(1000 / count) and is negative for tokens seen
        fewer than 1000 times, which means those are never discarded.
        """
        return 1 - sqrt((1e3 / self.id2freq[id]))

    def get_pair(self):
        """Draw one (center id, context id) pair from a random article.

        A draw is rejected and retried when either word is discarded by
        subsampling. Retrying happens in a loop rather than by recursion, so a
        long run of rejections cannot exhaust the recursion limit.

        Returns:
            A tuple (base_id, target_id).
        """
        while True:
            sentence = random.choice(self.articles)
            last_index = len(sentence) - 1
            base_index = random.randint(0, last_index)
            base_id = sentence[base_index]
            if random.random() < self.get_skip_prob(base_id):
                continue
            offset = random.choice(self.vals)
            # Offsets that fall outside the sentence are clamped to its ends, so
            # the first/last token can end up paired with itself.
            target_index = max(0, min(base_index + offset, last_index))
            target_id = sentence[target_index]
            if random.random() < self.get_skip_prob(target_id):
                continue
            return base_id, target_id

    def get_batch(self, batch_size):
        """Return `batch_size` pairs as two one-hot arrays of width VOCAB_SIZE."""
        xs = []
        ys = []
        for _ in range(batch_size):
            x, y = self.get_pair()
            xs.append(x)
            ys.append(y)
        xs = to_categorical(np.asarray(xs), self.VOCAB_SIZE)
        ys = to_categorical(np.asarray(ys), self.VOCAB_SIZE)
        return xs, ys

    def get_batch_raw(self, batch_size, k):
        """Return `batch_size` pairs as id arrays plus `k` negative ids per pair.

        Returns:
            (xs, ys, negs) where xs and ys have shape (batch_size,) and negs
            has shape (batch_size, k).
        """
        xs = []
        ys = []
        negs = []
        for _ in range(batch_size):
            x, y = self.get_pair()
            xs.append(x)
            ys.append(y)
            negs.append(self.get_negative_samples(k))
        return np.asarray(xs), np.asarray(ys), np.asarray(negs)

    def get_negative_samples(self, k):
        """Draw `k` ids (with replacement) according to neg_probs."""
        return np.random.choice(self.ids, k, p=self.neg_probs)

    def get_word(self, id):
        """Return the word stored for vocabulary id `id`."""
        return self.id2word[id]

### The actual algorithm
An implementation of the method from [Distributed Representations of Words and Phrases and their Compositionality](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf).

### Tensorflow Implementation
The naive skip-gram model, without any optimizations, is implemented in the cell below; it is easily expressed using the building blocks that TensorFlow provides.
The NumPy implementation that includes negative sampling comes later in the notebook.
You can stop the cell below whenever you want; checkpoints are saved regularly.

In [None]:
# Naive skip-gram as two bias-free dense layers: a 256-unit linear projection
# followed by a softmax over the 16384 vocabulary entries.
model = Sequential()
model.add(Dense(256, activation=None, use_bias=False))
model.add(Dense(16384, activation='softmax', use_bias=False))
model.compile(optimizer=Adam(learning_rate=0.01), loss='categorical_crossentropy', metrics=['accuracy'])
model.build(input_shape=(None, 16384))
model.summary()
supplier = WordPairSupplier()


def data_generator(num_ex):
    """Return a zero-argument callable that yields `num_ex` one-hot batches of 512 pairs each."""
    return lambda: (supplier.get_batch(512) for _ in range(num_ex))


# Keep only the checkpoint with the lowest training loss seen so far.
saver = ModelCheckpoint("saved-tf-models/saved_embeddings_model.h5", monitor="loss", save_best_only=True)
dataset = tf.data.Dataset.from_generator(
    data_generator(2048),
    output_types=(tf.float32, tf.float32),
    output_shapes=(tf.TensorShape([None, 16384]), tf.TensorShape([None, 16384])),
)
model.fit(dataset, epochs=16384, callbacks=[saver])

The next cell will extract the weights from the saved checkpoint of the model that was trained in the previous cell.
After running the cell you should see labels.tsv and vectors.tsv in the outputs folder. Load them into the [Embedding Projector](https://projector.tensorflow.org/) to visualize the embeddings.

In [None]:
# Load the best checkpoint and take the kernel of the first dense layer as the
# embedding matrix (one row per vocabulary id).
model = load_model("saved-tf-models/saved_embeddings_model.h5")
weights = model.weights
embeddings = weights[0].numpy()
words = ['नहीं', 'किया', 'समय', 'पहचान']
supplier = WordPairSupplier()

# Print the five entries with the highest cosine similarity to each probe word
# (the probe word itself is expected to come first).
for word in words:
    word_id = supplier.word2id[word]
    word_vector = embeddings[word_id]
    cosine_dists = cosine_similarity(np.asarray([word_vector]), embeddings)[0]
    indxs = np.argsort(cosine_dists)[::-1]
    for j in range(5):
        print(supplier.id2word[indxs[j]])
    print()

# Export for the Embedding Projector. The labels are Hindi words, so the files
# are written as UTF-8 explicitly rather than with the platform default encoding.
# Values are joined with tabs so that rows do not end with a stray trailing tab.
with open("outputs/vectors.tsv", 'w', encoding="utf-8") as vectors_out, \
        open("outputs/labels.tsv", 'w', encoding="utf-8") as labels_out:
    for row_id, vector in enumerate(embeddings):
        vectors_out.write("\t".join(str(val) for val in vector) + "\n")
        labels_out.write(supplier.id2word[row_id] + "\n")

### NumPy Implementation
The NumPy implementation, which uses negative sampling to speed up training, is in the next few cells.
My notes, which might offer an explanation, are in the repo too.

In [None]:
from tqdm.auto import trange

VOCAB_SIZE = 16384
DIM = 256
# Small random initialisation for the center-word and context-word embedding tables.
word_vectors = np.random.randn(VOCAB_SIZE * DIM).reshape((VOCAB_SIZE, DIM)) / 10000
context_vectors = np.random.randn(VOCAB_SIZE * DIM).reshape((VOCAB_SIZE, DIM)) / 10000
LR = 0.01


def one_pass_increment(center, target):
    """Gradient-ascent step on log sigmoid(w . c) for observed (center, context) pairs.

    Args:
        center: array of center-word ids, shape (B,).
        target: array of context-word ids, shape (B,).

    For each pair with score x = w . c (a dot product over the embedding
    dimension), both vectors move by LR * (1 - sigmoid(x)) times the other
    vector. Gradients are computed from the values held before this call, and
    np.add.at is used so that ids occurring several times in the batch
    accumulate all of their updates (a plain fancy-indexed += keeps only one).
    """
    center = np.asarray(center)
    target = np.asarray(target)
    w = word_vectors[center]        # copies, so both updates use pre-step values
    c = context_vectors[target]
    scores = np.sum(w * c, axis=-1, keepdims=True)
    coeff = 1.0 / (1.0 + np.exp(scores))   # 1 - sigmoid(x) == sigmoid(-x)
    np.add.at(word_vectors, center, LR * coeff * c)
    np.add.at(context_vectors, target, LR * coeff * w)


def one_pass_neg(centers, negs):
    """Gradient-ascent step on sum_k log sigmoid(-w . c_k) for negative samples.

    Args:
        centers: array of center-word ids, shape (B,).
        negs: array of negative-sample ids, shape (B, K).

    For each (center, negative) pair with score x = w . c_k, both vectors move
    by -LR * sigmoid(x) times the other vector; the center vector receives the
    sum over its K negatives. np.subtract.at makes repeated ids accumulate.
    """
    centers = np.asarray(centers)
    negs = np.asarray(negs)
    w = word_vectors[centers]                      # (B, DIM)
    c = context_vectors[negs]                      # (B, K, DIM)
    scores = np.einsum('bd,bkd->bk', w, c)         # dot product per negative
    coeff = 1.0 / (1.0 + np.exp(-scores))          # sigmoid(x)
    np.subtract.at(word_vectors, centers, LR * np.einsum('bk,bkd->bd', coeff, c))
    np.subtract.at(context_vectors, negs, LR * coeff[..., None] * w[:, None, :])


# Train for 1000 steps; each step uses 4096 sampled pairs with 8 negatives per pair.
supplier = WordPairSupplier()
for i in trange(1000):
    centers, targets, negatives = supplier.get_batch_raw(4096, 8)
    one_pass_increment(centers, targets)
    one_pass_neg(centers, negatives)
    # Periodic checkpoint so the run can be interrupted without losing everything.
    if i % 50 == 0:
        print("Saving for ", i)
        np.save("word_vectors.npy", word_vectors)
        np.save("context_vectors.npy", context_vectors)

# The last periodic save happens at step 950, so write the final state once the
# loop has finished; otherwise the last 49 steps would never reach disk.
np.save("word_vectors.npy", word_vectors)
np.save("context_vectors.npy", context_vectors)