In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from __future__ import division
import argparse
import pandas as pd
from collections import defaultdict
# useful stuff
import numpy as np
from scipy.special import expit
from sklearn.preprocessing import normalize
from nltk.tokenize import word_tokenize
import copy
import pickle
from tqdm import tqdm

In [3]:
import nltk
import ssl

In [4]:
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
import string


def remove_punctuation(corpus):
    punctuationfree = []
    punctuationfree.extend(i for i in corpus if i not in string.punctuation)
    return punctuationfree

In [6]:
stopwords = nltk.corpus.stopwords.words('english')


def remove_stopwords(text):
    output = [i for i in text if i not in stopwords]

    return output

In [7]:
from nltk.stem import WordNetLemmatizer

# defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()


# defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

In [8]:
def text2sentences(path):
    # feel free to make a better tokenization/pre-processing
    sentences = []
    with open(path, encoding='utf-8') as f: # 加了encoding='utf-8'，否则读txt时有bug
        for l in f:
            sentences.append(l.lower().split())

    # remove punctuations
    sentences = [remove_punctuation(sent) for sent in sentences]

    # Remove numbers
    sentences = [list(filter(lambda x: x.isalpha(), sent)) for sent in sentences]

    # Lemmatization
    sentences = [lemmatizer(sent) for sent in sentences]

    return sentences

In [9]:
sentences = text2sentences("/content/drive/MyDrive/NLP-skipGram/train.txt")

In [10]:
def loadPairs(path):
    data = pd.read_csv(path, delimiter='\t')
    pairs = zip(data['word1'], data['word2'], data['similarity'])
    return pairs

In [11]:
import math
def sigmoid(x):
    return 1 / (1 + math.exp(-x))

In [12]:
def cosine_distance(vec1, vec2):
    assert vec1.shape == vec2.shape
    return np.dot(vec1, vec2) / (np.sqrt(np.dot(vec1, vec1)) * np.sqrt(np.dot(vec2, vec2)))

In [13]:
class mySkipGram:
    def __init__(self, sentences, nEmbed=100, negativeRate=5, winSize=5, minCount=5, learning=0.01):

        # Storing hyper parameters as class variables
        self.nEmbed = nEmbed
        self.negativeRate = negativeRate
        self.winSize = winSize
        self.minCount = minCount
        self.lr = learning

        self.vocab_dict = self.vocab_dict_generator(sentences)
        # word to ID mapping
        self.w2id = {key: values['word_index'] for key, values in self.vocab_dict.items()}
        self.trainset = sentences
        self.vocab = list(self.vocab_dict.keys())  # list of valid words
        self.word_freq = {values['word_index']: values['word_freq'] for key, values in self.vocab_dict.items()}

        # initialize input_embedding matrix and output_weight matrix

        # with a different initialization
        self.input_embedding = np.random.uniform(low=-0.5 / (self.nEmbed ** (3 / 4)),
                                                 high=0.5 / (self.nEmbed ** (3 / 4)),
                                                 size=(len(self.vocab_dict), self.nEmbed))

        # plain choice, all zeros
        # self.input_embedding = np.zeros([len(self.vocab_dict), self.nEmbed])
        self.output_weights = np.zeros([len(self.vocab_dict), self.nEmbed])
        self.G0 = np.zeros_like(self.input_embedding)
        self.G1 = np.zeros_like(self.output_weights)

        # create variables to keep track of model performance
        self.trainWords = 0
        self.accLoss = 0
        self.loss = []

        # create an 'unknown_vector' to represent word in test data that doesn't exist in our train set
        self.unknown_vector = np.random.normal(0, 1, (self.nEmbed,))

    def vocab_dict_generator(self, sentences):
        """generate a dictionary to store information for each single word

        Parameters
        ----------
        sentences: a list of sentences

        Returns
        -------
        vocab: a dictionary
        where key is unique word from corpus and value is a dictionary {'word_count': how many times a word has appeared in the entire train set,
        'word_freq': the frequency of a word, 'word_index': assigning an id to each word}
        """

        vocab = defaultdict(dict)
        vocab_words = ['int']
        vocab['int']['word_count'] = 0
        vocab_size = 0
        for sent_tokens in sentences:
            vocab_size += len(sent_tokens)
            for word in sent_tokens:
                if word not in vocab:
                    vocab[word]['word_count'] = 1
                    vocab_words.append(word)
                else:
                    vocab[word]['word_count'] += 1

        # remove words appearing fewer than min_count times
        low_freq_words = []
        for word in vocab:
            if vocab[word]['word_count'] < self.minCount:
                low_freq_words.append(word)
        for word in low_freq_words:
            vocab_size -= vocab[word]['word_count']
            del vocab[word]
            vocab_words.remove(word)
        sorted_vocab = []
        for word in vocab:
            sorted_vocab.append((word, vocab[word]['word_count']))
        sorted_vocab.sort(key=lambda tup: tup[1], reverse=True)
        for idx, word in enumerate(sorted_vocab):
            vocab[word[0]]['word_freq'] = vocab[word[0]]['word_count'] / vocab_size
            vocab[word[0]]['word_index'] = idx
        return vocab

    def negative_sampling(self, word_tuple, alpha=0.75):
        """samples negative words, ommitting word_tuple, Words that actually appear within the context window of the center word
            and generate ids of words that are randomly drawn from a noise distribution

        Parameters
        ----------
        word_tuple : tuple of {wIdx, ctxtId}
        wIdx is the index of center word, ctxtId is the index of context word

        alpha: a hyper-parameter that can be empircially tuned
        in the noise distribution — normalized frequency distribution of words raised to the power of α.

        Returns
        -------
        negativeIds: a dictionary with key as word, probability of being chosen as value
        representing words that doesn't appear within the context window of the centre word but exist in the corpus

        """
        word_freq_copy = copy.deepcopy(self.word_freq)
        # remove positive sample
        for id in word_tuple:
            word_freq_copy.pop(id)

        # generate noise distribution
        noise_dist = {key: val ** alpha for key, val in word_freq_copy.items()}
        Z = sum(noise_dist.values())
        noise_dist_normalized = {key: val / Z for key, val in noise_dist.items()}

        negativeIds = np.random.choice(list(noise_dist_normalized.keys()), size=self.negativeRate,
                                       p=list(noise_dist_normalized.values()))

        return negativeIds

    def loss_function(self, wordId, contextId, negativeIds):
        """ Returns the loss for the given word, its context and the negative samples"""

        l_sum = np.log(sigmoid(np.dot(self.input_embedding[wordId, :], self.output_weights[contextId, :])))

        for negativeId in negativeIds:
            l_sum *= np.log(sigmoid(-np.dot(self.input_embedding[negativeId, :], self.output_weights[contextId, :])))

        return l_sum

    def train(self):
        for counter, sentence in enumerate(tqdm(self.trainset)):
            sentence = list(filter(lambda word: word in self.vocab, sentence))

            for wpos, word in enumerate(sentence):
                wIdx = self.w2id[word]
                # dynamic window size, the winSize denotes the maximal window size. For each word in the corpus, a window size k' is randomly sampled uniformly from 1,,,,winSize
                winsize = np.random.randint(self.winSize) + 1
                start = max(0, wpos - winsize)
                end = min(wpos + winsize + 1, len(sentence))

                for context_word in sentence[start:end]:
                    ctxtId = self.w2id[context_word]
                    if ctxtId == wIdx: continue
                    negativeIds = self.negative_sampling({wIdx, ctxtId})
                    self.trainWord(wIdx, ctxtId, negativeIds)

                    # keep record of loss during training
                    self.accLoss += self.loss_function(wIdx, ctxtId, negativeIds)
                    self.trainWords += 1

            if (counter+1) % 100 == 0:
                # print(' > training %d of %d' % (counter, len(self.trainset)))
                self.loss.append(self.accLoss / self.trainWords)
                self.trainWords = 0
                self.accLoss = 0
                # print(self.loss[-1])

    # Back propagation

    def trainWord(self, wordId, contextId, negativeIds):

        # positive_loss = np.log(1 / (1 + np.exp(-np.dot(self.W[wIdx], self.C[ctxId]))))
        # negative_loss = 0
        # for i in negativeIds:
        #     negative_loss += expit(-np.dot(self.W[wIdx], self.C[i]))  # remove - sign
        # self.accLoss -= (negative_loss + positive_loss)

        neg_sample = [(wordId, 1)]
        wv_h = self.input_embedding[contextId]
        # For each positive word-context pair (w,cpos),
        # K new negative samples are randomly drawn from a noise distribution.

        for neg_word in negativeIds:
            neg_sample.append((neg_word, 0))

        # Adagrad
        dh = np.zeros(self.nEmbed)

        for neg_w in neg_sample:
            target, label = neg_w[0], neg_w[1]

            wv_j = self.output_weights[target]
            dwjh = sigmoid(np.dot(wv_h, wv_j)) - label
            dwj = dwjh * wv_h
            self.G1[target] += np.power(dwj, 2)
            dwj /= (np.sqrt(self.G1[target]) + 1e-6)  # to avoid 0 in denominator
            assert dwj.shape == wv_j.shape
            dh += dwjh * wv_j
            # Update the output weight matrix
            self.output_weights[target] -= self.lr * dwj

        # Update the input embedding matrix
        self.G0[contextId] += np.power(dh, 2)
        dh /= np.sqrt(self.G0[contextId]) + 1e-6
        assert dh.shape == wv_h.shape
        self.input_embedding[contextId] -= self.lr * dh

    def save(self, path):
        pickle.dump(self, open(path, 'wb'))
        # with open(path, 'wb') as f:
        #
        #     np.save(f, self.input_embedding, allow_pickle=False, fix_imports=True)

    def similarity(self, word1, word2):
        """
        computes similiarity between the two words measured by consine distance. unknown words are mapped to one common vector
        :param word1:
        :param word2:
        :return: a float \in [0,1] indicating the similarity (the higher the more similar)
        """

        if word1 == word2:
            print('same word')
            return 1
        else:
            vec1 = self.input_embedding[self.w2id[word1]] if word1 in self.vocab else self.unknown_vector
            vec2 = self.input_embedding[self.w2id[word2]] if word2 in self.vocab else self.unknown_vector
            return np.dot(vec1, vec2) / (np.sqrt(np.dot(vec1, vec1)) * np.sqrt(np.dot(vec2, vec2)))

    @staticmethod
    def load(path):
        return pickle.load(open(path, 'rb'))

In [14]:
# sg = mySkipGram(sentences)

In [16]:
# sg.train()

In [17]:
# sg.save('/content/drive/MyDrive/NLP-skipGram/saved_model/model.pth')

In [18]:
sg.loss

[0.06538296957193264]

In [19]:
# nEmbed=100, negativeRate=5, winSize=5, minCount=5, learning=0.01

In [20]:
nEmbed_range = np.arange(50,200,50)
negativeRate_range = np.arange(5,20,5)
winSize_range = np.arange(3,12,3)
minCount_range = np.arange(0,9,3)
learning_ragne = (0.0001,0.001,0.01,0.1)

In [21]:
len(nEmbed_range)*len(negativeRate_range)*len(winSize_range)*len(minCount_range)*len(learning_ragne)

324

In [22]:
group_dict = {}
loss_values = {}

In [None]:
# random grid search
best_estimator = [0,0,0,0,0]
min_loss = -0.14673792546101652 #float('inf')

for i in range(100):
  nEmbed = np.random.choice(nEmbed_range)
  negativeRate = np.random.choice(negativeRate_range)
  winSize = np.random.choice(winSize_range)
  minCount = np.random.choice(minCount_range)
  learning = np.random.choice(learning_ragne)
  if [nEmbed,negativeRate,winSize,minCount,learning] not in group_dict.values():
    group_dict[i] = [nEmbed,negativeRate,winSize,minCount,learning]
    sg = mySkipGram(sentences,nEmbed, negativeRate, winSize, minCount, learning)
    sg.train()
    loss = sg.loss[-1]
    if loss < min_loss:
        min_loss = loss
        best_estimator = [nEmbed, negativeRate, winSize, minCount, learning]
        print("up to now, best estimator:", best_estimator)
        print("with loss", min_loss)
    loss_values[i] = loss
  else:
    continue


100%|██████████| 1000/1000 [14:26<00:00,  1.15it/s]
100%|██████████| 1000/1000 [02:20<00:00,  7.13it/s]
100%|██████████| 1000/1000 [33:22<00:00,  2.00s/it]
100%|██████████| 1000/1000 [06:34<00:00,  2.54it/s]
100%|██████████| 1000/1000 [03:00<00:00,  5.55it/s]
100%|██████████| 1000/1000 [02:43<00:00,  6.11it/s]
100%|██████████| 1000/1000 [05:01<00:00,  3.32it/s]
100%|██████████| 1000/1000 [04:24<00:00,  3.78it/s]
100%|██████████| 1000/1000 [04:55<00:00,  3.38it/s]
100%|██████████| 1000/1000 [01:21<00:00, 12.23it/s]
 18%|█▊        | 176/1000 [05:50<33:09,  2.41s/it]