In [137]:
import re
from collections import Counter

import numpy as np
import pandas as pd

# dataset imported from
# NOTE(review): the dataset source was never filled in and csv_location is an
# empty path, so pd.read_csv below cannot succeed until a real CSV path is set.
csv_location = r''
# Only the 'text' column of the CSV is loaded.
twitters = pd.read_csv(csv_location, usecols=['text'])
# NOTE(review): this bare expression is not the last statement of the cell, so
# the notebook does not display it.
twitters

# Number of most-frequent words kept as the vocabulary by preprocess().
vocabSize = 1000

In [138]:
def preprocess(texts, vocabSize):
    """Clean raw texts and restrict them to the vocabSize most frequent words.

    Cleaning, per text: drop @mentions and http... URLs, turn every whitespace
    character into a plain space, delete everything that is not an ASCII
    letter or a space, collapse runs of spaces, strip, and lower-case.

    Parameters
    ----------
    texts : iterable
        Raw texts. Entries that are not ``str`` (e.g. NaN produced by
        ``pd.read_csv`` for empty rows) are treated as empty text.
    vocabSize : int
        Number of most frequent words to keep; values <= 0 keep no word.

    Returns
    -------
    vocabText : list of str
        One cleaned text per input entry (same order, same length), words
        separated by single spaces, every out-of-vocabulary word replaced by
        the placeholder ``'0'``.
    smallWordCount : dict
        ``{word: count}`` for the kept words, most frequent first.
    """
    ppt = []
    for text in texts:
        if not isinstance(text, str):
            # re.sub raises TypeError on non-strings such as NaN.
            text = ''
        # removing mentions (everything from @ up to the next whitespace)
        filteredText = re.sub(r'@\S+', '', text)
        # removing URLs
        filteredText = re.sub(r'http\S+', '', filteredText)
        # Newlines/tabs are word separators: convert them to spaces *before*
        # stripping non-letters, otherwise "hello\nworld" becomes "helloworld".
        filteredText = re.sub(r'\s', ' ', filteredText)
        # removing everything that is not an ASCII letter or a space
        filteredText = re.sub(r'[^a-zA-Z ]', '', filteredText)
        # collapsing repeated spaces and trimming both ends
        filteredText = re.sub(' +', ' ', filteredText).strip()
        ppt.append(filteredText.lower())

    # Word frequencies over the whole corpus, most frequent first; ties keep
    # first-occurrence order.
    # NOTE: splitting on a single space means an empty text contributes one
    # '' token, which is counted like any other word. This is kept on purpose
    # so the counts stay consistent with code that later re-splits vocabText
    # with split(' ').
    wordCount = Counter(' '.join(ppt).split(' ')).most_common()

    # keeping only the vocabSize most frequent words and their counts
    smallWordCount = dict(wordCount[:max(vocabSize, 0)])

    # replacing every discarded word by the placeholder '0'
    vocabText = [
        ' '.join(word if word in smallWordCount else '0'
                 for word in sentence.split(' '))
        for sentence in ppt
    ]
    return vocabText, smallWordCount

In [139]:
def words2int(texts):
    """Encode space-separated texts as lists of integer word ids.

    Every distinct token except the out-of-vocabulary placeholder ``'0'`` gets
    an id ``0 .. n-1`` in sorted order; ``'0'`` always gets the last id ``n``,
    whether or not it occurs in ``texts``. No id is ever negative, so ids can
    be used directly as array indices.

    Parameters
    ----------
    texts : list of str
        Texts whose tokens are separated by single spaces.

    Returns
    -------
    intTexts : list of list of int
        The texts with every token replaced by its id.
    d : dict
        token -> id.
    dS : dict
        id -> token (inverse of ``d``).
    """
    oov = '0'
    # Tokens are obtained with split(' ') (not split()), so an empty text
    # yields the single token '' and is encoded like any other token.
    vocabulary = sorted(
        {word for sentence in texts for word in sentence.split(' ')} - {oov}
    )
    d = {word: index for index, word in enumerate(vocabulary)}
    # assigning '0' the last position so it never collides with a real word
    d[oov] = len(vocabulary)

    intTexts = [[d[word] for word in sentence.split(' ')] for sentence in texts]
    dS = {v: k for k, v in d.items()}
    return intTexts, d, dS

In [140]:
def shouldKeepWord(word, sample, wordCount, corpusSize):
    """Randomly decide whether one occurrence of ``word`` is kept (subsampling).

    The out-of-vocabulary placeholder ``'0'`` is never kept. Any other word is
    kept with probability ``(sqrt(z / sample) + 1) * sample / z`` where
    ``z = wordCount[word] / corpusSize``; values above 1 mean "always keep".

    Parameters
    ----------
    word : str
        The word to test; must be a key of ``wordCount`` unless it is ``'0'``.
    sample : float
        Subsampling threshold.
    wordCount : dict
        word -> number of occurrences.
    corpusSize : int
        Total number of word occurrences used to normalise the count.

    Returns
    -------
    bool
        True when the occurrence is kept.
    """
    # out-of-vocabulary words are always dropped
    if word == '0':
        return False

    frequency = wordCount[word] / corpusSize
    keepProbability = (np.sqrt(frequency / sample) + 1) * sample / frequency

    # one uniform draw in [0, 1) per call, from numpy's global random state
    draw = np.random.uniform(0, 1, 1)
    return bool(keepProbability > draw)

In [None]:
def getTrainingData(processedText, vocabSize, wordCount, ISdic, windowSize=5,
                    sample=0.001, maxSamples=100000):
    """Build one-hot (center word, context word) training pairs.

    Each sentence is first subsampled with ``shouldKeepWord``; the pairs are
    then formed *inside the subsampled sentence*, so a center word and its
    contexts always come from the same token sequence.

    Parameters
    ----------
    processedText : list of list of int
        Sentences encoded as integer word ids.
    vocabSize : int
        Size of the one-hot vectors. Ids outside ``0 .. vocabSize - 1`` (such
        as an out-of-vocabulary id equal to ``vocabSize``) are skipped.
    wordCount : dict
        word -> number of occurrences, forwarded to ``shouldKeepWord``.
    ISdic : dict
        id -> word, used to look words up in ``wordCount``.
    windowSize : int
        Number of context words taken on each side of the center word.
    sample : float
        Subsampling threshold forwarded to ``shouldKeepWord``.
    maxSamples : int
        Stop after the first sentence that brings the number of pairs above
        this value.

    Returns
    -------
    X, y : list of numpy.ndarray
        Parallel lists; ``X[k]`` is the one-hot row of a center word and
        ``y[k]`` the one-hot row of one of its context words.
    """
    X = []
    y = []
    # Row i is the one-hot vector of the word whose integer id is i.
    oneHotTokens = np.eye(vocabSize)

    corpusSize = sum(wordCount.values())

    def isEncodable(token):
        # Ids outside the table would raise IndexError or, when negative,
        # silently alias another word's row.
        return 0 <= token < vocabSize

    for sentence in processedText:
        # subsampling of frequent words ('0' placeholders are dropped by
        # shouldKeepWord)
        kept = [word for word in sentence
                if shouldKeepWord(ISdic[word], sample, wordCount, corpusSize)]

        for i, centerWord in enumerate(kept):
            if not isEncodable(centerWord):
                continue
            # contexts to the right of the center word, nearest first
            for j in range(i + 1, min(len(kept), i + windowSize + 1)):
                if isEncodable(kept[j]):
                    X.append(oneHotTokens[centerWord])
                    y.append(oneHotTokens[kept[j]])
            # contexts to the left of the center word, nearest first
            for j in range(i - 1, max(-1, i - windowSize - 1), -1):
                if isEncodable(kept[j]):
                    X.append(oneHotTokens[centerWord])
                    y.append(oneHotTokens[kept[j]])
        if len(X) > maxSamples:
            break
    return X, y

Formula for table: $P(w_i) = \frac{f(w_i)^{3/4}}{\sum \limits _{j=0}^{n}(f(w_j)^{3/4})}$

In [142]:
def createTableForNegSampling(wordCount, tableSize=100000000):
    """Build the unigram table used to draw negative samples.

    Each word fills a share of the table proportional to
    ``count ** (3/4) / sum(count_j ** (3/4))``, so drawing a uniformly random
    entry samples words from that distribution.

    Parameters
    ----------
    wordCount : dict
        word -> number of occurrences.
    tableSize : int
        Target number of entries. The returned table can differ slightly in
        length because every word's share is converted to a whole number of
        slots.

    Returns
    -------
    numpy.ndarray
        1-D ``object`` array of words, grouped by word in ``wordCount`` order.
        Empty when ``wordCount`` is empty or all counts are zero.
    """
    words = list(wordCount.keys())
    weights = [pow(count, 3/4) for count in wordCount.values()]
    powSum = sum(weights)
    if not words or powSum == 0:
        return np.array([], dtype=object)

    # Number of slots per word: the word's probability times the table size,
    # rounded to two decimals and then truncated to an integer.
    slots = [int(round(weight / powSum * tableSize, 2)) for weight in weights]

    # np.repeat lays the words out in one vectorised step instead of filling
    # a pre-allocated tableSize-long table entry by entry in Python.
    return np.repeat(np.array(words, dtype=object), slots)

In [152]:
# ppt: cleaned tweets restricted to the vocabSize most frequent words;
# wordCount: occurrence count of each kept word.
ppt, wordCount = preprocess(twitters['text'], vocabSize)

KeyboardInterrupt: 

In [144]:
# Creating the integer-encoded tweets plus the String-to-Int (SIdic) and
# Int-to-String (ISdic) dictionaries.
intTweets, SIdic, ISdic = words2int(ppt)

In [163]:
# X: one-hot center words, y: one-hot context words (parallel lists).
X, y = getTrainingData(intTweets, vocabSize, wordCount, ISdic)

In [164]:
# Unigram table for negative sampling, built with the default table size.
table = createTableForNegSampling(wordCount)

In [165]:
# Executes neuralNet.ipynb in this kernel; presumably this is where Network,
# SGD and setInput used by the cells below are defined — TODO confirm.
%run neuralNet.ipynb

In [166]:
# the network has negative sampling
# Layer sizes: vocabSize inputs, 10 hidden units, vocabSize outputs.
net = Network([vocabSize, 10, vocabSize])
batchSize = 1000
# One "epoch" per batch that fits in the training set.
# NOTE(review): the exact meaning of nEpochs depends on SGD, which is not
# defined in this notebook — confirm against neuralNet.ipynb.
nEpochs = len(X)//batchSize
net = SGD(net, X, y, nNegSamples=5, unigramTable=table, SIdic=SIdic, batchSize=batchSize, nEpochs=nEpochs, learningRate=0.1, lamb=0)

learningRate: 0.1 epochs: 0 error: 1.0014028116375011
learningRate: 0.1 epochs: 1000 error: 0.9746208042432293
learningRate: 0.1 epochs: 2000 error: 0.9746710971635686
learningRate: 0.1 epochs: 3000 error: 0.9743738500364632
learningRate: 0.1 epochs: 4000 error: 0.9742430125472216
learningRate: 0.1 epochs: 5000 error: 0.9743919097601972
learningRate: 0.1 epochs: 6000 error: 0.9743295651572804
learningRate: 0.1 epochs: 7000 error: 0.974274993639156
learningRate: 0.1 epochs: 8000 error: 0.9741443600852878
learningRate: 0.1 epochs: 9000 error: 0.9744101971686799
min error: 0.9739590537171726 on epoch: 9049


In [None]:
def getOneWordVec(net: Network, word: str, SIdic) -> np.array:
    """Return the first-layer weights stored for a single word.

    Parameters
    ----------
    net : Network
        Network object exposing ``z`` and ``w``.
    word : str
        Word to look up; must be a key of ``SIdic``.
    SIdic : dict
        word -> integer id.

    Returns
    -------
    np.array
        ``net.w[0][id]`` for the id of ``word``, read after the word's one-hot
        vector has been passed to ``setInput``.
    """
    index = SIdic[word]
    # one-hot vector as long as the first axis of net.z[0]
    inputSize = np.shape(net.z[0])[0]
    oneHot = np.zeros(inputSize)
    oneHot[index] = 1
    # NOTE(review): setInput is defined outside this notebook; presumably it
    # loads the vector into the network's input layer — confirm.
    net = setInput(net, oneHot)
    return net.w[0][index]

def getWordVec(net: Network) -> np.array:
    """Return the first-layer weights of the network for all words at once.

    The result is ``net.w[0]``, the same array that ``getOneWordVec`` indexes
    with a word id to obtain a single word's vector.

    The previous body looked up ``SIdic[word]`` although this function has no
    ``word`` parameter, which raised NameError; that lookup and the one-hot
    input derived from it are not needed to read the weights.

    Parameters
    ----------
    net : Network
        Network object exposing ``w``.

    Returns
    -------
    np.array
        ``net.w[0]``.
    """
    return net.w[0]

In [178]:
# Single-word lookups go through getOneWordVec(net, word, SIdic);
# getWordVec only accepts the network.
wv1 = getOneWordVec(net, 'thrones', SIdic)
wv2 = getOneWordVec(net, 'game', SIdic)
print(wv1)
print(wv2)

[0.070181   0.11089551 0.12523967 0.06543846 0.07075161 0.20877218
 0.18762408 0.04211152 0.07289377 0.15005178]
[0.1575588  0.07987772 0.13130389 0.11445485 0.06520042 0.21607862
 0.12691169 0.03123916 0.10374024 0.13316495]
