In [317]:
import pandas as pd
import re
import numpy as np

# dataset imported from
twitters = pd.read_csv(r'C:\Users\Yann\Desktop\gotTwitter.csv', usecols=['text'])
twitters

vocabSize = 10000

In [318]:
def preprocess(texts, vocabSize=vocabSize):
    ppt = []

    for text in texts:
        # removing everything after @
        filteredText = re.sub(r'@\S+', "", text);
        # removing URLS
        filteredText = re.sub(r'http\S+', '', filteredText)
        filteredText = re.sub(r'http\S+', '', filteredText)
        # removing everything that is not a letter or space
        filteredText = re.sub(r'[^\w ]', u'', filteredText)
        filteredText = re.sub(r'[^a-zA-Z\s]', u'', filteredText, flags=re.UNICODE)
        # removing trailing whitespaces and \n
        filteredText = re.sub(' +', ' ', filteredText)
        filteredText = filteredText.strip()
        filteredText = filteredText.replace('\n', '')
        # lower casing
        ppt.append(filteredText.lower())

    # keeping only the first vocabSize words of the text
    # first finding the vocabSize first elements
    t = ' '.join(ppt)
    t = t.split(' ')
    d = dict()
    for word in t:
        d[word] = 0
    for word in t:
        d[word] += 1
    wordCount = dict(sorted(d.items(), key=lambda item: item[1], reverse=True))
    # making a dictionary of words to keep or to discard (discarded words will be 0 in the dic)
    wordsKept = dict()
    i = 0
    for word in wordCount:
        if i < vocabSize:
            wordsKept[word] = word
        else:
            wordsKept[word] = '0'
        i += 1
    vocabText = []
    for sentence in ppt:
        t = sentence.split(' ')
        s = []
        for word in t:
            s.append(wordsKept[word])
        vocabText.append(' '.join(s))
    # creating a new dict that keeps only vocabSize words
    i = 0
    smallWordCount = dict()
    for word, count in wordCount.items():
        smallWordCount[word] = count
        i += 1
        if i > vocabSize:
            break
    # returning the preprocessed text with only the first vocabSize most frequent words
    # returning the count of that vocabulary
    return vocabText, smallWordCount

In [319]:
def words2int(texts):
    t = ' '.join(texts)
    t = t.split(' ')
    d = dict([(y,x+1) for x,y in enumerate(sorted(set(t)))])
    intTexts = []
    for sentence in texts:
        t = []
        for word in sentence:
            if word == ' ':
                continue
            t.append(d[word])
        intTexts.append(t)
    dS = {v: k for k, v in d.items()}
    return intTexts, d, dS 

In [320]:
def shouldKeepWord(word, sample, wordCount, corpusSize):
    # if its a word out of the vocabulary
    if word == '0':
        return False
    
    n = wordCount[word]
    z = n/corpusSize
    p = (np.sqrt(z/sample) + 1) * sample/z
    
    r = np.random.uniform(0,1,1)
    if p > r:
        return True
    return False

In [321]:
def getTrainingData(processedText, vocabSize, wordCount, ISdic, windowSize=5):
    X = []
    y = []
    # the position in the oneHotToken of a word will be it's integer representation
    # for example: word "table" -> SIdic['table'] = 15 -> oneHotTokens[15] represent the oneHotToken of table
    oneHotTokens = np.zeros((vocabSize, vocabSize))
    np.fill_diagonal(oneHotTokens, 1)
    
    corpusSize = 0
    for word, count in wordCount.items():
        corpusSize += count
    for sentence in processedText:
        s = []
        
        # implementing subsampling of frequent words
        for word in sentence:
            if shouldKeepWord(ISdic[word], 0.001, wordCount, corpusSize):
                s.append(word)
        
        for i in range(len(s)):
            centerWord = s[i]
            j = i
            while j < i + windowSize and j < len(sentence)-1:
                j += 1
                X.append(oneHotTokens[centerWord])
                y.append(oneHotTokens[sentence[j]])
            j = i
            while j > i - windowSize and j >= 1:
                j -= 1
                X.append(oneHotTokens[centerWord])
                y.append(oneHotTokens[sentence[j]])
        if len(X) > 10000000:
            break
    return X, y

Formula for table: $P(w_i) = \frac{f(w_i)^{3/4}}{\sum \limits _{j=0}^{n}(f(w_j)^{3/4})}$

In [322]:
def createTableForNegSampling(wordCount, tableSize=100000000):
    # creating a 100M size table 
    table = np.array(tableSize*[0], dtype=object)
    
    probabilities = dict()
    powSum = 0
    for word, count in wordCount.items():
        powSum += pow(count, 3/4)
        
    # calculating the probabilities for each word
    for word, count in wordCount.items():
        probabilities[word] = pow(count, 3/4)/ powSum
    currentIndex = 0
    for word, prob in probabilities.items():
        nIndexes = int(prob * tableSize)
        for i in range(currentIndex, currentIndex + nIndexes):
            table[i] = word
        
        currentIndex += nIndexes
    
    return table

In [323]:
ppt, wordCount = preprocess(twitters['text'])

In [324]:
# creating the String to Int and Int to String dictionarys
intTweets, SIdic, ISdic = words2int(ppt)

In [325]:
X, y = getTrainingData(intTweets, vocabSize, wordCount, ISdic)

In [326]:
table = createTableForNegSampling(wordCount)

In [327]:
%run neuralNet.ipynb

In [None]:
I need to get the backprop + setInput time down 
I need to put less words and less features because its taking too long
Right now it takes 10s to run one setInput and backProp
If i get it down to 0.1s and keep the X length to 1M it will take 28h to complete

In [328]:
# the network has negative sampling
net = Network([vocabSize, 300, vocabSize])
batchSize = 20
nEpochs = len(X)//batchSize
net = SGD(net, X, y, nNegSamples=5, unigramTable=table, SIdic=SIdic, batchSize=batchSize, nEpochs=nEpochs, learningRate=1, lamb=0.1)

setInput time: 2.560246467590332
backProp time: 7.720166444778442
setInput time: 2.9293668270111084
backProp time: 7.721383094787598
setInput time: 2.638545513153076
backProp time: 8.51993989944458
setInput time: 2.75927472114563
backProp time: 7.5886757373809814
setInput time: 2.5367913246154785
backProp time: 8.014785766601562
setInput time: 2.7380051612854004
backProp time: 7.711792945861816
setInput time: 2.9044008255004883
backProp time: 7.7179365158081055
setInput time: 2.887101173400879
backProp time: 7.879638195037842
setInput time: 2.6292009353637695
backProp time: 7.797081470489502
setInput time: 2.3496010303497314
backProp time: 7.685680150985718
setInput time: 2.6477255821228027
backProp time: 7.899622678756714
setInput time: 3.0142693519592285
backProp time: 8.744551420211792
setInput time: 3.035557985305786
backProp time: 7.777912855148315
setInput time: 2.458427906036377
backProp time: 7.824692249298096
setInput time: 2.6400742530822754
backProp time: 7.640575408935547
s

backProp time: 7.422397136688232
setInput time: 2.6404826641082764
backProp time: 8.450780868530273
setInput time: 2.6139862537384033
backProp time: 7.435505151748657
setInput time: 2.2635371685028076
backProp time: 7.688797950744629
learningRate: 1 epochs: 5 error: 0.003200000000000001, outputs: [0. 0. 0. ... 0. 0. 0.]
setInput time: 2.225393295288086
backProp time: 6.569661855697632
setInput time: 2.225437641143799
backProp time: 6.396074295043945
setInput time: 2.2096099853515625
backProp time: 6.61565089225769
setInput time: 2.226412773132324
backProp time: 6.441826105117798
setInput time: 2.257704734802246
backProp time: 6.490561485290527
setInput time: 2.1951444149017334
backProp time: 6.643519401550293
setInput time: 2.247711420059204
backProp time: 6.673842668533325
setInput time: 2.209752321243286
backProp time: 6.461698055267334
setInput time: 2.247340679168701
backProp time: 6.519913911819458
setInput time: 2.3844034671783447
backProp time: 6.553394556045532
setInput time: 2

  return np.exp(n)/sum(np.exp(n))
  return np.exp(n)/sum(np.exp(n))


learningRate: 1 epochs: 9 error: nan, outputs: [0. 0. 0. ... 0. 0. 0.]
setInput time: 2.2512943744659424
backProp time: 6.659574031829834
setInput time: 2.3209564685821533
backProp time: 6.673219919204712
setInput time: 2.2659754753112793
backProp time: 6.744943618774414
setInput time: 2.2922024726867676
backProp time: 6.786197662353516
setInput time: 2.40114164352417
backProp time: 7.278871059417725
setInput time: 2.421099901199341
backProp time: 6.843559741973877
setInput time: 2.264939785003662
backProp time: 6.611613750457764
setInput time: 2.4092109203338623
backProp time: 6.942583322525024
setInput time: 2.3325140476226807
backProp time: 6.506432771682739
setInput time: 2.3352174758911133
backProp time: 7.350411415100098
setInput time: 2.4385180473327637
backProp time: 6.737772226333618
setInput time: 2.391016960144043
backProp time: 6.781533479690552
setInput time: 2.3574976921081543
backProp time: 6.642127990722656
setInput time: 2.2855658531188965
backProp time: 6.729708671569

KeyboardInterrupt: 