In [47]:
import pandas as pd
import re
import numpy as np

# dataset imported from
twitters = pd.read_csv(r'C:\Users\Yann\Desktop\gotTwitter.csv', usecols=['text'])
twitters

vocabSize = 10000

In [48]:
def preprocess(texts, vocabSize=vocabSize):
    ppt = []

    for text in texts:
        # removing everything after @
        filteredText = re.sub(r'@\S+', "", text);
        # removing URLS
        filteredText = re.sub(r'http\S+', '', filteredText)
        filteredText = re.sub(r'http\S+', '', filteredText)
        # removing everything that is not a letter or space
        filteredText = re.sub(r'[^\w ]', u'', filteredText)
        filteredText = re.sub(r'[^a-zA-Z\s]', u'', filteredText, flags=re.UNICODE)
        # removing trailing whitespaces and \n
        filteredText = re.sub(' +', ' ', filteredText)
        filteredText = filteredText.strip()
        filteredText = filteredText.replace('\n', '')
        # lower casing
        ppt.append(filteredText.lower())

    # keeping only the first vocabSize words of the text
    # first finding the vocabSize first elements
    t = ' '.join(ppt)
    t = t.split(' ')
    d = dict()
    for word in t:
        d[word] = 0
    for word in t:
        d[word] += 1
    wordCount = dict(sorted(d.items(), key=lambda item: item[1], reverse=True))
    # making a dictionary of words to keep or to discard (discarded words will be 0 in the dic)
    wordsKept = dict()
    i = 0
    for word in wordCount:
        if i < vocabSize:
            wordsKept[word] = word
        else:
            wordsKept[word] = '0'
        i += 1
    vocabText = []
    for sentence in ppt:
        t = sentence.split(' ')
        s = []
        for word in t:
            s.append(wordsKept[word])
        vocabText.append(' '.join(s))
    # creating a new dict that keeps only vocabSize words
    i = 0
    smallWordCount = dict()
    for word, count in wordCount.items():
        smallWordCount[word] = count
        i += 1
        if i >= vocabSize:
            break
    # returning the preprocessed text with only the first vocabSize most frequent words
    # returning the count of that vocabulary
    return vocabText, smallWordCount

In [49]:
def words2int(texts):
    t = ' '.join(texts)
    t = t.split(' ')
    d = dict([(y,x+1) for x,y in enumerate(sorted(set(t)))])
    intTexts = []
    for sentence in texts:
        t = []
        for word in sentence:
            if word == ' ':
                continue
            t.append(d[word])
        intTexts.append(t)
    dS = {v: k for k, v in d.items()}
    return intTexts, d, dS 

In [50]:
def shouldKeepWord(word, sample, wordCount, corpusSize):
    # if its a word out of the vocabulary
    if word == '0':
        return False
    
    n = wordCount[word]
    z = n/corpusSize
    p = (np.sqrt(z/sample) + 1) * sample/z
    
    r = np.random.uniform(0,1,1)
    if p > r:
        return True
    return False

In [51]:
def getTrainingData(processedText, vocabSize, wordCount, ISdic, windowSize=5):
    X = []
    y = []
    # the position in the oneHotToken of a word will be it's integer representation
    # for example: word "table" -> SIdic['table'] = 15 -> oneHotTokens[15] represent the oneHotToken of table
    oneHotTokens = np.zeros((vocabSize, vocabSize))
    np.fill_diagonal(oneHotTokens, 1)
    
    corpusSize = 0
    for word, count in wordCount.items():
        corpusSize += count
    for sentence in processedText:
        s = []
        
        # implementing subsampling of frequent words
        for word in sentence:
            if shouldKeepWord(ISdic[word], 0.001, wordCount, corpusSize):
                s.append(word)
        
        for i in range(len(s)):
            centerWord = s[i]
            j = i
            while j < i + windowSize and j < len(sentence)-1:
                j += 1
                X.append(oneHotTokens[centerWord])
                y.append(oneHotTokens[sentence[j]])
            j = i
            while j > i - windowSize and j >= 1:
                j -= 1
                X.append(oneHotTokens[centerWord])
                y.append(oneHotTokens[sentence[j]])
        if len(X) > 10000000:
            break
    return X, y

Formula for table: $P(w_i) = \frac{f(w_i)^{3/4}}{\sum \limits _{j=0}^{n}(f(w_j)^{3/4})}$

In [52]:
def createTableForNegSampling(wordCount, tableSize=100000000):
    # creating a 100M size table 
    table = np.array(tableSize*[0], dtype=object)
    
    probabilities = dict()
    powSum = 0
    for word, count in wordCount.items():
        powSum += pow(count, 3/4)
        
    # calculating the probabilities for each word
    for word, count in wordCount.items():
        probabilities[word] = pow(count, 3/4)/ powSum
    currentIndex = 0
    for word, prob in probabilities.items():
        nIndexes = int(prob * tableSize)
        for i in range(currentIndex, currentIndex + nIndexes):
            table[i] = word
        
        currentIndex += nIndexes
    
    return table

In [53]:
ppt, wordCount = preprocess(twitters['text'])

In [54]:
# creating the String to Int and Int to String dictionarys
intTweets, SIdic, ISdic = words2int(ppt)

In [55]:
X, y = getTrainingData(intTweets, vocabSize, wordCount, ISdic)

In [56]:
table = createTableForNegSampling(wordCount)

In [57]:
%run neuralNet.ipynb

I need to get the backprop + setInput time down 
I need to put less words and less features because its taking too long
Right now it takes 10s to run one setInput and backProp (2,5s setInput and 7,5s backProp)
If i get it down to 0.1s and keep the X length to 1M it will take 28h to complete

In [60]:
ISdic[10000]

'zone'

In [59]:
# the network has negative sampling
net = Network([vocabSize, 300, vocabSize])
batchSize = 100
nEpochs = len(X)//batchSize
net = SGD(net, X, y, nNegSamples=5, unigramTable=table, SIdic=SIdic, batchSize=batchSize, nEpochs=nEpochs, learningRate=1, lamb=0.1)

learningRate: 1 epochs: 0 error: 0.015436433573430509, outputs: [0. 0. 0. ... 0. 0. 0.]


KeyboardInterrupt: 