In [2]:
import pandas as pd
import re
import numpy as np
import NegativeSampling as ns

# dataset imported from
twitters = pd.read_csv(r'C:\Users\Yann\Desktop\gotTwitter.csv', usecols=['text'])
twitters

vocabSize = 20000

In [13]:
def preprocess(texts, vocabSize=vocabSize):
    ppt = []

    for text in texts:
        # removing everything after @
        filteredText = re.sub(r'@\S+', "", text);
        # removing URLS
        filteredText = re.sub(r'http\S+', '', filteredText)
        filteredText = re.sub(r'http\S+', '', filteredText)
        # removing everything that is not a letter or space
        filteredText = re.sub(r'[^\w ]', u'', filteredText)
        filteredText = re.sub(r'[^a-zA-Z\s]', u'', filteredText, flags=re.UNICODE)
        # removing trailing whitespaces and \n
        filteredText = re.sub(' +', ' ', filteredText)
        filteredText = filteredText.strip()
        filteredText = filteredText.replace('\n', '')
        # lower casing
        ppt.append(filteredText.lower())

    # keeping only the first vocabSize words of the text
    # first finding the vocabSize first elements
    t = ' '.join(ppt)
    t = t.split(' ')
    d = dict()
    for word in t:
        d[word] = 0
    for word in t:
        d[word] += 1
    wordCount = dict(sorted(d.items(), key=lambda item: item[1], reverse=True))
    # making a dictionary of words to keep or to discard (discarded words will be 0 in the dic)
    wordsKept = dict()
    i = 0
    for word in wordCount:
        if i < vocabSize:
            wordsKept[word] = word
        else:
            wordsKept[word] = '0'
        i += 1
    vocabText = []
    for sentence in ppt:
        t = sentence.split(' ')
        s = []
        for word in t:
            s.append(wordsKept[word])
        vocabText.append(' '.join(s))
    return vocabText, wordCount

In [15]:
def words2int(texts):
    t = ' '.join(texts)
    t = t.split(' ')
    d = dict([(y,x+1) for x,y in enumerate(sorted(set(t)))])
    intTexts = []
    for sentence in texts:
        t = []
        for word in sentence:
            if word == ' ':
                continue
            t.append(d[word])
        intTexts.append(t)
    dS = {v: k for k, v in d.items()}
    return intTexts, d, dS 

In [17]:
ppt, wordCount = preprocess(twitters['text'])

In [19]:
ppt[:100]

['on game of thrones x breakdown night king symbol explained',
 'on ups and downs from game of thrones',
 'liked on youtube ups and downs from game of thrones',
 'liked on youtube game of thrones x breakdown night king symbol explained',
 'unpopular opinion game of thrones edition',
 'reddit is bursting with game of thrones theories about bran stark sheknows',
 'reddit is bursting with game of thrones theories about bran stark sheknows',
 'game of thrones season reddit theories that will rock your world elite daily',
 'what reddits game of thrones fans really thought of the season premiere usa today',
 'game of thrones star john bradley explains spoiler from the s premiere comics 0',
 'game of thrones star john bradley explains spoiler from the s premiere comics 0 0',
 'dark fantasy comic 0 arrives in time for game of thrones withdrawal comics 0 0',
 'how game of thrones should end',
 'game of thrones cast works in call center for kimmels game of phones',
 'how do you think the show is

In [21]:
intTweets, ISdic, SIdic = words2int(ppt)

In [23]:
intTweets[3]

[9345,
 8188,
 9106,
 5095,
 3970,
 11538,
 11037,
 19847,
 11538,
 17910,
 16511,
 17910,
 1173,
 5095,
 6687,
 2,
 10019,
 5095,
 11538,
 5831,
 16511,
 7449,
 13310,
 11538,
 11037,
 5095,
 14345,
 19826,
 1173,
 13310,
 5095,
 2,
 9106,
 3970,
 11538,
 19084,
 11037,
 11037,
 8188,
 6687,
 7449,
 16511,
 9106,
 8188,
 11037,
 6687,
 14345,
 19847,
 10019,
 1173,
 11538,
 9345,
 5095,
 19826,
 12008,
 9345,
 2,
 8188,
 11037,
 5095,
 3970]

In [None]:
def shouldKeepWord(word, sample, wordCount, corpusSize):
    n = wordCount[word]
    z = n/corpusSize
    p = (np.sqrt(z/sample) + 1) * sample/z
    
    r = np.random.uniform(0,1,1)
    if p > r:
        return True
    return False

In [38]:
def getTrainingData(processedText, vocabSize=vocabSize, windowSize=5, wordCount, corpusSize):
    X = []
    y = []
    # the position in the oneHotToken of a word will be it's integer representation
    # for example: word "table" -> SIdic['table'] = 15 -> oneHotTokens[15] represent the oneHotToken of table
    oneHotTokens = np.zeros((vocabSize, vocabSize))
    np.fill_diagonal(oneHotTokens, 1)
    for sentence in processedText:
        s = []
        
        # implementing subsampling of frequent words
        for word in sentence:
            if shouldKeepWord(word, 0.001, wordCount, corpusSize):
                s.append(word)
        
        for i in range(len(s)):
            centerWord = s[i]
            j = i
            while j < i + windowSize and j < len(sentence)-1:
                j += 1
                X.append(oneHotTokens[centerWord])
                y.append(oneHotTokens[sentence[j]])
            j = i
            while j > i - windowSize and j >= 1:
                j -= 1
                X.append(oneHotTokens[centerWord])
                y.append(oneHotTokens[sentence[j]])
        if len(X) > 10000000:
            break
    return X, y

Formula for table: $P(w_i) = \frac{f(w_i)^{3/4}}{\sum \limits _{j=0}^{n}(f(w_j)^{3/4})}$

In [None]:


def createTableForNegSampling(corpus, wordCount):
    # creating a 100M size table 
    table = np.array(100000000)
    
    probabilities = dict()
    totalWords = 0
    for word, count in wordCount:
        totalWords += count
        
    # calculating the probabilities for each word
    for word, count in wordCount:
        probabilites[word] = pow(count, 3/4)/ pow(totalWords, 3/4)
    
    
    # creating a dictionary that says how many indices in the table a certain word has
    nWordsOnTable = 
    
    return table

In [39]:
X, y = getTrainingData(intTweets)

In [49]:
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(output_dim=300, input_dim=vocabSize, activation= 'relu'))
model.add(Dense(output_dim=vocabSize, activation= 'softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X, y, epochs=100, batchSize=(len(X)//100))

array([0., 0., 0., ..., 0., 0., 0.])