In [13]:
import json

file_name = 'delta'

f = open(file_name+'-en-index-ready.txt', 'r')
data = json.loads(f.read())
f.close()
print "Loaded ", len(data), " tweets"

Loaded  271644  tweets


In [14]:
import nltk
from nltk.tokenize import TweetTokenizer
import re
import gensim
from time import time
from gensim import corpora, models, similarities
import numpy as np
import pandas as pd
import multiprocessing

class TweetTopicExtractor:
    def __init__(self, data):
        self.raw_data = data
        self.documents = None
        self.document_corpus = None
        self.lda_model = None
        self.doc2vec_model = None
        self.gensim_dict = None
    
    def __cleanTokenize(self):
        tweet_tokenizer = TweetTokenizer()
        stopwords = nltk.corpus.stopwords.words("english")
        for tweet in self.raw_data:
            text_arr = tweet_tokenizer.tokenize(tweet["tweet_text"])
            text_arr = [word.lower() for word in text_arr if word.lower() not in stopwords and re.search('[a-zA-Z]', word)]
            tweet["tweet_text"] = text_arr
    
    def __prepareDocuments(self):
        documents = []
        corp = []
        LabeledSentence = gensim.models.doc2vec.LabeledSentence
        for tweet in self.raw_data:
            documents.append(LabeledSentence(tweet["tweet_text"], [tweet["tweet_id"]]))
            corp.append(tweet["tweet_text"])
        self.documents = documents
        self.document_corpus = corp
    
    def __getBagOfWords(self):
        self.gensim_dict = corpora.Dictionary(self.document_corpus)
        self.gensim_dict.filter_extremes(no_below=1, no_above=0.8)
        return [self.gensim_dict.doc2bow(text) for text in self.document_corpus]
    
    def preprocess(self):
        t_pp = time()
        self.__cleanTokenize()
        self.__prepareDocuments()
        print "Cleaned and loaded ", len(self.documents), " documents (tweets) in ", round(time()-t_pp,2)," seconds"
    
    def buildLda(self, num_topics=4, num_passes=10):
        t_lda = time()
        bow_corpus = self.__getBagOfWords()
        self.lda_model = models.LdaModel(bow_corpus, num_topics=num_topics, id2word=self.gensim_dict, update_every=5, chunksize=100010, passes=num_passes)
        print "Built LDA in ", round(time()-t_lda,2)," seconds"
    
    def printLdaSummary(self):
        print ""
        print "Topics: Words that define the topic + probability with which word contributes to topic"
        print self.lda_model.show_topics()
        
        print ""
        print "Topics: Just words that define the topic"
        '''
        topics_matrix = self.lda_model.show_topics(formatted=False, num_words=20)
        topics_matrix = np.array(topics_matrix)

        topic_words = topics_matrix[:,:,1]
        for i in topic_words:
            print [str(word) for word in i]
            print ""
        '''
        print ""
        print "thisObject.lda_model.xxxx() to explore further"
    
    def buildDoc2Vec(self, num_passes=10, parallel=True):
        cores = 0
        if parallel:
            cores = multiprocessing.cpu_count()
        t_dv = time()
        self.doc2vec_model = models.Doc2Vec(size=100, window=10, dm=0, dbow_words=1, workers=cores, min_count=1, negative=5)
        self.doc2vec_model.build_vocab(self.documents)
        for i in range(num_passes):
            self.doc2vec_model.train(self.documents)
            if i%10 == 0:
                print "Iteration ",i
        print ""
        print "Built Doc2Vec in ",round(time()-t_dv,2), " seconds"
    
    def printDoc2VecSummary(self):
        print ""
        print "Example: A tweet most similar to '#xmas' can be found like this ->"
        print self.doc2vec_model.most_similar("#xmas")
        print ""
        print "thisObject.doc2vec_model.xxxx() to explore further"
    
    def saveLdaModel(self, path_to_lda='cooper.lda'):
        self.lda_model.save(path_to_lda)
        print "Saved"
    
    def saveDoc2VecModel(self, path_to_doc2vec='cooper.doc2vec'):
        self.doc2vec_model.save(path_to_doc2vec)
        print "Saved"
    
    def loadLdaModel(self, path_to_lda='cooper.lda'):
        self.lda_model = models.LdaModel.load(path_to_lda, mmap='r')
        print "Loaded"
    
    def loadDoc2VecModel(self, path_to_doc2vec='cooper.doc2vec'):
        self.doc2vec_model = models.Doc2Vec.load(path_to_doc2vec, mmap='r')
        print "Loaded"

In [10]:
tte = TweetTopicExtractor(data)
tte.preprocess()

Cleaned and loaded  100001  documents (tweets) in  10.28  seconds


In [11]:
tte.buildDoc2Vec(num_passes=100)
tte.printDoc2VecSummary()


Built Doc2Vec in  1735.08  seconds

Example: A tweet most similar to '#Xmas' can be found like this ->


KeyError: "word '#Xmas' not in vocabulary"

In [5]:
tte.buildLda(num_topics=10)
tte.printLdaSummary()

Built LDA in  2065.32  seconds

Topics: Words that define the topic + probability with which word contributes to topic
[(0, u'0.048*"#christmas" + 0.033*"#Christmas" + 0.018*"Days" + 0.018*"Christmas" + 0.010*"one" + 0.009*"time" + 0.008*"Get" + 0.008*"I\'ve" + 0.007*"involved" + 0.007*"entered"'), (1, u'0.092*"#Christmas" + 0.020*"December" + 0.013*"1st" + 0.010*"Christmas" + 0.010*"time" + 0.009*"It\'s" + 0.008*"Happy" + 0.008*"us" + 0.007*"The" + 0.007*"#December"'), (2, u'0.089*"#christmas" + 0.073*"#christmascookies" + 0.073*"#santa\'schimney" + 0.073*"#sweeper" + 0.073*"#christmascrush" + 0.073*"#christmasmatch3" + 0.021*"#Christmas" + 0.006*"Christmas" + 0.002*"December" + 0.002*"The"'), (3, u'0.074*"#christmas" + 0.015*"#Christmas" + 0.012*"The" + 0.012*"Christmas" + 0.010*"#Xmas" + 0.010*"#sale" + 0.007*"#december" + 0.006*"one" + 0.006*"amazing" + 0.005*"Share"'), (4, u'0.076*"#christmas" + 0.036*"Christmas" + 0.015*"#Christmas" + 0.008*"#Xmas" + 0.008*"#sale" + 0.006*"Decemb

In [6]:
tte.saveModels()

Saved


In [12]:
print tte.doc2vec_model.most_similar('#xmas')

[(u'know', 0.42546698451042175), (u'amazing', 0.4127165675163269), (u'right', 0.3982900083065033), (u'#disney', 0.33922475576400757), (u'#crafts', 0.3272498846054077), (u'#thanksgiving', 0.3166525661945343), (u'#decorations', 0.30977874994277954), (u'special', 0.3092844784259796), (u'almonds', 0.3067658543586731), (u'excited', 0.30661681294441223)]


In [9]:
x = TweetTopicExtractor(data)

In [10]:
x.loadModels()

Loaded


In [15]:
tte = TweetTopicExtractor(data)
tte.preprocess()

Cleaned and loaded  271644  documents (tweets) in  30.22  seconds


In [16]:
tte.buildDoc2Vec(num_passes=50)

Iteration  0
Iteration  10
Iteration  20
Iteration  30
Iteration  40

Built Doc2Vec in  5562.25  seconds


In [17]:
tte.saveDoc2VecModel('delta.cooper')

Saved


In [18]:
import numpy as np
import gzip
import cPickle

g = open(file_name+'-en-index-ready.txt','r')
tweets = json.loads(g.read())
g.close()

vectors = []
labels = []

for t in tweets:
    v = tte.doc2vec_model.docvecs[t['tweet_id']]
    vectors.append(v)
    labels.append(t['tweet_id'])
vectors = np.asarray(vectors)
labels = np.asarray(labels)

print "Vectors loaded: ",vectors.shape," and ",labels.shape

g = open(file_name+'-vectors.pkl','wb')
cPickle.dump((vectors,labels),g)
g.close()
print "Done"

Vectors loaded:  (271644, 100)  and  (271644,)
Done
