In [22]:
import json

f = open('christmas-en-index-ready.txt', 'r')
data = json.loads(f.read())
f.close()
print "Loaded ", len(data), " tweets"

Loaded  100001  tweets


In [23]:
import nltk
from nltk.tokenize import TweetTokenizer
import re
import gensim
from time import time
from gensim import corpora, models, similarities
import numpy as np
import pandas as pd

class TweetTopicExtractor:
    def __init__(self, data):
        self.raw_data = data
        self.documents = None
        self.document_corpus = None
        self.lda_model = None
        self.doc2vec_model = None
        self.gensim_dict = None
    
    def __cleanTokenize(self):
        tweet_tokenizer = TweetTokenizer()
        stopwords = nltk.corpus.stopwords.words("english")
        for tweet in self.raw_data:
            text_arr = tweet_tokenizer.tokenize(tweet["tweet_text"])
            text_arr = [word for word in text_arr if word not in stopwords and re.search('[a-zA-Z]', word)]
            tweet["tweet_text"] = text_arr
    
    def __prepareDocuments(self):
        documents = []
        corp = []
        LabeledSentence = gensim.models.doc2vec.LabeledSentence
        for tweet in self.raw_data:
            documents.append(LabeledSentence(tweet["tweet_text"], [tweet["tweet_id"]]))
            corp.append(tweet["tweet_text"])
        self.documents = documents
        self.document_corpus = corp
    
    def __getBagOfWords(self):
        self.gensim_dict = corpora.Dictionary(self.document_corpus)
        self.gensim_dict.filter_extremes(no_below=1, no_above=0.8)
        return [self.gensim_dict.doc2bow(text) for text in self.document_corpus]
    
    def preprocess(self):
        t_pp = time()
        self.__cleanTokenize()
        self.__prepareDocuments()
        print "Cleaned and loaded ", len(self.documents), " documents (tweets) in ", round(time()-t_pp,2)," seconds"
    
    def buildLda(self, num_topics=4, num_passes=10):
        t_lda = time()
        bow_corpus = self.__getBagOfWords()
        self.lda_model = models.LdaModel(bow_corpus, num_topics=num_topics, id2word=self.gensim_dict, update_every=5, chunksize=100010, passes=num_passes)
        print "Built LDA in ", round(time()-t_lda,2)," seconds"
    
    def printLdaSummary(self):
        print ""
        print "Topics: Words that define the topic + probability with which word contributes to topic"
        print self.lda_model.show_topics()
        
        print ""
        print "Topics: Just words that define the topic"
        topics_matrix = self.lda_model.show_topics(formatted=False, num_words=20)
        topics_matrix = np.array(topics_matrix)

        topic_words = topics_matrix[:,:,1]
        for i in topic_words:
            print [str(word) for word in i]
            print ""
        
        print ""
        print "thisObject.lda_model.xxxx() to explore further"
    
    def buildDoc2Vec(self, alpha=0.03, min_alpha=0.03, num_passes=10):
        t_dv = time()
        self.doc2vec_model = models.Doc2Vec(alpha=alpha, min_alpha=min_alpha)
        self.doc2vec_model.build_vocab(self.documents)
        for i in range(num_passes):
            self.doc2vec_model.train(self.documents)
        print ""
        print "Built Doc2Vec in ",round(time()-t_dv,2), " seconds"
    
    def printDoc2VecSummary(self):
        print ""
        print "Example: A tweet most similar to '#Xmas' can be found like this ->"
        print self.doc2vec_model.most_similar("#Xmas")
        print ""
        print "thisObject.doc2vec_model.xxxx() to explore further"
    
    def saveModels(self, path_to_lda='cooper.lda', path_to_doc2vec='cooper.doc2vec'):
        self.lda_model.save(path_to_lda)
        self.doc2vec_model.save(path_to_doc2vec)
        print "Saved"
    
    def loadModels(self, path_to_lda='cooper.lda', path_to_doc2vec='cooper.doc2vec'):
        self.lda_model = LdaModel.load(path_to_lda, mmap='r')
        self.doc2vec_model = Doc2Vec.load(path_to_doc2vec, mmap='r')
        print "Loaded"

In [24]:
tte = TweetTopicExtractor(data)
tte.preprocess()

Cleaned and loaded  100001  documents (tweets) in  10.31  seconds


In [25]:
tte.buildDoc2Vec()
tte.printDoc2VecSummary()


Built Doc2Vec in  178.33  seconds

Example: A tweet most similar to '#Xmas' can be found like this ->
[(u'#Glamping', 0.2964997887611389), (u'#ShortStory', 0.28959375619888306), (u'#Workstations', 0.26019182801246643), (u'#Nuneaton', 0.2574554681777954), (u'#retaildesign', 0.25578734278678894), (u'#TRUMP2016', 0.25165659189224243), (u'#EJwtt', 0.2496497929096222), (u'Teens', 0.24932417273521423), (u'#GADGETS', 0.2442663013935089), (u'#planner', 0.23789329826831818)]

thisObject.doc2vec_model.xxxx() to explore further


In [26]:
tte.buildLda()
tte.printLdaSummary()

Built LDA in  1767.8  seconds

Topics: Words that define the topic + probability with which word contributes to topic
[(0, u'0.081*"#Christmas" + 0.011*"Christmas" + 0.009*"The" + 0.007*"#christmas" + 0.007*"time" + 0.006*"#Win" + 0.006*"#giveaway" + 0.005*"year" + 0.004*"It\'s" + 0.004*"new"'), (1, u'0.051*"#Christmas" + 0.045*"#christmas" + 0.019*"#christmascookies" + 0.019*"#christmascrush" + 0.019*"#christmasmatch3" + 0.019*"#santa\'schimney" + 0.019*"#sweeper" + 0.018*"December" + 0.011*"1st" + 0.010*"Christmas"'), (2, u'0.045*"#Christmas" + 0.042*"#christmas" + 0.031*"Christmas" + 0.009*"like" + 0.007*"It\'s" + 0.007*"look" + 0.006*"I" + 0.006*"Days" + 0.006*"lot" + 0.005*"beginning"'), (3, u'0.057*"#Christmas" + 0.023*"#christmas" + 0.010*"Christmas" + 0.005*"via" + 0.005*"The" + 0.004*"gift" + 0.004*"#gifts" + 0.004*"#gift" + 0.004*"#Xmas" + 0.004*"#Gifts"')]

Topics: Just words that define the topic


ValueError: setting an array element with a sequence

In [27]:
tte.saveModels()

Saved
