In [5]:
from gensim import models,corpora,utils
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from nltk.tokenize import word_tokenize 
from collections import defaultdict
from nltk.tokenize.treebank import TreebankWordDetokenizer
import spacy

# Paths and model size used by the rest of the script.
inputFileName = 'NotebookData/deltaTweets.txt'
outputFileName = 'NotebookData/UberTopics.txt'
topicModelName = 'NotebookData/UberTopicModel.txt'
numberOfTopics = 20

# The input is read in one go, so its handle can be closed immediately
# instead of staying open for the lifetime of the process.
with open(inputFileName, "r") as tweets:
    mainList = tweets.read().splitlines()

# These three handles are consumed further down the script (stopword list,
# per-tweet topic output, topic-model dump), so they are left open here.
stopwords = open('stopwords.txt', "r")
topicOfTweets = open(outputFileName, 'w')
topicModel = open(topicModelName, "w")

# Each input line is split on '|': field 0 goes to idList, field 1 to
# tweetList; any further fields are ignored. The two lists stay index-aligned.
tweetList = []
idList = []
for listItem in mainList:
    fields = listItem.split('|')
    if len(fields) < 2:
        # Blank line or line without a separator: there is no second field
        # to take, so skip it rather than abort the run with an IndexError.
        continue
    idList.append(fields[0])
    tweetList.append(fields[1])

# NOTE(review): nlp is not referenced anywhere in the visible part of this
# script -- confirm whether a later cell needs it before removing the load.
nlp = spacy.load('en', disable=['parser', 'ner'])

def sent_to_words(sentences):
    """Lazily tokenise every item of *sentences*.

    Each item is converted with ``str()`` and handed to
    ``utils.simple_preprocess`` with ``deacc=True`` (accent marks are
    stripped from the tokens). One result is yielded per input item, in
    input order; nothing is processed until the generator is iterated.
    """
    for raw_item in sentences:
        as_text = str(raw_item)
        tokens = utils.simple_preprocess(as_text, deacc=True)
        yield tokens


def make_bigrams(texts):
    """Return a list holding ``bigram_mod[doc]`` for every doc in *texts*.

    Relies on the module-level ``bigram_mod`` object, which must exist by
    the time this function is called with a non-empty input.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def make_trigrams(texts):
    """Return a list holding ``trigram_mod[bigram_mod[doc]]`` per doc.

    Each document is first passed through the module-level ``bigram_mod``
    and the result is then passed through the module-level ``trigram_mod``;
    both must exist by the time this is called with a non-empty input.
    """
    phrased_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        phrased_docs.append(trigram_mod[with_bigrams])
    return phrased_docs



# Remove stopwords
# Reading inside `with` closes the stopword file as soon as its contents are
# in memory.
with stopwords:
    stoplist = stopwords.read().splitlines()
# A set gives constant-time membership tests; the list is checked once per
# word of every tweet below.
stopSet = set(stoplist)
texts = [
    [word for word in document.lower().split() if word not in stopSet]
    for document in tweetList
]

# sent_to_words() calls str() on each item, so hand it real strings: passing
# the token lists themselves would tokenise their repr(), where non-printable
# characters show up as escape sequences and produce junk tokens.
tweetList = list(sent_to_words(' '.join(tokens) for tokens in texts))


# Build the bigram and trigram models
bigram = models.Phrases(tweetList, min_count=5, threshold=100)  # higher threshold fewer phrases.
trigram = models.Phrases(bigram[tweetList], threshold=100)

bigram_mod = models.phrases.Phraser(bigram)
trigram_mod = models.phrases.Phraser(trigram)

# NOTE(review): the phrase models are trained on the simple_preprocess tokens
# in tweetList but applied to the whitespace-split tokens in texts -- confirm
# this is intended.
texts = make_bigrams(texts)

# Count frequency of each word
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Remove words that only appear once
texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

# Build dictionary and bag-of-words corpus
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Create tf-idf model of corpus
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Fit Latent Dirichlet Allocation on the tf-idf weighted corpus.
# (The variable keeps its original name `lsi_model`, but it is an LdaModel.)
lsi_model = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=numberOfTopics, per_word_topics=True)

# Compute the topic summary once and reuse it for both the file and stdout;
# leaving the `with` block flushes and closes the topic-model file.
topicSummary = lsi_model.print_topics()
with topicModel:
    topicModel.write(str(topicSummary))
print(topicSummary)

# Loop through texts and query the most fitting topic for each.
matchedTopics = []  # never filled in the visible code; kept so the name stays defined
detokenizer = TreebankWordDetokenizer()
# Leaving the `with` block flushes and closes the per-tweet output file, even
# if an error interrupts the loop.
with topicOfTweets:
    for tweetId, text in zip(idList, texts):
        if not text:
            # No tokens survived filtering: no topic is assigned and no line
            # is written for this tweet.
            continue
        # Ask only for the (topic id, probability) pairs. Indexing the model
        # directly would also return the per-word data enabled by
        # per_word_topics=True, which is not needed to pick a topic.
        # Inference is run on the raw bag-of-words vector, as before.
        topicScores = lsi_model.get_document_topics(dictionary.doc2bow(text))
        if not topicScores:
            continue
        # max() keeps the first of several equally scored topics.
        greatestMatchedTopicNumber, greatestTopicMatch = max(
            topicScores, key=lambda pair: pair[1]
        )
        detokenizedText = detokenizer.detokenize(text)
        topicOfTweets.write(
            '|'.join((tweetId, detokenizedText, str(greatestMatchedTopicNumber))) + '\n'
        )


    




[(0, '0.007*"flight" + 0.007*"phone" + 0.006*"customer" + 0.006*"call" + 0.006*"suck" + 0.006*"worst" + 0.006*"1st" + 0.006*"service" + 0.005*"ever" + 0.005*"don\'t"'), (1, '0.011*"help" + 0.010*"flight?" + 0.008*"flight" + 0.008*"u" + 0.007*"fare" + 0.006*"love" + 0.006*"use" + 0.006*"request" + 0.005*"change" + 0.005*"time"'), (2, '0.012*"need" + 0.008*"help" + 0.008*"flight" + 0.007*"tell" + 0.007*"excited" + 0.006*"plane" + 0.006*"least" + 0.006*"get" + 0.006*"still" + 0.005*"would"'), (3, '0.006*"i️" + 0.005*"direct" + 0.005*"sit" + 0.005*"live" + 0.005*"flight" + 0.005*"-" + 0.005*"flying" + 0.005*"better" + 0.005*"mind" + 0.004*"allowed"'), (4, '0.012*"customer" + 0.012*"thank" + 0.011*"best" + 0.010*"service" + 0.007*"flight" + 0.007*"guys" + 0.006*"need" + 0.006*"what\'s" + 0.005*"bag" + 0.005*"please"'), (5, '0.007*"hour" + 0.007*"flight" + 0.007*"hold" + 0.006*"dl" + 0.006*"economy" + 0.006*"hr" + 0.006*"delay" + 0.005*"get" + 0.005*"2" + 0.005*"call"'), (6, '0.013*"hello" +