In [8]:
from gensim import models,corpora,utils
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from nltk.tokenize import word_tokenize 
from collections import defaultdict
from nltk.tokenize.treebank import TreebankWordDetokenizer

inputFileName = 'NotebookData/deltaTweets.txt'
outputFileName = 'NotebookData/DeltaTopics.txt'
topicModelName = 'NotebookData/DeltaTopicModel.txt'
numberOfTopics = 10

tweets = open(inputFileName,"r",encoding="utf-8")
stopwords = open('NotebookData/stopwords.txt',"r",encoding="utf-8")
topicOfTweets = open(outputFileName,'w',encoding="utf-8")
topicModel = open(topicModelName,"w",encoding="utf-8")

mainList = tweets.read().splitlines()
tweetList = []
idList = []
for listItem in mainList:
    tweetList.append(listItem.split('|')[1])
    idList.append(listItem.split('|')[0])



def sent_to_words(sentences):
    for sentence in sentences:
        yield(utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations


def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]



#Remove Stopwords
stoplist = stopwords.read().splitlines() 
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in tweetList
]

tweetList = list(sent_to_words(texts))



# Build the bigram and trigram models
bigram = models.Phrases(tweetList, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = models.Phrases(bigram[tweetList], threshold=100)  

bigram_mod = models.phrases.Phraser(bigram)
trigram_mod = models.phrases.Phraser(trigram)




texts = make_bigrams(texts)




#Count frequency of each word
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

#Remove words that only appear once
texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

#build dictionary and corpus
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

#create tfidf model of corpus
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

#create latent  Latent Dirichlet Allocation of tfidf model of corpus
lsi_model = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=numberOfTopics,per_word_topics=True) 

topicModel.write(str(lsi_model.print_topics()))
print(lsi_model.print_topics())
#loop through texts and query most fitting topic for each
matchedTopics = []
count = 0
for text in texts:
    detokenizedText = TreebankWordDetokenizer().detokenize(text)
    results = lsi_model[dictionary.doc2bow(text)]
    greatestMatchedTopicNumber = -1
    greatestTopicMatch = float(-1)
    for result in results:
        for element in result:
            index,score = element
            dog = str(score)
            if (type(score) is not list) and float(dog) > greatestTopicMatch and text:
                greatestMatchedTopicNumber = index
                greatestTopicMatch = score
    if(greatestTopicMatch > -1):
        topicOfTweets.write(idList[count]+'|'+ detokenizedText + '|' + str(greatestMatchedTopicNumber)+ '\n')
    count += 1

print('finished')
    




[(0, '0.008*"flight" + 0.005*"customer" + 0.005*"get" + 0.004*"time" + 0.004*"help" + 0.004*"sitting" + 0.003*"service" + 0.003*"2" + 0.003*"hour" + 0.003*"-"'), (1, '0.009*"help" + 0.006*"need" + 0.006*"flight" + 0.004*"got" + 0.004*"due" + 0.004*"take" + 0.004*"bag" + 0.004*"long" + 0.004*"flights" + 0.004*"change"'), (2, '0.006*"dm" + 0.006*"flight" + 0.005*"please" + 0.005*"delayed" + 0.005*"flight." + 0.004*"thank" + 0.004*"help" + 0.004*"customer" + 0.004*"need" + 0.004*"great"'), (3, '0.010*"flight" + 0.005*"delta" + 0.005*"thanks" + 0.004*"need" + 0.004*"get" + 0.003*"want" + 0.003*"i️" + 0.003*"upgrade" + 0.003*"thank" + 0.003*"help"'), (4, '0.008*"flight" + 0.005*"please" + 0.004*"get" + 0.004*"service" + 0.004*"hello" + 0.004*"great" + 0.004*"help?" + 0.004*"delta" + 0.004*"gate" + 0.004*"help"'), (5, '0.007*"flight" + 0.007*"delay" + 0.005*"free" + 0.004*"miles" + 0.004*"booked" + 0.004*"need" + 0.004*"san" + 0.004*"thanks" + 0.004*"getting" + 0.004*"time"'), (6, '0.007*"fl