In [6]:
from gensim import models,corpora,utils
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from nltk.tokenize import word_tokenize 
from collections import defaultdict
from nltk.tokenize.treebank import TreebankWordDetokenizer
import spacy

# Locations of the pipe-delimited tweet dump and of the two report files
# this cell produces.
inputFileName = 'NotebookData/deltaTweets.txt'
outputFileName = 'NotebookData/DeltaTopics.txt'
topicModelName = 'NotebookData/DeltaTopicModel.txt'
numberOfTopics = 10

# The whole input is consumed right here, so its handle can be closed as soon
# as the lines are in memory.
with open(inputFileName, "r") as tweets:
    mainList = tweets.read().splitlines()

# These three handles are read from / written to further down in the script,
# so they are deliberately left open at this point.
stopwords = open('NotebookData/stopwords.txt', "r")
topicOfTweets = open(outputFileName, 'w')
topicModel = open(topicModelName, "w")

# Field 0 of every line is kept as the tweet id, field 1 as the tweet text.
tweetList = []
idList = []
for listItem in mainList:
    fields = listItem.split('|')
    if len(fields) < 2:
        # Blank or malformed line (no '|' separator): there is no text
        # field to index, so skip it instead of raising IndexError.
        continue
    idList.append(fields[0])
    tweetList.append(fields[1])

# NOTE(review): the 'en' shortcut name is not accepted by recent spaCy
# releases -- confirm against the installed version.
nlp = spacy.load('en', disable=['parser', 'ner'])

def sent_to_words(sentences):
    """Lazily tokenize every item of *sentences*.

    Each item is converted with ``str()`` and handed to gensim's
    ``simple_preprocess``; one token list is yielded per item.
    ``deacc=True`` asks gensim to strip accent marks from the tokens.
    """
    for item in sentences:
        tokens = utils.simple_preprocess(str(item), deacc=True)
        yield tokens


def make_bigrams(texts):
    """Return a list with ``bigram_mod[doc]`` for every doc in *texts*.

    Reads the module-level ``bigram_mod``, which must be defined before
    this function is called.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Return a list with ``trigram_mod[bigram_mod[doc]]`` for every doc in *texts*.

    Reads the module-level ``bigram_mod`` and ``trigram_mod``, which must
    both be defined before this function is called.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged



# Remove stopwords
# `with` closes the stop-word file once its lines have been read.
with stopwords:
    stoplist = stopwords.read().splitlines()
# Membership is tested once per word of every tweet, so look words up in a
# set rather than scanning the list each time.
stopwordSet = set(stoplist)
texts = [
    [word for word in document.lower().split() if word not in stopwordSet]
    for document in tweetList
]

# sent_to_words() calls str() on each item. Handing it the token lists
# directly would tokenize the *repr* of a list (brackets, quotes, backslash
# escapes), so the tokens are joined back into a plain string first.
tweetList = list(sent_to_words(' '.join(tokens) for tokens in texts))



# Build the bigram and trigram models
bigram = models.Phrases(tweetList, min_count=5, threshold=100)  # higher threshold fewer phrases.
trigram = models.Phrases(bigram[tweetList], threshold=100)

bigram_mod = models.phrases.Phraser(bigram)
trigram_mod = models.phrases.Phraser(trigram)

# NOTE(review): the phrase models above are trained on `tweetList` (output of
# sent_to_words) but applied here to `texts` (lower-cased, whitespace-split
# tokens). Confirm this is intended before changing it; it affects every
# downstream result.
texts = make_bigrams(texts)

# The spaCy pipeline is already loaded into `nlp` earlier in this script and
# is not used here, so the second, identical spacy.load() call was dropped.


# Tally how often each token occurs across all documents
frequency = defaultdict(int)
for token in (word for document in texts for word in document):
    frequency[token] += 1

# Keep only the tokens that were seen at least twice
texts = [
    list(filter(lambda word: frequency[word] >= 2, document))
    for document in texts
]

# Build the dictionary and the bag-of-words corpus
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Create the tf-idf model of the corpus
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Fit Latent Dirichlet Allocation on the tf-idf weighted corpus.
# NOTE: despite its name, `lsi_model` holds an LdaModel; the name is kept
# because later code refers to it.
lsi_model = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=numberOfTopics, per_word_topics=True)

# Compute the topic summary once, then both persist and display it. `with`
# closes the report file so the dump is actually flushed to disk.
topicSummary = lsi_model.print_topics()
with topicModel:
    topicModel.write(str(topicSummary))
print(topicSummary)
# Loop through the texts, query the best-fitting topic for each one, and
# write one "id|text|topic" row per tweet.
# NOTE(review): the model was fitted on the tf-idf corpus but is queried here
# with raw bag-of-words counts -- confirm this is intended.
detokenizer = TreebankWordDetokenizer()  # one instance serves every tweet

# `with` closes the output file (and flushes it) once all rows are written.
with topicOfTweets:
    for tweetId, text in zip(idList, texts):
        if not text:
            # Nothing left of this tweet after filtering: no row is written.
            continue
        # Ask for the document-topic distribution only. Indexing the model
        # directly would also return the per-word topic data (the model was
        # built with per_word_topics=True), which is not needed here.
        topicScores = lsi_model.get_document_topics(dictionary.doc2bow(text))
        if not topicScores:
            continue
        # max() keeps the first of several equally scored topics.
        bestTopic, _bestScore = max(topicScores, key=lambda pair: pair[1])
        topicOfTweets.write(tweetId + '|' + detokenizer.detokenize(text) + '|' + str(bestTopic) + '\n')

print('finished')
    




[(0, '0.009*"flight" + 0.005*"need" + 0.005*"thank" + 0.005*"best" + 0.005*"great" + 0.004*"i’m" + 0.004*"customer" + 0.004*"care" + 0.004*"help" + 0.004*"call"'), (1, '0.005*"destination" + 0.005*"flight" + 0.005*"-" + 0.005*"mechanical" + 0.004*"booked" + 0.004*"figure" + 0.004*"status" + 0.004*"please" + 0.004*"hey" + 0.003*"i\'m"'), (2, '0.008*"flight" + 0.007*"need" + 0.007*"help" + 0.005*"one" + 0.005*"u" + 0.005*"follow" + 0.004*"thank" + 0.004*"please" + 0.004*"get" + 0.004*"thanks"'), (3, '0.006*"hello" + 0.006*"flight" + 0.005*"app" + 0.005*"delta" + 0.004*"missed" + 0.003*"customer" + 0.003*"-" + 0.003*"service" + 0.003*"connection" + 0.003*"working"'), (4, '0.007*"flight" + 0.005*"get" + 0.004*"2" + 0.004*"hour" + 0.004*"call" + 0.004*"@" + 0.004*"seat" + 0.004*"change" + 0.004*"going" + 0.004*"help"'), (5, '0.009*"flight" + 0.005*"dl" + 0.005*"delta" + 0.004*"thanks" + 0.004*"check" + 0.003*"-" + 0.003*"lost" + 0.003*"months" + 0.003*"bag" + 0.003*"emergency"'), (6, '0.007