## Topic modelling
Based on: https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

In [1]:
import pandas as pd

In [2]:
# Load the tweets CSV (comma-separated, UTF-8) into the DataFrame used by every later cell.
df = pd.read_csv(
    "../data/tweets_public.csv",
    sep=",",
    encoding="utf-8",
)

In [None]:
# do not run
# Optional regex/stopword clean-up that adds the columns `text_2` and `text_3` to `df`.
import re  # needed by re.sub below; it was previously missing, so the cell raised NameError

from nltk.corpus import stopwords

stop = stopwords.words('english')
# Set copy of the stopword list so each membership test is O(1) instead of a list scan.
stop_lookup = set(stop)

df["text"] = df["text"].apply(lambda x: x.lower()) # get lower
# No separate whitespace normalisation is needed: the regex below replaces every run of
# non-letter characters (whitespace included) with a single space, and the split()/join()
# in the stopword step drops any leading or trailing space.
df["text_2"] = df["text"].apply(lambda x: re.sub("[^a-zA-Z]+", " ", x)) # remove everything not alphabetical characters
df["text_3"] = \
    df['text_2'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_lookup])) # remove stopwords

In [9]:
import spacy
from spacy.lang.en import English

# Only tokenisation is needed, so build the blank English pipeline directly.
# The earlier `spacy.load('en')` call loaded a full model and discarded the result;
# the 'en' shortcut name is also unavailable in spaCy v3+, where that call raises.
parser = English()

def tokenize(text):
    """Split `text` into normalised tokens using the module-level `parser`.

    Whitespace-only tokens are dropped, URL-like tokens become 'URL',
    tokens beginning with '@' become 'SCREEN_NAME', and every other token
    is emitted in lower case.

    Returns a list of strings in document order.
    """
    lda_tokens = []
    for tok in parser(text):
        surface = tok.orth_
        if surface.isspace():
            # Pure whitespace carries no information for the topic model.
            continue
        if tok.like_url:
            normalised = 'URL'
        elif surface.startswith('@'):
            normalised = 'SCREEN_NAME'
        else:
            normalised = tok.lower_
        lda_tokens.append(normalised)
    return lda_tokens

In [7]:
import nltk
# Make sure the WordNet corpus is available locally before the lemma helpers below use it.
nltk.download('wordnet')

from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the `wn.morphy` base form of `word`, or `word` itself when morphy yields None."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatise `word` with a freshly constructed `WordNetLemmatizer`."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to /home/borbota/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
# Fetch the stopword corpus, then keep the English entries in a set for fast membership tests.
nltk.download('stopwords')
en_stop = {stopword for stopword in nltk.corpus.stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/borbota/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
def prepare_text_for_lda(text):
    """Turn raw `text` into the list of lemmas fed to the topic model.

    The text is tokenised with `tokenize`; tokens of four characters or
    fewer and tokens found in `en_stop` are discarded; each survivor is
    mapped through `get_lemma`.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in en_stop
    ]

In [12]:
import random

# Share of tweets kept for the LDA corpus, and the seed that makes the sample repeatable.
SAMPLE_FRACTION = 0.01
SAMPLE_SEED = 42

# A private generator keeps the sample reproducible without touching the global `random` state.
sampler = random.Random(SAMPLE_SEED)

text_data = []
for line in df["text"]:
    # Draw first, so only the sampled tweets pay the cost of tokenisation and lemmatisation.
    if sampler.random() < SAMPLE_FRACTION:
        text_data.append(prepare_text_for_lda(line))

['SCREEN_NAME', 'learn', 'flight', 'cancel', "flightled.can't", 'phone', 'option', 'assistance', 'online', 'suggestion']
['SCREEN_NAME', 'price']
['SCREEN_NAME', 'flight', 'worst', 'experience', 'flight', 'regret', 'opening', 'mile', 'hotterandlongerthanhell']
['SCREEN_NAME', 'thank']
['SCREEN_NAME', 'complete', 'respect', 'call', 'daughter']
['SCREEN_NAME', 'understand', 'still', 'reservation', 'online']
['SCREEN_NAME', 'hours', 'already']
['SCREEN_NAME', 'earlier', 'flight', 'sap-&gt;mia', '03.03', 'currently', 'book', 'flight']
['SCREEN_NAME', 'course', 'airport', 'looking', 'claim', 'still']
['.@americanair', 'phone', 'reservation', 'system', 'automatically', 'disconnect']
['SCREEN_NAME', 'turn', 'thanks', 'useless', 'centre', 'useless', 'website']
['SCREEN_NAME', 'today', 'issue', 'clear', 'beautifully', 'thank', 'check']
['SCREEN_NAME', 'final', 'connection', 'leave', 'npbhd0']
['SCREEN_NAME', 'thnkx', 'charge', 'neveragain']
['SCREEN_NAME', 'flight', 'tomorrow', 'cancel', 'fligh

In [15]:
from gensim import corpora
# Map every token in the sampled documents to an integer id, then encode each
# document as a bag-of-words list via `doc2bow`.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

import pickle
# Persist both artefacts so later cells can reload them. The `with` block guarantees the
# file handle is flushed and closed, which the bare `open(...)` argument did not.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [24]:
import gensim
NUM_TOPICS = 5
LDA_PASSES = 15
# Fixed seed so repeated runs over the same corpus produce the same topics.
LDA_RANDOM_STATE = 42

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=LDA_PASSES,
    random_state=LDA_RANDOM_STATE,
)
ldamodel.save('model5.gensim')

# Show the seven highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=7)
for topic in topics:
    print(topic)

(0, '0.068*"SCREEN_NAME" + 0.030*"flight" + 0.011*"plane" + 0.011*"program" + 0.011*"charge" + 0.011*"phone" + 0.011*"flightled"')
(1, '0.038*"SCREEN_NAME" + 0.014*"flying" + 0.014*"times" + 0.014*"call" + 0.014*"tomorrow" + 0.014*"rebook" + 0.014*"taking"')
(2, '0.109*"SCREEN_NAME" + 0.028*"flight" + 0.026*"cancel" + 0.021*"still" + 0.016*"flying" + 0.016*"denver" + 0.016*"today"')
(3, '0.121*"SCREEN_NAME" + 0.038*"flight" + 0.026*"service" + 0.014*"airline" + 0.012*"customer" + 0.012*"flyer" + 0.012*"bring"')
(4, '0.150*"SCREEN_NAME" + 0.021*"flight" + 0.019*"delay" + 0.016*"thank" + 0.016*"phone" + 0.016*"try" + 0.012*"waiting"')


In [22]:
# Reload the artefacts saved by the earlier cells so the visualisation uses the persisted state.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# NOTE: pickle.load can execute arbitrary code; only load a corpus.pkl written by this notebook.
# The `with` block closes the file handle, which the bare `open(...)` argument did not.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
# Keep this call as the cell's final expression so the notebook renders the result.
pyLDAvis.display(lda_display)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
