# Latent Dirilechlet Allocation

 Latent Dirichlet allocation (LDA), perhaps the most common topic model currently in use. Topic modeling is a type of statistical modeling for discovering the main topics in a collection of documents. The number of topics could be analyzed similar as a number of clusters.

### Data acquisition

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import string
import nltk                                  
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
import gensim
import os
import pyLDAvis.gensim
import pickle 
import pyLDAvis

In [2]:
dataset= pd.read_csv('Tweets.csv', sep=',')
dataset.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [3]:
tweet_df = dataset[dataset['airline_sentiment'] != 'neutral'] #Removing the tweets associated with neutral reviews
tweet_df=tweet_df[['text','airline_sentiment']]
tweet_df.head()

Unnamed: 0,text,airline_sentiment
1,@VirginAmerica plus you've added commercials t...,positive
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative
5,@VirginAmerica seriously would pay $30 a fligh...,negative
6,"@VirginAmerica yes, nearly every time I fly VX...",positive


### Preprocessing

In [4]:
tweet = tweet_df.text.to_list()

In [5]:
def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
    
    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

In [6]:
process_tweet(tweet[0])

['plu', 'ad', 'commerci', 'experi', '...', 'tacki']

In [7]:
text_data = []
for i in range(len(tweet)):
    text_data.append(process_tweet(tweet[i]))

In [8]:
text_data[0:3]

[['plu', 'ad', 'commerci', 'experi', '...', 'tacki'],
 ['realli',
  'aggress',
  'blast',
  'obnoxi',
  'entertain',
  'guest',
  'face',
  'littl',
  'recours'],
 ['realli', 'big', 'bad', 'thing']]

In [9]:
dictionary = gensim.corpora.Dictionary(text_data)

#### Bag of words -Gensim doc2bow


Filtering out the tokens that appear in less than 15 documents or in more of 0.5 of the documents. Keeping just the more freqient 100.000 tokens.

In [10]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [11]:
bow_corpus = [dictionary.doc2bow(doc) for doc in text_data]
bow_corpus[4310]

[(12, 1),
 (125, 1),
 (210, 1),
 (229, 1),
 (240, 1),
 (247, 1),
 (256, 1),
 (258, 1),
 (280, 1),
 (333, 1),
 (341, 2),
 (423, 1),
 (607, 1)]

In [12]:
bow_doc_4310 = bow_corpus[2210]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 11 ("fli") appears 1 time.
Word 44 ("think") appears 1 time.
Word 135 ("problem") appears 1 time.
Word 151 ("end") appears 1 time.
Word 159 ("airlin") appears 1 time.
Word 208 ("like") appears 1 time.
Word 476 ("continu") appears 1 time.
Word 533 ("resolut") appears 1 time.
Word 979 ("especi") appears 1 time.
Word 981 ("decid") appears 1 time.


### Topic Modeling

In [13]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=15)
for idx, topic in lda_model.print_topics(num_words=4):
    print('Topic: {} Words: {}'.format(idx, topic))

Topic: 0 Words: 0.076*"thank" + 0.018*"fli" + 0.016*"servic" + 0.016*"great"
Topic: 1 Words: 0.027*"plane" + 0.024*"bag" + 0.023*"time" + 0.020*"hour"
Topic: 2 Words: 0.038*"get" + 0.033*"call" + 0.031*"hold" + 0.030*"help"
Topic: 3 Words: 0.081*"flight" + 0.021*"go" + 0.021*"delay" + 0.016*"miss"
Topic: 4 Words: 0.101*"flight" + 0.063*"cancel" + 0.032*"custom" + 0.032*"flightl"


#### Checking the topic for one specific document

In [14]:
for index, score in sorted(lda_model[bow_corpus[2210]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9261821508407593	 
Topic: 0.076*"thank" + 0.018*"fli" + 0.016*"servic" + 0.016*"great" + 0.014*"custom" + 0.013*"guy" + 0.011*"airlin" + 0.011*"love" + 0.011*"flight" + 0.011*"much"

Score: 0.01867743395268917	 
Topic: 0.101*"flight" + 0.063*"cancel" + 0.032*"custom" + 0.032*"flightl" + 0.030*"servic" + 0.014*"airlin" + 0.012*"weather" + 0.010*"delay" + 0.009*"worst" + 0.008*"today"

Score: 0.01841619983315468	 
Topic: 0.081*"flight" + 0.021*"go" + 0.021*"delay" + 0.016*"miss" + 0.015*"late" + 0.014*"get" + 0.014*"i'm" + 0.013*"connect" + 0.011*"day" + 0.011*"seat"

Score: 0.01839832216501236	 
Topic: 0.027*"plane" + 0.024*"bag" + 0.023*"time" + 0.020*"hour" + 0.019*"gate" + 0.019*"wait" + 0.018*"flight" + 0.018*"delay" + 0.014*"..." + 0.014*"us"

Score: 0.018325895071029663	 
Topic: 0.038*"get" + 0.033*"call" + 0.031*"hold" + 0.030*"help" + 0.028*"hour" + 0.021*"flight" + 0.020*"2" + 0.019*"can't" + 0.019*"phone" + 0.017*"tri"


Comparing the scores, this document is highly associated with the first topic.

#### Obtaining the score for a new document

In [15]:
unseen_document = 'Waiting for the flight was a terrible experience.'

bow_vector = dictionary.doc2bow(process_tweet(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.8341799378395081	 Topic: 0.101*"flight" + 0.063*"cancel" + 0.032*"custom" + 0.032*"flightl" + 0.030*"servic"
Score: 0.04300623759627342	 Topic: 0.027*"plane" + 0.024*"bag" + 0.023*"time" + 0.020*"hour" + 0.019*"gate"
Score: 0.0418250635266304	 Topic: 0.038*"get" + 0.033*"call" + 0.031*"hold" + 0.030*"help" + 0.028*"hour"
Score: 0.040550291538238525	 Topic: 0.076*"thank" + 0.018*"fli" + 0.016*"servic" + 0.016*"great" + 0.014*"custom"
Score: 0.04043850302696228	 Topic: 0.081*"flight" + 0.021*"go" + 0.021*"delay" + 0.016*"miss" + 0.015*"late"


### Topic Visualization

In order to have a better understanding regarding the relationships between the topics.

In [16]:
# Visualize the topics
num_topics=5
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('ldavis_prepared_'+str(num_topics))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, 'ldavis_prepared_'+ str(num_topics) +'.html')
LDAvis_prepared

## References:
* https://medium.com/@lettier/how-does-lda-work-ill-explain-using-emoji-108abf40fa7d
* https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
* https://github.com/AprendizajeProfundo/Diplomado/blob/master/Temas/Módulo%208-%20Aprendizaje%20Profundo%20II/1.%20Procesamiento%20de%20Lenguaje%20natural/Cuadernos/nlp_Introduccion.ipynb
* https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0