In [1]:
#data manipulation
import numpy as np
import pandas as pd
import tqdm
#file and system operations
import os
import sys
assert sys.version_info >= (3,5)
#visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#consistent sized plots
from pylab import rcParams
rcParams['figure.figsize']=12,5
rcParams['axes.labelsize']=12
rcParams['ytick.labelsize']=12
rcParams['xtick.labelsize']=12
#handle unwanted warnings 
import warnings
warnings.filterwarnings(action='ignore',category=DeprecationWarning)
warnings.filterwarnings(action='ignore',category=FutureWarning)
#view all the columns
pd.options.display.max_columns = None
#basic text manipulation libraries
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer


import gensim
#plotting tools
import pyLDAvis
#import pyLDAvis.gensim #dont skip this
import matplotlib.pyplot as plt
%matplotlib inline

  from imp import reload


Load Data


In [2]:
neg_reviews = pd.read_csv('data/neg_reviews.csv')
neg_reviews.head()


Unnamed: 0,reviews,tokens,sentiment
0,BA cancelled my flight home to Heathrow on Dec...,"['cancelled', 'home', 'heathrow', 'dec', '19th...",0
1,"BA cancelled my flight home, the last flight o...","['cancelled', 'home', 'last', 'day', 'heathrow...",0
2,"Turned up 3.5 hours in advance, Terminal 5 at ...","['turned', '3.5', 'hours', 'advance', 'termina...",0
3,Boarding – at gate at LGW they called Group 1 ...,"['boarding', '–', 'gate', 'lgw', 'called', 'gr...",0
4,Missing baggage customer service was the worst...,"['missing', 'baggage', 'customer', 'service', ...",0


In [3]:
neg_reviews.shape

(1488, 3)

In [4]:
'''
Write a function to proprocess the entire dataset 
'''
stemmer = SnowballStemmer('english')

def lemmatize_stemming(text):
    '''This function will lemmatize on Noun POS and stem the text'''
    return stemmer.stem(WordNetLemmatizer().lemmatize(text,pos='n'))
    #return (WordNetLemmatizer().lemmatize(text,pos='n'))

#tokenize and lemmatize
def preprocess(text):
    '''Function to break into word tokens, remove stopwords, remove short words and finally to lemmatize and stem the individual tokens'''
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [5]:
#check for a sample review
result = preprocess('The worst camera I have ever seen. Even my very old configuration mobile phone had a better camera resolution. Battery draining faster.')
print(result)

['worst', 'camera', 'seen', 'configur', 'mobil', 'phone', 'better', 'camera', 'resolut', 'batteri', 'drain', 'faster']


In [6]:
processed_docs = []

#uncomment below line to find the topics for a particular sentiment
#reviews = reviews[reviews['sentiment']==1]

for doc in neg_reviews['reviews']:
    processed_docs.append(preprocess(doc))

In [7]:
'''
Preview the processed documents
'''
print(processed_docs[:10])

[['cancel', 'flight', 'home', 'heathrow', 'face', 'sensibl', 'weather', 'iceland', 'appal', 'accept', 'flight', 'cancel', 'hour', 'time', 'time', 'struggl', 'dread', 'drive', 'condit', 'airport', 'near', 'condit', 'taken', 'advic', 'spare', 'need', 'travel', 'road', 'end', 'close', 'book', 'flight', 'downgrad', 'sin', 'have', 'check', 'today', 'type', 'cancel', 'flight', 'despit', 'road', 'keflavík', 'close', 'check', 'thing', 'help', 'offer'], ['cancel', 'flight', 'home', 'flight', 'heathrow', 'tri', 'push', 'flight', 'london', 'citi', 'hour', 'later', 'heathrow', 'want', 'cross', 'london', 'late', 'night', 'public', 'transport', 'luggag', 'especi', 'rail', 'strike', 'announc', 'time', 'book', 'flight', 'home', 'lufthansa', 'cost', 'economi', 'seat', 'home', 'heathrow', 'termin', 'friday', 'night', 'home', 'famili', 'news', 'refund', 'month', 'shame', 'fli', 'pleasur', 'experi', 'cancel', 'move', 'journey', 'book', 'year', 'regrett', 'airlin', 'better', 'reliabl', 'custom', 'servic'],

### bag of words on the dataset

In [8]:
'''
Create a dictionary of the words which appear in the entire corpus
'''
dictionary = gensim.corpora.Dictionary(processed_docs)

In [9]:
dictionary.keys()[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [10]:
#print a few words in the dictionary
count = 0
for k,v in dictionary.iteritems():
    print(k,v)
    count = count + 1
    if count > 10:
        break

0 accept
1 advic
2 airport
3 appal
4 book
5 cancel
6 check
7 close
8 condit
9 despit
10 downgrad



Gensim filter extremes

    Remove or filter the words that appear less than nobelow
    Remove or filter the words that apepar more than noabove (fraction)
    After the above two steps keep only the n most frequent tokens or keep all



In [11]:
dictionary.filter_extremes(no_below=5,no_above=0.1,keep_n=None)


Gensim doc2bow

    Create a bag of words for each document ie for each document we create a dictionary reporting how many words and how many times those words appear



In [12]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [13]:
bow_corpus[10:20]

[[(23, 1), (288, 1), (298, 1), (299, 1), (300, 1)],
 [(71, 1),
  (257, 1),
  (258, 1),
  (301, 1),
  (302, 1),
  (303, 1),
  (304, 1),
  (305, 1),
  (306, 1),
  (307, 1),
  (308, 1)],
 [(50, 3),
  (64, 1),
  (66, 3),
  (67, 1),
  (103, 1),
  (111, 1),
  (116, 1),
  (119, 1),
  (123, 1),
  (173, 1),
  (212, 1),
  (223, 1),
  (266, 1),
  (309, 1),
  (310, 1),
  (311, 1),
  (312, 2),
  (313, 1),
  (314, 1),
  (315, 1),
  (316, 1),
  (317, 1),
  (318, 1),
  (319, 1),
  (320, 1),
  (321, 1),
  (322, 2),
  (323, 4),
  (324, 1),
  (325, 2),
  (326, 1),
  (327, 1),
  (328, 1),
  (329, 1),
  (330, 1),
  (331, 1),
  (332, 1),
  (333, 5)],
 [(16, 1),
  (91, 1),
  (125, 1),
  (144, 2),
  (165, 1),
  (182, 2),
  (191, 1),
  (203, 1),
  (244, 1),
  (290, 3),
  (334, 1),
  (335, 1),
  (336, 2),
  (337, 1),
  (338, 1),
  (339, 1),
  (340, 1),
  (341, 1),
  (342, 1),
  (343, 1),
  (344, 1),
  (345, 1),
  (346, 1)],
 [(68, 1),
  (271, 2),
  (273, 1),
  (347, 1),
  (348, 1),
  (349, 1),
  (350, 2),
  (35

In [14]:
random = np.random.randint(1,len(neg_reviews))
print(neg_reviews['reviews'][random])

London Heathrow to Cape Town with British Airways. Paid extra to book 2 particular seats. Both were broken. I had to sit upright for 11+hours and my husband had to sit partially reclined to eat his meals. I had veggie meal which was served half hour before everyone else - no drink until everyone else served. Choice of meal for husband was chicken curry or chicken curry. He doesn't like curry so it was whispered to him that he could have a veggie meal. Same at breakfast - served half hour early so had to eat alone. When other breakfasts came out, guess what no meat. On complaining to BA Customer Services, told could choose another meal if we had gone on line and paid extra. Been offered the insulting and derisory compensation of £20 off our next flight (as if there will be a next flight) - not even offered a refund of payment we made for booking broken seat. Both airline and customer service disgraceful.


In [15]:
document_num = random
bow_doc_x = bow_corpus[random]

In [16]:
for i in range(len(bow_doc_x)):
    print(f'Word {bow_doc_x[i][0]} {dictionary[bow_doc_x[i][0]]} appears {bow_doc_x[i][1]} times')

Word 36 refund appears 1 times
Word 49 compens appears 1 times
Word 59 gone appears 1 times
Word 64 line appears 1 times
Word 94 came appears 1 times
Word 108 half appears 2 times
Word 215 particular appears 1 times
Word 256 choos appears 1 times
Word 359 reclin appears 1 times
Word 387 cape appears 1 times
Word 411 town appears 1 times
Word 445 disgrac appears 1 times
Word 573 broken appears 2 times
Word 747 complain appears 1 times
Word 749 husband appears 2 times
Word 938 guess appears 1 times
Word 1036 earli appears 1 times
Word 1062 payment appears 1 times
Word 1185 chicken appears 2 times
Word 1524 insult appears 1 times
Word 1737 meat appears 1 times
Word 1793 upright appears 1 times
Word 1856 curri appears 3 times
Word 1879 veggi appears 2 times


Running LDA using Bag of Words

In [17]:
#apply the gensim LDA model and generate 12 topics from the corpus
seed = 41
lda_model = gensim.models.LdaMulticore(corpus=bow_corpus,num_topics=12,id2word=dictionary,passes=10,workers=2,
                                      random_state=seed,minimum_probability=0.05,alpha='symmetric')

In [18]:
'''
For each topic, explore each word and its relative weight in the topic
'''

for idx,topic in lda_model.print_topics(-1,num_words=15):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.008*"bag" + 0.006*"fast" + 0.006*"haul" + 0.005*"attend" + 0.005*"connect" + 0.005*"issu" + 0.004*"final" + 0.004*"dinner" + 0.004*"hotel" + 0.004*"dublin" + 0.004*"night" + 0.004*"option" + 0.004*"aw" + 0.004*"sleep" + 0.004*"product"


Topic: 1 
Words: 0.007*"think" + 0.007*"nice" + 0.007*"main" + 0.006*"lunch" + 0.006*"chicken" + 0.006*"option" + 0.006*"flown" + 0.006*"money" + 0.005*"go" + 0.005*"rout" + 0.005*"sandwich" + 0.005*"wife" + 0.005*"beef" + 0.005*"cut" + 0.005*"start"


Topic: 2 
Words: 0.031*"refund" + 0.020*"voucher" + 0.017*"email" + 0.013*"phone" + 0.012*"week" + 0.011*"onlin" + 0.010*"receiv" + 0.009*"month" + 0.008*"sent" + 0.008*"issu" + 0.008*"money" + 0.007*"contact" + 0.007*"number" + 0.007*"websit" + 0.007*"rebook"


Topic: 3 
Words: 0.011*"baggag" + 0.011*"connect" + 0.008*"miss" + 0.008*"close" + 0.007*"hotel" + 0.007*"point" + 0.006*"desk" + 0.006*"compens" + 0.006*"bag" + 0.006*"result" + 0.005*"secur" + 0.005*"understand" + 0.005*"walk

In [19]:
#import Coherence model from gensim
from gensim.models import CoherenceModel
#compute coherence score
lda_model_coherence = CoherenceModel(model=lda_model,texts=processed_docs,dictionary=dictionary,
                                    coherence='c_v')
coherence_lda = lda_model_coherence.get_coherence()
print('\nCoherence Score:',coherence_lda)






Coherence Score: 0.2643318667918662


In [20]:
#define a helper function
def compute_coherence_score(corpus,dictionary,k,a):
    #instantiate the model instance based on k,a and b
    lda_model = gensim.models.LdaMulticore(corpus=corpus,id2word=dictionary,num_topics=k,alpha=a,
                                           passes=10,
                                           random_state=seed)
    lda_model_coherence = CoherenceModel(model=lda_model,texts=processed_docs,dictionary=dictionary,coherence='c_v')
    return lda_model_coherence.get_coherence()

In [21]:
#search for the best alpha and the number of topics --> one with the highest coherence score will be the best hyperparameter
alpha =['symmetric','asymmetric']

for x in alpha:
    print('Coherence Model with alpha = {}'.format(x))
    print('-------------------------------------------')
    print('\n')
    for i in range(5,10):
        score = compute_coherence_score(corpus=bow_corpus,dictionary=dictionary,k=i,a=x)        
        print(f'Coherence score with {i} topics is {score}')
        print('\n')

Coherence Model with alpha = symmetric
-------------------------------------------


Coherence score with 5 topics is 0.34536201276023315


Coherence score with 6 topics is 0.32535912395126915


Coherence score with 7 topics is 0.3206426167084266


Coherence score with 8 topics is 0.3191085079710594


Coherence score with 9 topics is 0.28833318417347553


Coherence Model with alpha = asymmetric
-------------------------------------------


Coherence score with 5 topics is 0.3569745994597604


Coherence score with 6 topics is 0.31517034073823563


Coherence score with 7 topics is 0.3392142219267681


Coherence score with 8 topics is 0.2871019837116862


Coherence score with 9 topics is 0.2996681257915492




In [22]:
'''
Though the model with alpha='asymmetric' and with 12 topics returned the highest coherence score. For the business use and simplicity, max topics of 7
and alpha='asymmetric' as the final model. This also returned a comparable coherence score of 0.6262
'''
lda_model_final = gensim.models.LdaMulticore(corpus=bow_corpus,num_topics=7,id2word=dictionary,passes=10,workers=2,alpha='asymmetric')

In [23]:
from pprint import pprint
# Print the Keyword in the 7 topics
pprint(lda_model_final.print_topics())


[(0,
  '0.007*"haul" + 0.006*"toilet" + 0.006*"short" + 0.006*"carrier" + '
  '0.006*"snack" + 0.005*"littl" + 0.005*"select" + 0.005*"free" + '
  '0.005*"europ" + 0.005*"room"'),
 (1,
  '0.005*"wine" + 0.005*"sleep" + 0.005*"secur" + 0.005*"came" + 0.005*"went" '
  '+ 0.005*"hard" + 0.004*"tray" + 0.004*"clear" + 0.004*"member" + '
  '0.004*"option"'),
 (2,
  '0.009*"rout" + 0.009*"week" + 0.008*"worst" + 0.006*"compani" + '
  '0.006*"york" + 0.006*"famili" + 0.005*"month" + 0.005*"home" + '
  '0.005*"complaint" + 0.005*"problem"'),
 (3,
  '0.008*"option" + 0.007*"attend" + 0.007*"nice" + 0.006*"upgrad" + '
  '0.006*"menu" + 0.006*"think" + 0.005*"aisl" + 0.005*"rout" + 0.005*"budget" '
  '+ 0.004*"start"'),
 (4,
  '0.008*"termin" + 0.008*"queue" + 0.007*"departur" + 0.006*"inform" + '
  '0.006*"point" + 0.006*"voucher" + 0.006*"desk" + 0.006*"allow" + '
  '0.006*"drop" + 0.006*"amsterdam"'),
 (5,
  '0.024*"refund" + 0.012*"miss" + 0.012*"email" + 0.012*"connect" + '
  '0.011*"phone" 

In [24]:
doc_lda = lda_model_final[bow_corpus]
doc_lda

<gensim.interfaces.TransformedCorpus at 0x25b78fafcd0>

In [25]:
# visulaise the topics
pyLDAvis.enable_notebook()
vis=pyLDAvis.gensim.prepare(lda_model,corpus,id2word)
vis

AttributeError: module 'pyLDAvis' has no attribute 'gensim'

In [None]:
lda = decomposition.LatentDirichletAllocation(n_components=10, *,
 doc_topic_prior=None, topic_word_prior=None, learning_method='batch', learning_decay=0.7, learning_offset=10.0, max_iter=10, batch_size=128, evaluate_every=-1, total_samples=1000000.0, perp_tol=0.1,
 mean_change_tol=0.001, max_doc_update_iter=100, n_jobs=None, verbose=0, random_state=None)