# TOPIC MODELLING USING Latent Dirichlet Allocation (LDA) 

In [169]:
# importing libraries
import pandas as pd
import re
import string
import numpy as np
from gensim import models,corpora
import nltk
from nltk.corpus import stopwords
from gensim.models import LdaModel
from gensim.corpora import Dictionary

In [170]:
from nltk.stem.wordnet import WordNetLemmatizer
wn = WordNetLemmatizer()

In [171]:
df=pd.read_csv('review.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,review
0,0,It's not really a review but my attempt to exp...
1,1,"Every once in a while a movie comes, that trul..."
2,2,"Last night COL Ferry and I (COL Coldwell, both..."
3,3,Nothing was typical about this. Everything was...
4,4,Legal historians and courtroom drama fans will...


In [172]:
df.drop(['Unnamed: 0'], inplace=True, axis=1)
df.head()

Unnamed: 0,review
0,It's not really a review but my attempt to exp...
1,"Every once in a while a movie comes, that trul..."
2,"Last night COL Ferry and I (COL Coldwell, both..."
3,Nothing was typical about this. Everything was...
4,Legal historians and courtroom drama fans will...


## DATA PREPROCESSING
We use Wordnet to find the meanings of words, synonyms, antonyms, and more. In addition, we use WordNetLemmatizer to get the root word. Filter out stop words.

In [173]:
docs = [x for x in df['review']]

In [174]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [175]:
# A function to prepare the text for topic modelling
def words(text):
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    text = regex.sub(" ", text.lower())
    words = text.split(" ")
    words = [re.sub('\S*@\S*\s?', '', sent) for sent in words]
    words = [re.sub('\s+', ' ', sent) for sent in words]
    words = [re.sub("\'", "", sent) for sent in words]
    words = [w for w in words if not len(w) < 2]
    words = [w for w in words if w not in stop_words]
    words = [wn.lemmatize(w) for w in words]
    
    return words

In [176]:
nltk.download('wordnet')
docs = [words(x) for x in df['review']]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [177]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)
print('Number of unique words in initital documents:', len(dictionary))

Number of unique words in initital documents: 1382


In [178]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x7f7d15db6610>

In [179]:
create corpus or bag of words
corpus = [dictionary.doc2bow(doc) for doc in docs]
print(len(corpus))

20


In [180]:
corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 3),
  (8, 1),
  (9, 1),
  (10, 2),
  (11, 1),
  (12, 2),
  (13, 3),
  (14, 1),
  (15, 2),
  (16, 1),
  (17, 2),
  (18, 2),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 2),
  (35, 1),
  (36, 3),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 2),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 2),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 2),
  (69, 1),
  (70, 2),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 3),
  (79, 1),
  (80, 2),
  (81, 1),
  (82, 1),
  (83, 2),
  (84, 1),
  (85, 1),
  (86, 3),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 1),
  (91, 1)

In [181]:
bow_doc_20 = corpus[19]

for i in range(len(bow_doc_20)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_20[i][0], 
                                                     dictionary[bow_doc_20[i][0]], 
                                                     bow_doc_20[i][1]))

Word 6 ("along") appears 1 time.
Word 87 ("every") appears 1 time.
Word 99 ("film") appears 5 time.
Word 147 ("like") appears 1 time.
Word 169 ("narrative") appears 1 time.
Word 170 ("nature") appears 1 time.
Word 235 ("son") appears 1 time.
Word 268 ("wound") appears 1 time.
Word 271 ("action") appears 2 time.
Word 287 ("impact") appears 1 time.
Word 345 ("consequence") appears 1 time.
Word 349 ("design") appears 1 time.
Word 401 ("long") appears 1 time.
Word 448 ("set") appears 1 time.
Word 481 ("war") appears 1 time.
Word 482 ("warfare") appears 1 time.
Word 483 ("way") appears 1 time.
Word 663 ("take") appears 1 time.
Word 677 ("violence") appears 1 time.
Word 694 ("fantastic") appears 1 time.
Word 788 ("past") appears 1 time.
Word 789 ("piece") appears 1 time.
Word 803 ("scene") appears 2 time.
Word 812 ("thank") appears 1 time.
Word 832 ("car") appears 1 time.
Word 872 ("game") appears 1 time.
Word 895 ("seeing") appears 1 time.
Word 997 ("honest") appears 1 time.
Word 1036 ("sou


##Topic Modeling using LDA

LDA : Latent Dirichlet Allocation is a probabilistic model. It is a unsupervised machine learning technique. And to obtain cluster assignments, it uses two probability values: P( word | topics) and P( topics | documents).


In [182]:
def get_lda_topics(model, num_topics):
    word_dict = {};
    for i in range(num_topics):
        words = model.show_topic(i, topn = 20);
        word_dict['Topic ' + '{:02d}'.format(i+1)] = [i[0] for i in words];
    return pd.DataFrame(word_dict);

In [183]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word = dictionary,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=500,
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True)

In [184]:
get_lda_topics(lda_model, 10)

Unnamed: 0,Topic 01,Topic 02,Topic 03,Topic 04,Topic 05,Topic 06,Topic 07,Topic 08,Topic 09,Topic 10
0,film,ki,film,movie,movie,movie,movie,food,film,timeline
1,movie,park,watch,ending,film,alita,everything,floor,movie,red
2,become,mr,war,like,well,story,good,month,like,blue
3,one,taek,protagonist,going,see,good,watch,level,hemsworth,protagonist
4,however,film,even,film,time,people,done,movie,see,film
5,world,family,viewer,actually,really,world,story,girl,take,get
6,also,movie,know,watch,also,love,lover,prisoner,scene,sator
7,high,trial,served,see,made,film,mystery,miharu,narrative,kat
8,month,instead,mission,horror,thriller,life,highly,platform,fight,evil
9,stab,also,trench,list,make,focus,want,protagonist,wick,back
