In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
#import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Optional: configure logging (gensim reports through it); only ERROR and above is shown
import logging
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Silence every warning category for the rest of the session (not only DeprecationWarning)
import warnings
warnings.filterwarnings("ignore")



In [2]:
# NLTK's English stop words, followed by a few extra tokens to filter out
from nltk.corpus import stopwords
stop_words = [*stopwords.words('english'), 'from', 'subject', 're', 'edu', 'use']

In [3]:
import os
from glob import glob

def get_file_names(foldername):
    """Return the relative paths of the ``*.txt`` files directly inside ``foldername``.

    Args:
        foldername: Folder to scan, given relative to the current working
            directory.

    Returns:
        A sorted list of ``os.path.join(foldername, <file name>)`` strings, one
        per matching file. The list is empty when the folder does not exist or
        holds no ``.txt`` files.
    """
    # Build the pattern with os.path.join rather than a hard-coded "\\" so the
    # lookup also matches files on non-Windows systems.
    pattern = os.path.join(os.getcwd(), foldername, "*.txt")
    # glob gives no ordering guarantee; sort so the document order is reproducible.
    return sorted(
        os.path.join(foldername, os.path.basename(path)) for path in glob(pattern)
    )

In [4]:
import tools

def noun_extraction_almawadie(foldername):
    """Pass every ``.txt`` file found in ``foldername`` through ``tools.doc_to_list``.

    The paths returned by ``get_file_names`` are UTF-8 encoded to ``bytes`` and
    handed to ``tools.doc_to_list`` together with the value returned by
    ``tools.read_lines()``.

    Returns:
        Whatever ``tools.doc_to_list`` returns.
        NOTE(review): the calling cell treats it as one sequence of byte tokens
        per file - confirm against the ``tools`` module.
    """
    noun_list = tools.read_lines()
    encoded_paths = [path.encode('utf-8') for path in get_file_names(foldername)]
    return tools.doc_to_list(encoded_paths, noun_list)

In [5]:
# Extract the tokens of every script and decode each byte token to str
data_words = [
    [token.decode("utf") for token in page]
    for page in noun_extraction_almawadie("scripts_small")
]

# Phrase detection: bigrams first, then trigrams on top of the bigram-merged text.
# A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Phraser wrappers: a faster way to merge a token list into bigrams/trigrams
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: the first document after bigram + trigram merging
print(trigram_mod[bigram_mod[data_words[0]]])

['screenplay', 'sam', 'hamm', 'hells', 'christmas', 'weeks', 'snow', 'white', 'blanket', 'city', 'serene', 'peace', 'strangers', 'wave', 'salvation', 'army', 'ring', 'bells', 'as', 'night', 'falls', 'sign', 'broad_avenue', 'noel', 'shopping', 'days', 'left', 'christmas', 'streets', 'shoppers', 'souvenir', 'store', 'mom', 'squabbling', 'year', 'old', 'storefronts', 'batman', 'merchandise', 'key', 'chains', 'figurines', 'kid', 'batman', 'baseball', 'cap', 'little', 'black', 'cape', 'mom', 'store', 'window', 'scrap', 'metal', 'sign', 'reading', 'fragments', 'panhandler', 'entrance', 'his', 'array', 'jacket', 'sweatshirt', 'logo', 'winter', 'hours', 'blizzard', 'drifts', 'feet', 'deep', 'streets', 'cars', 'snow', 'plow', 'size', 'tank', 'frame', 'as', 'group', 'mufflers', 'weather', 'street', 'spreading', 'cheer', 'goodwill', 'lovely', 'rendition', 'silent', 'night', 'delivery', 'truck', 'snow', 'chains', 'wake', 'plow', 'sides', 'cartoon', 'advertising', 'popular', 'ice', 'cream', 'snack'

In [6]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize each document and drop every token present in ``stop_words``.

    Args:
        texts: Iterable of documents. Each document is converted with ``str()``
            and tokenized by gensim's ``simple_preprocess``.

    Returns:
        A list holding, for each input document, the list of tokens that are
        not stop words.
    """
    # Build the lookup once per call: set membership is O(1), whereas the
    # module-level ``stop_words`` list used to be scanned for every token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply ``bigram_mod`` to every document in ``texts``; return the results as a list."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize each document, keeping only tokens whose POS tag is allowed.

    Uses the module-level spaCy pipeline ``nlp``, which must be defined before
    this function is called.

    Args:
        texts: Iterable of token lists. Each list is joined with single spaces
            and run through ``nlp``.
        allowed_postags: Collection of ``token.pos_`` values to keep. The
            default is an immutable tuple so it cannot be altered between calls.

    Returns:
        A list with one list of ``token.lemma_`` strings per input document.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [7]:
# Load the small English spaCy pipeline with the parser and NER components disabled.
# Install it first with: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Drop stop words, then merge the detected bigrams
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Lemmatize, keeping nouns only
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN'])

# Show the first lemmatized document
print(data_lemmatized[:1])

[['sam', 'hamm', 'christmas', 'week', 'snow', 'blanket', 'city', 'peace', 'stranger', 'salvation', 'army', 'ring', 'bell', 'night', 'shopping', 'day', 'christmas', 'street', 'shopper', 'souvenir', 'store', 'mom', 'year', 'storefront', 'batman', 'merchandise', 'chain', 'figurine', 'kid', 'batman', 'baseball', 'cap', 'cape', 'mom', 'store', 'window', 'scrap', 'metal', 'sign', 'reading', 'fragment', 'entrance', 'array', 'jacket', 'sweatshirt', 'logo', 'winter', 'hour', 'blizzard', 'foot', 'street', 'car', 'snow', 'plow', 'size', 'tank', 'frame', 'group', 'muffler', 'street', 'cheer', 'goodwill', 'rendition', 'night', 'delivery', 'truck', 'snow', 'chain', 'plow', 'side', 'cartoon', 'advertising', 'ice', 'cream', 'snack', 'bar', 'time', 'year', 'santa', 'bell', 'bucket', 'wave', 'truck', 'rounds_corner', 'carol', 'explosion', 'rock', 'street', 'gunfire', 'security', 'alarm', 'corner', 'carnage', 'street', 'window', 'jewelry', 'store', 'security', 'guard', 'snow', 'robber', 'camouflage', 'ge

In [8]:
# The lemmatized documents are the texts the corpus is built from
texts = data_lemmatized

# Dictionary built from the tokens of every document
id2word = corpora.Dictionary(texts)

# Bag-of-words representation: (token id, count) pairs for each document
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first document
print(corpus[:1])

[[(0, 3), (1, 1), (2, 5), (3, 3), (4, 5), (5, 1), (6, 4), (7, 1), (8, 1), (9, 2), (10, 1), (11, 2), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 11), (22, 2), (23, 6), (24, 1), (25, 2), (26, 2), (27, 1), (28, 1), (29, 1), (30, 4), (31, 2), (32, 2), (33, 2), (34, 1), (35, 3), (36, 1), (37, 1), (38, 3), (39, 1), (40, 1), (41, 1), (42, 1), (43, 2), (44, 2), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 36), (51, 5), (52, 3), (53, 1), (54, 3), (55, 1), (56, 1), (57, 1), (58, 1), (59, 5), (60, 1), (61, 2), (62, 2), (63, 4), (64, 1), (65, 4), (66, 1), (67, 1), (68, 2), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 5), (76, 1), (77, 1), (78, 1), (79, 1), (80, 3), (81, 1), (82, 2), (83, 7), (84, 1), (85, 3), (86, 7), (87, 5), (88, 2), (89, 9), (90, 2), (91, 2), (92, 14), (93, 2), (94, 6), (95, 2), (96, 4), (97, 2), (98, 1), (99, 35), (100, 1), (101, 111), (102, 27), (103, 2), (104, 2), (105, 3), (106, 1), (107, 3), (108, 8), (109, 2), (

In [9]:
# First document's bag-of-words with the token ids resolved back to their words
[
    [(id2word[token_id], count) for token_id, count in bow]
    for bow in corpus[:1]
]

[[('accelerator', 3),
  ('accent', 1),
  ('access', 5),
  ('acknowledgement', 3),
  ('action', 5),
  ('activity', 1),
  ('ad', 4),
  ('address', 1),
  ('adult', 1),
  ('advance', 2),
  ('advertisement', 1),
  ('advertising', 2),
  ('advise', 1),
  ('aegis', 1),
  ('afternoon', 2),
  ('age', 1),
  ('agility', 1),
  ('aging', 1),
  ('agitation', 1),
  ('aid', 1),
  ('aim', 1),
  ('air', 11),
  ('aisle', 2),
  ('alarm', 6),
  ('aluminum', 1),
  ('amazement', 2),
  ('ambulance', 2),
  ('amusement', 1),
  ('ancestor', 1),
  ('andy', 1),
  ('angel', 4),
  ('angle', 2),
  ('animal', 2),
  ('answer', 2),
  ('antenna', 1),
  ('anticipation', 3),
  ('antique', 1),
  ('antiquity', 1),
  ('apartment', 3),
  ('ape', 1),
  ('appearance', 1),
  ('appliance', 1),
  ('application', 1),
  ('approach', 2),
  ('apron', 2),
  ('arc', 1),
  ('arch', 1),
  ('archaeologist', 1),
  ('area', 1),
  ('argument', 1),
  ('arm', 36),
  ('armor', 5),
  ('armory', 3),
  ('armpit', 1),
  ('army', 3),
  ('arraignment', 

In [10]:
# Train a 5-topic LDA model on the bag-of-words corpus
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=20,
    alpha='auto',
    per_word_topics=True,
)

# Print the keywords of each of the 5 topics
pprint(lda_model.print_topics())

# Apply the trained model to the corpus
doc_lda = lda_model[corpus]

[(0,
  '0.021*"jar" + 0.017*"ship" + 0.011*"han" + 0.011*"rey" + 0.008*"vader" + '
  '0.008*"head" + 0.008*"move" + 0.008*"side" + 0.007*"door" + 0.007*"fire"'),
 (1,
  '0.000*"ship" + 0.000*"hand" + 0.000*"moment" + 0.000*"eye" + 0.000*"sir" + '
  '0.000*"captain" + 0.000*"bruce" + 0.000*"kirk" + 0.000*"time" + '
  '0.000*"head"'),
 (2,
  '0.021*"bruce" + 0.016*"batman" + 0.015*"vicki" + 0.012*"joker" + '
  '0.009*"hand" + 0.009*"car" + 0.008*"cop" + 0.007*"door" + 0.006*"face" + '
  '0.006*"street"'),
 (3,
  '0.027*"peter" + 0.018*"liz" + 0.010*"door" + 0.009*"flash" + 0.009*"power" '
  '+ 0.008*"time" + 0.007*"way" + 0.007*"wall" + 0.007*"weiner" + '
  '0.007*"scimitar"'),
 (4,
  '0.028*"kirk" + 0.021*"ship" + 0.015*"captain" + 0.012*"enterprise" + '
  '0.011*"sir" + 0.010*"datum" + 0.009*"moment" + 0.009*"eye" + 0.008*"mccoy" '
  '+ 0.008*"bridge"')]


In [11]:
# Per gensim's documentation, log_perplexity returns the per-word likelihood
# bound rather than the perplexity itself (perplexity = 2 ** (-bound)), so a
# higher, i.e. less negative, printed value indicates a better fit.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Topic coherence ('c_v' measure) computed over the lemmatized texts
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.076144169418216

Coherence Score:  0.3659236205576027


In [1]:
# Visualize the topics
# NOTE: this cell needs `pyLDAvis` / `gensimvis` from the import cell and
# `lda_model`, `corpus` and `id2word` from the cells above, so run those first.
# The recorded NameError comes from executing this cell in a kernel where
# `pyLDAvis` had not been imported.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
# Bare last expression so the notebook renders the prepared visualization
vis

NameError: name 'pyLDAvis' is not defined