In [25]:
import warnings
warnings.filterwarnings('ignore',category=DeprecationWarning)

# import libraries
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import re,random,os
import seaborn as sns
from nltk.corpus import stopwords
import string
from pprint import pprint as pprint

# spacy for basic processing, optional, can use nltk as well(lemmatisation etc.)
import spacy

#gensim for LDA
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#plotting tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt

In [26]:
spacy.cli.download("en_core_web_sm")

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [27]:
df = pd.read_csv('/Users/yokk/Desktop/ARP-Grecale/MaseratiForum/MaseratiForum.csv')

Tokenize each comment (using gensim)
Remove stop words (including punctuations)
Lemmatize (using spacy)

In [28]:
# tokenize using gensims simple_preprocess

def sent_to_words(sentences, deacc=True):  # deacc=True removes punctuations
    for sentence in sentences:
        yield(simple_preprocess(str(sentence)))

# convert to list
data=df['Comment'].values.tolist()
data_words=list(sent_to_words(data))

print(data_words[3])

['thanks', 'for', 'the', 'reply', 'modenacanada', 'so', 'obviously', 'not', 'just', 'right', 'hand', 'drive', 'cars', 'delayed', 'sure', 'they', 'will', 'turn', 'up', 'eventually', 'and', 'yes', 'thinking']


In [29]:
# create a list of stop words
# string.punctuation (from the 'string' module) contains a list of punctuations

from nltk.corpus import stopwords
stop_words= stopwords.words('english') + list(string.punctuation)

In [30]:
# functions for removing stopwords and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts,allowed_postags=['NOUN','ADJ','VERB','ADV']):
    """https://spacy.io/api/annotation"""
    texts_out=[]
    for sent in texts:
        doc=nlp(' '.join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [31]:
# call functions
nlp = spacy.load("en_core_web_sm")
# remove stop words
data_words_npstops= remove_stopwords(data_words)

# initialize spacy 'en' model use only tagger since we don;t need parsing or NER
# python3 -m spacey download en
# spacy.cli.download("en")
nlp=spacy.load('en_core_web_sm',disable=['parser', 'ner'])

# lemmatization keeping only noun, adj, vb, adv
data_lemmatized=lemmatization(data_words_npstops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[3])

['thank', 'reply', 'obviously', 'right', 'hand', 'drive', 'car', 'delay', 'sure', 'turn', 'eventually', 'think']


In [32]:
# Create Dictionary 
id2word = corpora.Dictionary(data_lemmatized)  
# Create Corpus 
texts = data_lemmatized  
# Term Document Frequency 
corpus = [id2word.doc2bow(text) for text in texts]  
# View 
print(corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 3), (4, 1), (5, 1), (6, 1), (7, 1), (8, 3), (9, 2), (10, 2), (11, 1), (12, 1), (13, 1), (14, 4), (15, 1), (16, 2), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 2), (26, 4), (27, 2), (28, 1), (29, 1), (30, 1), (31, 4), (32, 1), (33, 1), (34, 2), (35, 1), (36, 1), (37, 2), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 2)]]


In [33]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [34]:
# Print the keyword of topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.162*"assist" + 0.103*"thank" + 0.061*"post" + 0.060*"care" + '
  '0.030*"forum" + 0.027*"provide" + 0.020*"com" + 0.019*"team" + 0.019*"sign" '
  '+ 0.017*"exist"'),
 (1,
  '0.100*"excited" + 0.094*"possible" + 0.030*"note" + 0.023*"appreciate" + '
  '0.021*"exact" + 0.019*"swap" + 0.017*"fact" + 0.016*"suppose" + '
  '0.013*"follow" + 0.011*"white"'),
 (2,
  '0.106*"car" + 0.087*"issue" + 0.047*"also" + 0.040*"software" + 0.033*"get" '
  '+ 0.032*"come" + 0.030*"time" + 0.027*"back" + 0.026*"even" + 0.026*"long"'),
 (3,
  '0.063*"sound" + 0.043*"stop" + 0.036*"car" + 0.031*"sometimes" + '
  '0.026*"message" + 0.024*"ever" + 0.023*"mile" + 0.023*"low" + '
  '0.023*"modena" + 0.022*"also"'),
 (4,
  '0.046*"car" + 0.029*"dealer" + 0.028*"week" + 0.026*"much" + 0.024*"say" + '
  '0.022*"go" + 0.022*"get" + 0.021*"month" + 0.020*"phone" + 0.019*"last"'),
 (5,
  '0.090*"progress" + 0.071*"agree" + 0.053*"production" + 0.050*"quote" + '
  '0.045*"fit" + 0.032*"less" + 0.030*"line" 

You can see the top keywords and weights associated with keywords contributing to topic.
Topics are words with highest probability in topic and the numbers are the probabilities of words appearing in topic distribution.
But looking at keywords can you guess what the topic is?
You may summarize topic-4 as space(In the above figure). Each one may have different topic at particular number , topic 4 might not be in the same place where it is now, it may be in topic 10 or any number.

In [35]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  
# a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -12.831918666125588

Coherence Score:  0.3130766593343335


In [None]:
# functions for removing stopwords and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts,allowed_postags=['NOUN','ADJ','VERB','ADV']):
    """https://spacy.io/api/annotation"""
    texts_out=[]
    for sent in texts:
        doc=nlp(' '.join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

Lower the perplexity better the model.
Higher the topic coherence, the topic is more human interpretable.


In [38]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

# feed the LDA model into the pyLDAvis instance
lda_viz = gensimvis.prepare(lda_model, corpus, id2word, n_jobs=1)

In [39]:
lda_viz