# Imports

In [1]:
from IPython.core.display import display, HTML
# Inject a CSS rule so elements with the .container class use 90% of the page width.
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import warnings
# No category or module filter is given, so this silences every warning
# raised for the rest of the session.
warnings.filterwarnings("ignore")

In [3]:
import pandas as pd
import numpy as np

# Bind the tqdm *class* (not the module). A later cell runs
# `from tqdm import tqdm`, which rebinds the name `tqdm` to the class; with
# `import tqdm` here, re-running this cell afterwards would fail on
# `tqdm.tqdm.pandas()`. Importing the class keeps both cells consistent.
from tqdm import tqdm
# Enables the `progress_apply` method used on pandas objects further down.
tqdm.pandas()

# Pull posts data from Snowflake

In [4]:
from utils.get_data import get_posts

In [5]:
# SQL for up to 20,000 published posts of type 'post' with site_prefix 'WP'.
# The first column is the post content with 'post_id: <post_id>' appended;
# the second column is aliased post_id and holds '<post_id>_<site_prefix>'.
# NOTE(review): the displayed result runs 99918_WP ... 100098_WP, i.e. in
# lexicographic order, which suggests ORDER BY POST_ID resolves to the string
# alias rather than the numeric column - confirm if numeric order matters.
query = """
    SELECT distinct post_content || 'post_id: ' || post_id as post_content
        , post_id || '_' || site_prefix post_id
    FROM prod."posts"
    WHERE post_status = 'publish' and post_type = 'post'
    and site_prefix = 'WP'
    ORDER BY POST_ID DESC
    LIMIT 20000
"""

In [6]:
posts_df = get_posts(query)

In [7]:
posts_df

Unnamed: 0,POST_CONTENT,POST_ID
0,"As a big sister to four very sweet, hilarious,...",99918_WP
1,I spent my childhood growing up on a farm/ranc...,99917_WP
2,When am I going to get better at managing the ...,99841_WP
3,I know I am not alone in struggling to find a ...,99838_WP
4,"If you:\r\n<ul>\r\n \t<li>are new to migraine,...",99836_WP
...,...,...
3467,We asked our contributors a series of question...,100098_WP
3468,I feel as though I am always walking a fine li...,100073_WP
3469,"I have used all <a href=""/migraine-treatment/t...",100062_WP
3470,and flooded a shared back hallway between a re...,100037_WP


# Cleaning Post Text

In [8]:
from utils.clean_text import remove_html
from utils.clean_text import remove_between_square_brackets
from utils.clean_text import remove_post_id
from utils.clean_text import remove_backslash_symbols

In [9]:
# Run the cleaners one after another, in this exact order. Every pass
# overwrites POST_CONTENT with its result and shows its own progress bar.
text_cleaners = (
    remove_html,
    remove_between_square_brackets,
    remove_post_id,
    remove_backslash_symbols,
)
for clean_step in text_cleaners:
    posts_df.POST_CONTENT = posts_df.POST_CONTENT.progress_apply(clean_step)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3472/3472 [00:03<00:00, 1014.44it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3472/3472 [00:00<00:00, 109890.83it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3472/3472 [00:00<00:00, 133220.72it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3472/3472 [00:00<00:00, 147380.06it/s]


In [10]:
posts_df.POST_CONTENT[0]

'As a big sister to four very sweet, hilarious, and very active younger brothers, I always want to provide fun and engaging entertainment whenever I spend time with them. I love to see a huge smile on their faces, and I love being able to provide or facilitate exciting environments and activities.That is why, despite my apprehension because of migraine...I volunteered to take my younger brother to his third basketball game of the season: loud cheering fans, bright lights, that super, incredibly obnoxious horn, screaming preteens, referee whistles and all. Sometimes it can be really tough to keep up with my kid brothers when I am experiencing pain from migraines, and even tougher to navigate that pain when I am in the role of caretaker for someone else.    Living with chronic migraine can often entail being caught in situations in which many of us are having to choose between our family, friends, jobs, and livelihoods or migraine. Here, I am going to discuss how I navigate a trigger lad

# LDA

In [11]:
import warnings
warnings.filterwarnings('ignore',category=DeprecationWarning)

# import libraries
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import re,random,os
import seaborn as sns
from nltk.corpus import stopwords
import string
from pprint import pprint as pprint

# spacy for basic processing, optional, can use nltk as well(lemmatisation etc.)
import spacy

#gensim for LDA
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#plotting tools
import pyLDAvis
import pyLDAvis.gensim #dont skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [12]:
# Use the full pipeline package name: the 'en' shortcut is deprecated as of
# spaCy v3.0, and 'en_core_web_sm' is the name passed to spacy.load() below.
spacy.cli.download("en_core_web_sm")

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


You should consider upgrading via the '/Users/abratun/Repos/KeywordExtraction/venv/bin/python -m pip install --upgrade pip' command.


In [13]:
# tokenize using gensim's simple_preprocess
def sent_to_words(sentences, deacc=True):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Parameters
    ----------
    sentences : iterable
        Documents to tokenize; each item is converted with ``str()`` first.
    deacc : bool, default True
        Forwarded to ``simple_preprocess``; when true, accent marks are
        stripped from the tokens. (Previously this argument was accepted but
        never passed on, so it had no effect.)

    Yields
    ------
    list of str
        The tokens of one document, in input order.
    """
    for sentence in sentences:
        yield simple_preprocess(str(sentence), deacc=deacc)

# Convert the cleaned posts to a plain Python list, then tokenize every post.
data = posts_df['POST_CONTENT'].values.tolist()
data_words = [tokens for tokens in sent_to_words(data)]

In [14]:
#sample
print(data_words[0])

['as', 'big', 'sister', 'to', 'four', 'very', 'sweet', 'hilarious', 'and', 'very', 'active', 'younger', 'brothers', 'always', 'want', 'to', 'provide', 'fun', 'and', 'engaging', 'entertainment', 'whenever', 'spend', 'time', 'with', 'them', 'love', 'to', 'see', 'huge', 'smile', 'on', 'their', 'faces', 'and', 'love', 'being', 'able', 'to', 'provide', 'or', 'facilitate', 'exciting', 'environments', 'and', 'activities', 'that', 'is', 'why', 'despite', 'my', 'apprehension', 'because', 'of', 'migraine', 'volunteered', 'to', 'take', 'my', 'younger', 'brother', 'to', 'his', 'third', 'basketball', 'game', 'of', 'the', 'season', 'loud', 'cheering', 'fans', 'bright', 'lights', 'that', 'super', 'incredibly', 'obnoxious', 'horn', 'screaming', 'preteens', 'referee', 'whistles', 'and', 'all', 'sometimes', 'it', 'can', 'be', 'really', 'tough', 'to', 'keep', 'up', 'with', 'my', 'kid', 'brothers', 'when', 'am', 'experiencing', 'pain', 'from', 'migraines', 'and', 'even', 'tougher', 'to', 'navigate', 'that

In [15]:
# Stop-word list: NLTK's English stop words followed by every single
# character of string.punctuation.
from nltk.corpus import stopwords
stop_words = [*stopwords.words('english'), *string.punctuation]

In [16]:
# functions for removing stopwords and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop tokens found in ``stop_words``.

    Parameters
    ----------
    texts : iterable
        Documents; each one is passed through ``str()`` and
        ``simple_preprocess`` before filtering, so both raw strings and
        token lists are accepted.

    Returns
    -------
    list of list of str
        One filtered token list per input document, in input order.
    """
    # stop_words is a module-level list; build a set once so the membership
    # test is not a linear scan for every token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Relies on the module-level spaCy pipeline ``nlp``, which must be loaded
    before this function is called.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents; the tokens of each document are joined with
        single spaces before being handed to spaCy.
    allowed_postags : iterable of str
        Coarse part-of-speech tags (``token.pos_``) to keep. The default is
        an immutable tuple instead of a shared mutable list.

    Returns
    -------
    list of list of str
        For every document, the lemmas of the tokens whose tag is allowed.
    """
    allowed = frozenset(allowed_postags)
    joined_docs = (' '.join(tokens) for tokens in texts)
    # nlp.pipe processes the texts as a stream (in batches) and yields the
    # Doc objects in input order, avoiding one full nlp() call per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed]
        for doc in nlp.pipe(joined_docs)
    ]

In [17]:
# Run the preprocessing steps.

# 1) Drop stop words from the tokenized posts.
data_words_npstops = remove_stopwords(data_words)

# 2) Load the small English spaCy pipeline with the parser and NER
#    components disabled; only lemmas and part-of-speech tags are used below.
#    (One-time install from a shell: python -m spacy download en_core_web_sm)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# 3) Lemmatize, keeping only nouns, adjectives, verbs and adverbs.
data_lemmatized = lemmatization(
    data_words_npstops,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

In [18]:
print(data_lemmatized[0])

['big', 'sister', 'sweet', 'hilarious', 'active', 'young', 'brother', 'always', 'want', 'provide', 'fun', 'engage', 'entertainment', 'spend', 'time', 'love', 'see', 'huge', 'smile', 'face', 'love', 'able', 'provide', 'facilitate', 'exciting', 'environment', 'activity', 'volunteer', 'take', 'young', 'brother', 'third', 'basketball', 'game', 'season', 'loud', 'cheer', 'fan', 'bright', 'light', 'super', 'incredibly', 'obnoxious', 'horn', 'scream', 'preteen', 'referee', 'whistle', 'sometimes', 'really', 'tough', 'keep', 'kid', 'brother', 'experience', 'pain', 'migraine', 'even', 'tough', 'navigate', 'pain', 'role', 'caretaker', 'else', 'live', 'chronic', 'often', 'entail', 'catch', 'situation', 'many', 'choose', 'family', 'friend', 'job', 'livelihood', 'migraine', 'go', 'discuss', 'situation', 'environment', 'know', 'many', 'trigger', 'present', 'decide', 'go', 'want', 'support', 'love', 'calm', 'storm', 'certainly', 'feel', 'anxious', 'take', 'brother', 'basketball', 'game', 'possible', '

In [19]:
# Build the gensim dictionary from the lemmatized posts.
id2word = corpora.Dictionary(data_lemmatized)

# Bag-of-words corpus: one doc2bow result per lemmatized post.
corpus = list(map(id2word.doc2bow, data_lemmatized))

In [20]:
# sample
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 4), (6, 2), (7, 3), (8, 2), (9, 2), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 3), (16, 3), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 2), (27, 14), (28, 2), (29, 1), (30, 2), (31, 1), (32, 2), (33, 2), (34, 1), (35, 1), (36, 2), (37, 1), (38, 1), (39, 1), (40, 2), (41, 2), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 2), (51, 4), (52, 1), (53, 2), (54, 1), (55, 1), (56, 2), (57, 1), (58, 2), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 2), (68, 1), (69, 1), (70, 1), (71, 3), (72, 1), (73, 1), (74, 3), (75, 1), (76, 1), (77, 1), (78, 2), (79, 2), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 11), (86, 2), (87, 1), (88, 1), (89, 2), (90, 1), (91, 1), (92, 1), (93, 1), (94, 3), (95, 18), (96, 4), (97, 1), (98, 10), (99, 1), (100, 4), (101, 1), (102, 1), (103, 1), (104, 2), (105, 1), (106, 1), (107, 1), (108, 2), (109, 1), (110,

In [21]:
from tqdm import tqdm

In [22]:
# train one LDA model per candidate num_topics and score its c_v coherence
def compute_coherence_values(dictionary, corpus, texts, num_topics_range,alpha_range):
    """Train an LDA model for each topic count and record its coherence.

    Every model is trained with ``alpha='auto'`` and ``random_state=100``.
    ``alpha_range`` is accepted but not used by the body.

    Parameters
    ----------
    dictionary : passed to ``LdaModel`` as ``id2word`` and to ``CoherenceModel``.
    corpus : passed to ``LdaModel`` as ``corpus``.
    texts : passed to ``CoherenceModel`` as ``texts``.
    num_topics_range : iterable of topic counts to try, in order.
    alpha_range : unused.

    Returns
    -------
    (list, list)
        The trained models, and a parallel list of
        ``('auto', num_topics, coherence)`` tuples.
    """
    trained_models = []
    scores = []
    for topic_count in tqdm(num_topics_range):
        model = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=topic_count,
            random_state=100,
            update_every=1,
            chunksize=100,
            passes=10,
            alpha='auto',
            per_word_topics=True,
        )
        trained_models.append(model)
        scorer = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        scores.append(('auto', topic_count, scorer.get_coherence()))
    return trained_models, scores

In [23]:
# Build one model per candidate topic count. alpha_range is passed through as
# well, although compute_coherence_values trains every model with alpha='auto'.
num_topics_range = list(range(10, 101, 10))
alpha_range = [0.01, 0.1, 1]
model_list, coherence_values = compute_coherence_values(
    dictionary=id2word,
    corpus=corpus,
    texts=data_lemmatized,
    num_topics_range=num_topics_range,
    alpha_range=alpha_range,
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [17:00<00:00, 102.04s/it]


In [24]:
# One row per trained model: alpha setting, topic count, c_v coherence.
coherence_columns = ['alpha', 'num_topics', 'coherence_value']
coherence_df = pd.DataFrame(data=coherence_values, columns=coherence_columns)
coherence_df

Unnamed: 0,alpha,num_topics,coherence_value
0,auto,10,0.430997
1,auto,20,0.479101
2,auto,30,0.464114
3,auto,40,0.464928
4,auto,50,0.452799
5,auto,60,0.433113
6,auto,70,0.427706
7,auto,80,0.429208
8,auto,90,0.410564
9,auto,100,0.421976


In [37]:
# Train the final LDA model with 20 topics (the best-scoring count in the
# coherence table), using the same settings as the sweep.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [38]:
# print the top words of all 20 topics
lda_model.print_topics(num_topics=20)

[(0,
  '0.042*"doctor" + 0.033*"treatment" + 0.031*"patient" + 0.021*"help" + 0.019*"use" + 0.017*"care" + 0.015*"find" + 0.015*"new" + 0.014*"therapy" + 0.014*"option"'),
 (1,
  '0.157*"dog" + 0.107*"holiday" + 0.100*"season" + 0.027*"animal" + 0.024*"winter" + 0.020*"candy" + 0.017*"organizer" + 0.010*"youtube" + 0.010*"affair" + 0.010*"gathering"'),
 (2,
  '0.017*"make" + 0.015*"help" + 0.015*"trigger" + 0.012*"find" + 0.012*"time" + 0.011*"stress" + 0.010*"keep" + 0.010*"body" + 0.009*"also" + 0.009*"migraine"'),
 (3,
  '0.025*"promotion" + 0.020*"winner" + 0.018*"prize" + 0.016*"sponsor" + 0.015*"rule" + 0.013*"official" + 0.012*"use" + 0.011*"insurance" + 0.011*"health" + 0.011*"enter"'),
 (4,
  '0.242*"trigger" + 0.199*"food" + 0.089*"diet" + 0.084*"eat" + 0.049*"allergy" + 0.025*"sugar" + 0.016*"gluten" + 0.013*"butter" + 0.012*"fat" + 0.012*"dairy"'),
 (5,
  '0.138*"medication" + 0.047*"take" + 0.034*"doctor" + 0.032*"insurance" + 0.029*"treatment" + 0.026*"prescription" + 0.0

In [39]:
# c_v coherence of the final 20-topic model
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score:', coherence_lda)


Coherence Score: 0.47910051211621296


In [40]:
# visualise the topics
# NOTE(review): newer pyLDAvis releases expose this module as
# pyLDAvis.gensim_models - confirm against the installed version.
pyLDAvis.enable_notebook()
vis=pyLDAvis.gensim.prepare(lda_model,corpus,id2word)
vis

In [59]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=posts_df.POST_CONTENT.to_list()):
    """Build a per-document table of the dominant LDA topic.

    Note that the default argument values are evaluated once, when this
    ``def`` is executed; pass the arguments explicitly to use current data.

    Parameters
    ----------
    ldamodel : trained LDA model
        Indexed with ``corpus`` to obtain per-document topic weights; its
        ``per_word_topics`` flag decides whether each result is a tuple whose
        first item is the topic list.
    corpus : bag-of-words documents to score.
    texts : list
        Original document texts, positionally aligned with ``corpus``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_contribution`` (rounded to four
        decimals) and ``Topic_keywords``, followed by one unnamed column
        (label ``0``) holding ``texts``. ``Dominant_Topic`` holds integers
        when every document has at least one topic. A document with no
        topics gets a NaN/None row so that later rows stay aligned with
        their texts.
    """
    column_names = ['Dominant_Topic', 'Perc_contribution', 'Topic_keywords']
    keywords_by_topic = {}  # topic id -> "word, word, ..." (computed once per topic)
    records = []

    for row_list in tqdm(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # Keep a placeholder so the positional join with `texts` below
            # does not shift every following document by one row.
            records.append((float('nan'), float('nan'), None))
            continue

        # First topic with the highest weight; same pick as a stable
        # descending sort followed by taking element 0.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0
    # and appending row by row is quadratic. This also works for zero rows.
    sent_topics_df = pd.DataFrame(records, columns=column_names)

    # add original text to the end
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model,
    corpus=corpus,
    texts=posts_df.POST_CONTENT.to_list(),
)

# Turn the positional index into an explicit document-number column and give
# all five columns readable names.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_no',
    'Dominant_topic',
    'Topic_perc_contrib',
    'Keywords',
    'Text',
]

#show
#df_dominant_topic.head(10)

3472it [00:34, 102.08it/s]


In [61]:
df_dominant_topic.Keywords.value_counts()

feel, time, get, go, know, day, work, take, make, thing                                         1554
pain, migraine, attack, experience, symptom, day, time, get, feel, try                           570
make, help, trigger, find, time, stress, keep, body, also, migraine                              357
disease, migraine, risk, symptom, increase, include, patient, blood, woman, heart                234
doctor, treatment, patient, help, use, care, find, new, therapy, option                          189
people, chronic, live, health, migraine, condition, life, many, other, disease                   119
share, com, community, story, awareness, advocate, comment, member, post, article                104
light, head, sleep, go, get, feel, eye, back, night, car                                          98
cgrp, effect, drug, side, use, treatment, take, people, nerve, product                            89
medication, take, doctor, insurance, treatment, prescription, use, drug, prescribe, abortiv