### Delivery 5 - Natural Language Processing - Topic Modelling

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os 
root_dir = "/content/drive/My Drive/"
project_folder = "Colab Notebooks/MBD/"

def create_and_set_working_directory(project_folder):
  # check if your project folder exists. if not, it will be created.
  if os.path.isdir(root_dir + project_folder) == False:
    os.mkdir(root_dir + project_folder)
    print(root_dir + project_folder + ' did not exist but was created.')
  # change the OS to use your project folder as the working directory
  os.chdir(root_dir + project_folder)

create_and_set_working_directory(project_folder)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import random

Reading the questions from the quora dataset

In [59]:
qq = pd.read_csv('quora_questions.csv')
qq = qq.head(1000)

ATTEMPT 1 : Coherence

In [60]:
import re
# Remove punctuation
qq['Question_processed'] = qq['Question'].map(lambda x: re.sub('[,\.!?]', '', x))
# Convert the titles to lowercase
qq['Question_processed'] = qq['Question_processed'].map(lambda x: x.lower())
# Print out the first rows of papers
qq['Question_processed'].head()

0    what is the step by step guide to invest in sh...
1    what is the story of kohinoor (koh-i-noor) dia...
2    how can i increase the speed of my internet co...
3     why am i mentally very lonely how can i solve it
4    which one dissolve in water quikly sugar salt ...
Name: Question_processed, dtype: object

In [61]:
import gensim
from gensim.utils import simple_preprocess
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
data = qq.Question_processed.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1][0][:30])

['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india']


In [62]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [63]:
# NLTK Stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [64]:
import spacy
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[:1])

[['step', 'step', 'invest', 'share', 'market']]


In [65]:
import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 2)]]


In [66]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
lda_model.num_topics

10

In [67]:
from pprint import pprint
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.025*"cause" + 0.025*"school" + 0.018*"weight" + 0.018*"student" + '
  '0.017*"share" + 0.016*"hard" + 0.016*"would" + 0.016*"find" + 0.015*"know" '
  '+ 0.015*"value"'),
 (1,
  '0.041*"good" + 0.040*"movie" + 0.035*"people" + 0.027*"become" + '
  '0.021*"job" + 0.020*"start" + 0.018*"improve" + 0.017*"year" + '
  '0.014*"think" + 0.014*"old"'),
 (2,
  '0.034*"learn" + 0.026*"language" + 0.017*"programming" + 0.017*"medicine" + '
  '0.015*"computer" + 0.015*"ask" + 0.014*"ever" + 0.013*"child" + '
  '0.012*"video" + 0.012*"world"'),
 (3,
  '0.150*"good" + 0.025*"much" + 0.024*"life" + 0.017*"place" + 0.015*"class" '
  '+ 0.015*"exam" + 0.014*"important" + 0.013*"start" + 0.012*"way" + '
  '0.011*"make"'),
 (4,
  '0.022*"go" + 0.016*"love" + 0.015*"really" + 0.013*"import" + 0.013*"back" '
  '+ 0.012*"term" + 0.012*"benefit" + 0.011*"effect" + 0.010*"cost" + '
  '0.010*"part"'),
 (5,
  '0.027*"number" + 0.026*"change" + 0.023*"feel" + 0.022*"work" + '
  '0.021*"money" + 0.020*"

In [69]:
from gensim.models import CoherenceModel
# Compute Coherence Score
co = dict()
for i in range(3,30,3):
  lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                        id2word=id2word,
                                        num_topics=i, 
                                        random_state=100,
                                        chunksize=100,
                                        passes=10,
                                        per_word_topics=True)
  coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
  coherence_lda = coherence_model_lda.get_coherence()
  co[i] = coherence_lda
  print('\nCoherence Score: ', coherence_lda, 'with',lda_model.num_topics,'topics')
print(max(co))


Coherence Score:  0.6666668149451039 with 3 topics

Coherence Score:  0.6773449112717861 with 6 topics

Coherence Score:  0.6893040279020851 with 9 topics

Coherence Score:  0.6736481552595103 with 12 topics

Coherence Score:  0.6760151670625484 with 15 topics

Coherence Score:  0.6399090394908489 with 18 topics

Coherence Score:  0.630328597396334 with 21 topics

Coherence Score:  0.6267182558654087 with 24 topics

Coherence Score:  0.6067635849505998 with 27 topics
{3: 0.6666668149451039, 6: 0.6773449112717861, 9: 0.6893040279020851, 12: 0.6736481552595103, 15: 0.6760151670625484, 18: 0.6399090394908489, 21: 0.630328597396334, 24: 0.6267182558654087, 27: 0.6067635849505998}


In [80]:
def max_dic(d): 
  maxim = 0
  ma = 0
  for key in d.keys():
    if d[key] > maxim:
      maxim = d[key]
      ma = key
  return ma
print("The number of topics with the highest coherence value associated is", max_dic(co))

The number of topics with the highest coherence value associated is 9




---


---



---



---




ATTEMPT 2 : Hierarchical Dirichlet Process

In [None]:
from gensim.test.utils import common_corpus, common_dictionary



---


---



---



---




In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [None]:
########################################
## function for attempt 1 ##
## DOES NOT RUN ##
import numpy as np
import tqdm
grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
               gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75), 
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()



---

