# CMPE 351 Exercise 3

Analysis will be done on the title.


## Setting up

In [1]:
# Resolving paths in a platform agnostic way.
import logging
import multiprocessing
import pickle
from os.path import dirname, join, realpath
from pathlib import Path
from string import punctuation
from time import time

import nltk
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim.corpora import Dictionary
from gensim.models import Word2Vec
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem.porter import PorterStemmer
from spacy.lang.en import English

  from imp import reload
  from scipy.linalg.special_matrices import triu


In [2]:
def is_interactive():
    """Check if the script is being run interactively."""
    import __main__ as main

    return not hasattr(main, "__file__")


if is_interactive():
    SCRIPT_DIR = dirname(realpath("__file__"))
else:
    SCRIPT_DIR = dirname(realpath(__file__))

DATA_DIR = join(SCRIPT_DIR, "data")
MODELS_DIR = join(SCRIPT_DIR, "models")
Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)

In [3]:
raw_questions_df = pd.read_csv(join(DATA_DIR, "questions.csv"))
raw_questions_df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,77434,14008.0,2008-09-16T21:40:29Z,171,How to access the last value in a vector?,<p>Suppose I have a vector that is nested in a...
1,79709,,2008-09-17T03:39:16Z,3,Worse sin: side effects or passing massive obj...,<p>I have a function inside a loop inside a fu...
2,95007,15842.0,2008-09-18T17:59:19Z,56,Explain the quantile() function in R,<p>I've been mystified by the R quantile funct...
3,103312,,2008-09-19T16:09:26Z,4,How to test for the EOF flag in R?,<p>How can I test for the <code>EOF</code> fla...
4,255697,1941213.0,2008-11-01T15:48:30Z,4,Is there an R package for learning a Dirichlet...,<p>I'm looking for a an <code>R</code> package...


In [4]:
pd.set_option("display.max_colwidth", None)

In [5]:
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package stopwords to /home/bryan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/bryan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/bryan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/bryan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

In [7]:
stop_words = set(stopwords.words("english"))

## Data Preprocessing

In [8]:
question_titles_df = raw_questions_df["Title"]
question_titles_df.head()

0                                How to access the last value in a vector?
1                      Worse sin: side effects or passing massive objects?
2                                     Explain the quantile() function in R
3                                       How to test for the EOF flag in R?
4    Is there an R package for learning a Dirichlet prior from counts data
Name: Title, dtype: object

Lower casing the text.

In [9]:
question_titles_df = question_titles_df.str.lower()
question_titles_df.head()

0                                how to access the last value in a vector?
1                      worse sin: side effects or passing massive objects?
2                                     explain the quantile() function in r
3                                       how to test for the eof flag in r?
4    is there an r package for learning a dirichlet prior from counts data
Name: Title, dtype: object

Removing Punctuation

In [10]:
PUNCTUATION_TO_REMOVE = punctuation


def remove_punctuation(text: str):
    """Remove punctuation form a string."""
    return text.translate(str.maketrans("", "", PUNCTUATION_TO_REMOVE))


question_titles_df = question_titles_df.apply(remove_punctuation)
question_titles_df.head()

0                                 how to access the last value in a vector
1                        worse sin side effects or passing massive objects
2                                       explain the quantile function in r
3                                        how to test for the eof flag in r
4    is there an r package for learning a dirichlet prior from counts data
Name: Title, dtype: object

Removing stop words

In [11]:
def remove_stopwords(text: str):
    """Remove stopwords from a string."""
    return " ".join(
        [word for word in str(text).split() if word not in stop_words]
    )


question_titles_df = question_titles_df.apply(remove_stopwords)
question_titles_df.head()

0                          access last value vector
1    worse sin side effects passing massive objects
2                       explain quantile function r
3                                   test eof flag r
4    r package learning dirichlet prior counts data
Name: Title, dtype: object

Stemming the words.

In [12]:
stemmer = PorterStemmer()


def stem_words(text: str):
    """Stem words in a string."""
    return " ".join([stemmer.stem(word) for word in text.split()])


question_titles_df = question_titles_df.apply(stem_words)
question_titles_df.head()

0                      access last valu vector
1      wors sin side effect pass massiv object
2                   explain quantil function r
3                              test eof flag r
4    r packag learn dirichlet prior count data
Name: Title, dtype: object

Tokenizing the text.

In [13]:
question_titles_df = question_titles_df.apply(nltk.word_tokenize)
question_titles_df.head()

0                         [access, last, valu, vector]
1      [wors, sin, side, effect, pass, massiv, object]
2                      [explain, quantil, function, r]
3                                 [test, eof, flag, r]
4    [r, packag, learn, dirichlet, prior, count, data]
Name: Title, dtype: object

## Learning word vectors from text corpus

In [14]:
cores = multiprocessing.cpu_count()
w2v_model = Word2Vec(
    min_count=1,
    workers=cores - 1,
)

INFO - 21:55:54: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=100, alpha=0.025)', 'datetime': '2022-03-31T21:55:54.732947', 'gensim': '4.1.2', 'python': '3.9.11 (main, Mar 22 2022, 10:11:10) \n[GCC 9.3.0]', 'platform': 'Linux-5.10.102.1-microsoft-standard-WSL2-x86_64-with-glibc2.31', 'event': 'created'}


In [15]:
t = time()
w2v_model.build_vocab(question_titles_df, progress_per=10000)
print("Time to build vocab: {} mins".format(round((time() - t) / 60, 2)))

INFO - 21:55:55: collecting all words and their counts
INFO - 21:55:55: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 21:55:55: PROGRESS: at sentence #10000, processed 55626 words, keeping 5288 word types
INFO - 21:55:55: PROGRESS: at sentence #20000, processed 111887 words, keeping 7900 word types
INFO - 21:55:55: PROGRESS: at sentence #30000, processed 169222 words, keeping 10051 word types
INFO - 21:55:55: PROGRESS: at sentence #40000, processed 227644 words, keeping 12035 word types
INFO - 21:55:55: PROGRESS: at sentence #50000, processed 286032 words, keeping 13767 word types
INFO - 21:55:55: PROGRESS: at sentence #60000, processed 345177 words, keeping 15411 word types
INFO - 21:55:55: PROGRESS: at sentence #70000, processed 404624 words, keeping 17002 word types
INFO - 21:55:55: PROGRESS: at sentence #80000, processed 464510 words, keeping 18534 word types
INFO - 21:55:55: PROGRESS: at sentence #90000, processed 524644 words, keeping 20032 word types
I

Time to build vocab: 0.06 mins


In [16]:
t = time()
w2v_model.train(
    question_titles_df,
    total_examples=w2v_model.corpus_count,
    epochs=1,
    report_delay=1,
)
print("Time to train the model: {} mins".format(round((time() - t) / 60, 2)))

INFO - 21:55:58: Word2Vec lifecycle event {'msg': 'training model with 11 workers on 33811 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2022-03-31T21:55:58.933038', 'gensim': '4.1.2', 'python': '3.9.11 (main, Mar 22 2022, 10:11:10) \n[GCC 9.3.0]', 'platform': 'Linux-5.10.102.1-microsoft-standard-WSL2-x86_64-with-glibc2.31', 'event': 'train'}
INFO - 21:56:00: EPOCH 1 - PROGRESS: at 36.43% examples, 278134 words/s, in_qsize 18, out_qsize 3
INFO - 21:56:01: EPOCH 1 - PROGRESS: at 86.04% examples, 356421 words/s, in_qsize 17, out_qsize 0
INFO - 21:56:01: worker thread finished; awaiting finish of 10 more threads
INFO - 21:56:01: worker thread finished; awaiting finish of 9 more threads
INFO - 21:56:01: worker thread finished; awaiting finish of 8 more threads
INFO - 21:56:01: worker thread finished; awaiting finish of 7 more threads
INFO - 21:56:01: worker thread finished; awaiting finish of 6 more threads
INFO - 21:56:01:

Time to train the model: 0.04 mins


In [17]:
w2v_model.save(join(MODELS_DIR, "word2vec.model"))

INFO - 21:56:01: Word2Vec lifecycle event {'fname_or_handle': '/home/bryan/git/github.com/bryan-hoang/cmpe-351/exercises/exer_3/models/word2vec.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-03-31T21:56:01.924323', 'gensim': '4.1.2', 'python': '3.9.11 (main, Mar 22 2022, 10:11:10) \n[GCC 9.3.0]', 'platform': 'Linux-5.10.102.1-microsoft-standard-WSL2-x86_64-with-glibc2.31', 'event': 'saving'}
INFO - 21:56:01: not storing attribute cum_table
INFO - 21:56:02: saved /home/bryan/git/github.com/bryan-hoang/cmpe-351/exercises/exer_3/models/word2vec.model


## Topic Modelling

In [18]:
question_titles_df = raw_questions_df["Title"]
question_titles_df.head()

0                                How to access the last value in a vector?
1                      Worse sin: side effects or passing massive objects?
2                                     Explain the quantile() function in R
3                                       How to test for the EOF flag in R?
4    Is there an R package for learning a Dirichlet prior from counts data
Name: Title, dtype: object

In [19]:
parser = English()


def tokenize(text: str):
    lda_tokens: list[str] = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        if token.like_url:
            lda_tokens.append("URL")
        elif token.orth_.startswith("@"):
            lda_tokens.append("SCREEN_NAME")
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [20]:
def get_lemma(word: str):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    return lemma

In [21]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [22]:
tokenized_question_titles = question_titles_df.apply(prepare_text_for_lda)
tokenized_question_titles.head()

0                         [access, value, vector]
1      [worse, effects, passing, massive, object]
2                   [explain, quantile, function]
3                                              []
4    [package, learning, dirichlet, prior, count]
Name: Title, dtype: object

In [23]:
dictionary = Dictionary(documents=tokenized_question_titles)
corpus = tokenized_question_titles.apply(dictionary.doc2bow)
pickle.dump(corpus, open(join(MODELS_DIR, "corpus.pkl"), "wb"))
dictionary.save(join(MODELS_DIR, "dictionary.gensim"))

INFO - 21:56:59: adding document #0 to Dictionary(0 unique tokens: [])
INFO - 21:57:00: adding document #10000 to Dictionary(4376 unique tokens: ['access', 'value', 'vector', 'effects', 'massive']...)
INFO - 21:57:00: adding document #20000 to Dictionary(6316 unique tokens: ['access', 'value', 'vector', 'effects', 'massive']...)
INFO - 21:57:00: adding document #30000 to Dictionary(7876 unique tokens: ['access', 'value', 'vector', 'effects', 'massive']...)
INFO - 21:57:00: adding document #40000 to Dictionary(9372 unique tokens: ['access', 'value', 'vector', 'effects', 'massive']...)
INFO - 21:57:01: adding document #50000 to Dictionary(10665 unique tokens: ['access', 'value', 'vector', 'effects', 'massive']...)
INFO - 21:57:01: adding document #60000 to Dictionary(11932 unique tokens: ['access', 'value', 'vector', 'effects', 'massive']...)
INFO - 21:57:01: adding document #70000 to Dictionary(13098 unique tokens: ['access', 'value', 'vector', 'effects', 'massive']...)
INFO - 21:57:01:

> Briefly specify how you [picked] the number of topics.

I picked 5 as the number of topics because it was the first number used in [this tutorial I followed](https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21#afa6).

In [24]:
NUM_TOPICS = 5
ldamodel = LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary)
ldamodel.save(join(MODELS_DIR, "lda_model_5.gensim"))
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

INFO - 21:57:06: using symmetric alpha at 0.2
INFO - 21:57:06: using symmetric eta at 0.2
INFO - 21:57:06: using serial LDA version on this node
INFO - 21:57:06: running online (single-pass) LDA training, 5 topics, 1 passes over the supplied corpus of 189930 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
INFO - 21:57:06: PROGRESS: pass 0, at document #2000/189930
INFO - 21:57:07: merging changes from 2000 documents into a model of 189930 documents
INFO - 21:57:07: topic #0 (0.200): 0.046*"column" + 0.043*"using" + 0.023*"variable" + 0.016*"function" + 0.011*"ggplot2" + 0.011*"label" + 0.010*"command" + 0.010*"create" + 0.009*"multiple" + 0.009*"plot"
INFO - 21:57:07: topic #1 (0.200): 0.030*"vector" + 0.024*"package" + 0.022*"frame" + 0.019*"error" + 0.018*"using" + 0.018*"plot" + 0.018*"multiple" + 0.018*"function" + 0.014*"column" + 0.013*"number"
INFO - 21:57:07: topic #2 (0.200

(0, '0.098*"column" + 0.040*"values" + 0.040*"frame" + 0.035*"multiple"')
(1, '0.080*"error" + 0.056*"shiny" + 0.041*"package" + 0.028*"using"')
(2, '0.039*"ggplot2" + 0.033*"ggplot" + 0.029*"plot" + 0.023*"using"')
(3, '0.028*"model" + 0.024*"calculate" + 0.023*"using" + 0.021*"match"')
(4, '0.091*"function" + 0.036*"using" + 0.025*"dplyr" + 0.025*"object"')


## Summmary

In [25]:
dictionary = Dictionary.load(join(MODELS_DIR, "dictionary.gensim"))
corpus = pickle.load(open(join(MODELS_DIR, "corpus.pkl"), "rb"))
lda = LdaModel.load(join(MODELS_DIR, "lda_model_5.gensim"))
lda_display = gensimvis.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

INFO - 21:58:51: loading Dictionary object from /home/bryan/git/github.com/bryan-hoang/cmpe-351/exercises/exer_3/models/dictionary.gensim
INFO - 21:58:51: Dictionary lifecycle event {'fname': '/home/bryan/git/github.com/bryan-hoang/cmpe-351/exercises/exer_3/models/dictionary.gensim', 'datetime': '2022-03-31T21:58:51.467828', 'gensim': '4.1.2', 'python': '3.9.11 (main, Mar 22 2022, 10:11:10) \n[GCC 9.3.0]', 'platform': 'Linux-5.10.102.1-microsoft-standard-WSL2-x86_64-with-glibc2.31', 'event': 'loaded'}
INFO - 21:58:52: loading LdaModel object from /home/bryan/git/github.com/bryan-hoang/cmpe-351/exercises/exer_3/models/lda_model_5.gensim
INFO - 21:58:52: loading expElogbeta from /home/bryan/git/github.com/bryan-hoang/cmpe-351/exercises/exer_3/models/lda_model_5.gensim.expElogbeta.npy with mmap=None
INFO - 21:58:52: setting ignored attribute id2word to None
INFO - 21:58:52: setting ignored attribute state to None
INFO - 21:58:52: setting ignored attribute dispatcher to None
INFO - 21:58:5