# CMPE 351 Exercise 3

The analysis is performed on the question titles only (the `Title` column of the dataset).


## Setting up

In [1]:
# Resolving paths in a platform agnostic way.
import multiprocessing
import pickle
from os.path import dirname, join, realpath
from pathlib import Path
from string import punctuation

import nltk
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim.corpora import Dictionary
from gensim.models import Word2Vec
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem.porter import PorterStemmer
from spacy.lang.en import English

  from imp import reload
  from scipy.linalg.special_matrices import triu


In [2]:
def is_interactive():
    """Report whether the code runs without a backing script file.

    Returns True when the ``__main__`` module has no ``__file__`` attribute
    and False otherwise.
    """
    import __main__

    return not hasattr(__main__, "__file__")


# Without a script file there is no ``__file__`` name, so the literal string
# "__file__" is resolved as a relative path instead; its parent directory is
# then used as SCRIPT_DIR. Otherwise the real script location is used.
SCRIPT_DIR = dirname(realpath("__file__" if is_interactive() else __file__))

# Input data and saved models live next to the notebook/script.
DATA_DIR = join(SCRIPT_DIR, "data")
MODELS_DIR = join(SCRIPT_DIR, "models")

# Create the models directory up front so later save calls have a target.
Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)

In [3]:
# Read the questions CSV from DATA_DIR and preview the first rows.
raw_questions_df = pd.read_csv(Path(DATA_DIR) / "questions.csv")
raw_questions_df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,77434,14008.0,2008-09-16T21:40:29Z,171,How to access the last value in a vector?,<p>Suppose I have a vector that is nested in a...
1,79709,,2008-09-17T03:39:16Z,3,Worse sin: side effects or passing massive obj...,<p>I have a function inside a loop inside a fu...
2,95007,15842.0,2008-09-18T17:59:19Z,56,Explain the quantile() function in R,<p>I've been mystified by the R quantile funct...
3,103312,,2008-09-19T16:09:26Z,4,How to test for the EOF flag in R?,<p>How can I test for the <code>EOF</code> fla...
4,255697,1941213.0,2008-11-01T15:48:30Z,4,Is there an R package for learning a Dirichlet...,<p>I'm looking for a an <code>R</code> package...


In [4]:
# Never truncate cell contents when displaying, so full titles stay readable.
pd.options.display.max_colwidth = None

In [5]:
# Fetch the NLTK resources requested by this notebook, in the original order.
for nltk_resource in ("stopwords", "punkt", "wordnet", "omw-1.4"):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to /home/bryan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/bryan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/bryan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/bryan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# Held as a set so the per-word membership tests in later cells are cheap.
stop_words = {*stopwords.words("english")}

## Data Preprocessing

In [7]:
# Only the Title column is analysed; despite the ``_df`` suffix this is a Series.
question_titles_df = raw_questions_df.loc[:, "Title"]
question_titles_df.head()

0                                How to access the last value in a vector?
1                      Worse sin: side effects or passing massive objects?
2                                     Explain the quantile() function in R
3                                       How to test for the EOF flag in R?
4    Is there an R package for learning a Dirichlet prior from counts data
Name: Title, dtype: object

Lower casing the text.

In [8]:
# Lower-case every title through the vectorised ``.str`` accessor. The same
# name is reassigned, which is harmless on re-run because lower-casing is
# idempotent.
question_titles_df = question_titles_df.str.lower()
question_titles_df.head()

0                                how to access the last value in a vector?
1                      worse sin: side effects or passing massive objects?
2                                     explain the quantile() function in r
3                                       how to test for the eof flag in r?
4    is there an r package for learning a dirichlet prior from counts data
Name: Title, dtype: object

Removing Punctuation

In [9]:
PUNCTUATION_TO_REMOVE = punctuation

# Built once at definition time: the table depends only on the constant above,
# so rebuilding it for every title inside ``.apply`` was wasted work.
_PUNCTUATION_TABLE = str.maketrans("", "", PUNCTUATION_TO_REMOVE)


def remove_punctuation(text: str) -> str:
    """Remove punctuation from a string.

    Every character listed in ``PUNCTUATION_TO_REMOVE`` is deleted; all other
    characters, including whitespace, are left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all punctuation characters removed.
    """
    return text.translate(_PUNCTUATION_TABLE)


# Strip punctuation from each title, one element at a time.
question_titles_df = question_titles_df.map(remove_punctuation)
question_titles_df.head()

0                                 how to access the last value in a vector
1                        worse sin side effects or passing massive objects
2                                       explain the quantile function in r
3                                        how to test for the eof flag in r
4    is there an r package for learning a dirichlet prior from counts data
Name: Title, dtype: object

Removing stop words

In [10]:
def remove_stopwords(text: str):
    """Drop every whitespace-separated word that is in ``stop_words``.

    The input is coerced with ``str()`` before splitting, and the surviving
    words are re-joined with single spaces.
    """
    kept_words = filter(
        lambda candidate: candidate not in stop_words, str(text).split()
    )
    return " ".join(kept_words)


# Filter the stop words out of each title.
question_titles_df = question_titles_df.map(remove_stopwords)
question_titles_df.head()

0                          access last value vector
1    worse sin side effects passing massive objects
2                       explain quantile function r
3                                   test eof flag r
4    r package learning dirichlet prior counts data
Name: Title, dtype: object

Stemming the words.

In [11]:
stemmer = PorterStemmer()


def stem_words(text: str):
    """Porter-stem each whitespace-separated word and re-join with spaces."""
    stemmed_words = map(stemmer.stem, text.split())
    return " ".join(stemmed_words)


# Reduce every word of every title to its stem.
question_titles_df = question_titles_df.map(stem_words)
question_titles_df.head()

0                      access last valu vector
1      wors sin side effect pass massiv object
2                   explain quantil function r
3                              test eof flag r
4    r packag learn dirichlet prior count data
Name: Title, dtype: object

Tokenizing the text.

In [12]:
# Turn each cleaned title string into a list of tokens.
question_titles_df = question_titles_df.map(nltk.word_tokenize)
question_titles_df.head()

0                         [access, last, valu, vector]
1      [wors, sin, side, effect, pass, massiv, object]
2                      [explain, quantil, function, r]
3                                 [test, eof, flag, r]
4    [r, packag, learn, dirichlet, prior, count, data]
Name: Title, dtype: object

## Learning word vectors from text corpus

In [13]:
# Leave one core free, but never request fewer than one worker: on a
# single-core machine ``cpu_count() - 1`` is 0, which would leave Word2Vec
# with no training threads.
cores = multiprocessing.cpu_count()
model = Word2Vec(
    sentences=question_titles_df,  # one token list per title
    min_count=1,  # keep every word, even those seen only once
    workers=max(1, cores - 1),
)
model.save(join(MODELS_DIR, "word2vec.model"))

## Topic Modelling

In [14]:
# Start again from the raw titles: topic modelling uses its own preprocessing
# rather than the stemmed tokens prepared for Word2Vec.
question_titles_df = raw_questions_df.loc[:, "Title"]
question_titles_df.head()

0                                How to access the last value in a vector?
1                      Worse sin: side effects or passing massive objects?
2                                     Explain the quantile() function in R
3                                       How to test for the EOF flag in R?
4    Is there an R package for learning a Dirichlet prior from counts data
Name: Title, dtype: object

In [15]:
parser = English()


def tokenize(text: str):
    """Split ``text`` into normalised tokens using ``parser``.

    Whitespace-only tokens are skipped. URL-like tokens are replaced by
    ``"URL"``, tokens starting with ``@`` by ``"SCREEN_NAME"``, and every
    other token is lower-cased.
    """

    def _normalise(token):
        # Placeholder substitutions come first; plain tokens fall through.
        if token.like_url:
            return "URL"
        if token.orth_.startswith("@"):
            return "SCREEN_NAME"
        return token.lower_

    return [
        _normalise(token)
        for token in parser(text)
        if not token.orth_.isspace()
    ]

In [16]:
def get_lemma(word: str):
    """Return the ``wn.morphy`` base form of ``word``.

    Falls back to ``word`` itself when ``wn.morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

In [17]:
def prepare_text_for_lda(text):
    """Tokenise ``text`` and keep only lemmatised, informative tokens.

    A token survives when it is longer than four characters and is not in
    ``stop_words``; survivors are passed through ``get_lemma``. Note that the
    length filter also discards the three-character ``"URL"`` placeholder
    produced by ``tokenize``.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in stop_words
    ]

In [18]:
# One list of LDA-ready tokens per title.
tokenized_question_titles = question_titles_df.map(prepare_text_for_lda)
tokenized_question_titles.head()

0                         [access, value, vector]
1      [worse, effects, passing, massive, object]
2                   [explain, quantile, function]
3                                              []
4    [package, learning, dirichlet, prior, count]
Name: Title, dtype: object

In [19]:
# Build the token dictionary and convert every title with ``doc2bow``.
dictionary = Dictionary(documents=tokenized_question_titles)
corpus = tokenized_question_titles.apply(dictionary.doc2bow)

# Persist both artefacts. The context manager guarantees the pickle file is
# flushed and closed before a later cell reads it back; the previous bare
# ``open(...)`` left the handle to be closed by the garbage collector.
with open(join(MODELS_DIR, "corpus.pkl"), "wb") as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save(join(MODELS_DIR, "dictionary.gensim"))

> Briefly specify how you [picked] the number of topics.

I picked 5 as the number of topics because it was the first number used in [this tutorial I followed](https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21#afa6).

In [20]:
NUM_TOPICS = 5
# LDA training is stochastic; fixing the seed makes the topics reproducible
# under Restart & Run All.
LDA_RANDOM_STATE = 42

ldamodel = LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    random_state=LDA_RANDOM_STATE,
)
ldamodel.save(join(MODELS_DIR, "lda_model_5.gensim"))

# Show the four highest-weighted words of each topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.100*"column" + 0.052*"values" + 0.033*"value" + 0.031*"frame"')
(1, '0.054*"shiny" + 0.045*"create" + 0.044*"error" + 0.043*"using"')
(2, '0.065*"multiple" + 0.039*"ggplot2" + 0.035*"ggplot" + 0.029*"plot"')
(3, '0.084*"variable" + 0.030*"group" + 0.026*"using" + 0.026*"calculate"')
(4, '0.083*"function" + 0.030*"error" + 0.025*"using" + 0.021*"object"')


## Summary

In [21]:
# Reload the persisted artefacts so this cell can run without retraining.
dictionary = Dictionary.load(join(MODELS_DIR, "dictionary.gensim"))
# The pickle was written by this notebook; never unpickle untrusted files.
# The context manager closes the handle that a bare ``open(...)`` leaked.
with open(join(MODELS_DIR, "corpus.pkl"), "rb") as corpus_file:
    corpus = pickle.load(corpus_file)
lda = LdaModel.load(join(MODELS_DIR, "lda_model_5.gensim"))

# sort_topics=False is passed so pyLDAvis does not reorder the topics.
lda_display = gensimvis.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
