# Topic Modeling

In [1]:
from tqdm.notebook import tqdm
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
import gensim.corpora as corpora
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

from utilities import get_nodeid2text

tqdm.pandas()

In [2]:
# configure stop words (words we exclude)
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/benlimpa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# load raw title + abstract data
nodeid2text, (train_idx, valid_idx, test_idx) = get_nodeid2text()

In [4]:
# preprocess the text to split into words/tokens
nodeid2text = nodeid2text.assign(
    words=nodeid2text["text"].progress_apply(
        lambda mystr: gensim.utils.simple_preprocess(mystr, deacc=True)
    )
)

  0%|          | 0/169343 [00:00<?, ?it/s]

In [5]:
# remove stop words from our list of words for each paper
def remove_stopwords(words):
    return [word for word in words if word not in stop_words]


nodeid2text = nodeid2text.assign(
    words_clean=nodeid2text["words"].progress_apply(remove_stopwords)
)

  0%|          | 0/169343 [00:00<?, ?it/s]

In [6]:
# create a dictionary of words from our training set
# this maps token ids to unique words
id2word = corpora.Dictionary(nodeid2text["words_clean"].loc[train_idx])

In [7]:
# Create a corpus of words from the filtered list of words for each paper. A
# corpus is a list of tuples with the token id and the number of times the token
# appears
corpus = [id2word.doc2bow(text) for text in nodeid2text.loc[train_idx]["words_clean"]]

In [8]:
# create a latent dirichlet allocation model from our training data
# we arbitarily set the number of topics to be 10
lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=40)

In [9]:
# now we apply this model to our entire dataset
gammas, _ = lda_model.inference(
    [id2word.doc2bow(text) for text in nodeid2text["words_clean"]]
)

In [10]:
# train a classifier on the output of the LDA model
#scaled_gammas = preprocessing.StandardScaler().fit_transform(gammas)
logistic_clf = LogisticRegression(random_state=0).fit(
    gammas[train_idx], nodeid2text.iloc[train_idx]["label"]
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# test the model
train_acc = logistic_clf.score(gammas[train_idx], nodeid2text.iloc[train_idx]["label"])
valid_acc = logistic_clf.score(gammas[valid_idx], nodeid2text.iloc[valid_idx]["label"])
test_acc = logistic_clf.score(gammas[test_idx], nodeid2text.iloc[test_idx]["label"])
print(f"Training Accuracy: {train_acc}")
print(f"Validation Accuracy: {valid_acc}")
print(f"Test Accuracy: {test_acc}")

Training Accuracy: 0.5067021475462112
Validation Accuracy: 0.5224000805396154
Test Accuracy: 0.5052774520091352
