In [1]:
# Import necessary libraries. 
import pandas as pd
import numpy as np
from datetime import datetime
import json
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import random
import scipy.sparse
from datasets import load_dataset, Dataset
import gensim
from collections import Counter
from textblob import TextBlob


In [2]:
# Load the pre-processed noun tokens from the Hugging Face Hub.
import_data = load_dataset("bartoszmaj/nouns_full")
# Only the "nouns" column of the "train" split is used; it is wrapped in a
# single-column DataFrame and then flattened back into a plain Python list.
train_nouns = import_data["train"]["nouns"]
import_df = pd.DataFrame({"body": train_nouns})
nouns_processed = import_df["body"].to_list()

Found cached dataset parquet (/home/ec22283/.cache/huggingface/datasets/bartoszmaj___parquet/bartoszmaj--nouns_full-0772e1f339b3eb33/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
# Imports gensim models
from gensim import models

In [4]:
# Generates the gensim dictionary from the processed comments. It is reused
# below for the bag-of-words conversion and as the LDA model's id2word mapping.
dictionary = gensim.corpora.Dictionary(nouns_processed)

In [5]:
# Converts each comment to gensim bag-of-words format, keeping the original
# document order (later cells rely on corpus[i] matching nouns_processed[i]).
corpus = []
for tokens in tqdm(nouns_processed):
    corpus.append(dictionary.doc2bow(tokens))

  0%|          | 0/4600698 [00:00<?, ?it/s]

In [6]:
# Imports visualisation tool 
import pyLDAvis.gensim_models

In [7]:
# Enables inline display of pyLDAvis visualisations in the notebook.
pyLDAvis.enable_notebook()

In [8]:
# Sets up logging to display gensim progress (timestamp : level : message).
import logging
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

The following resource was used to inform the methodology and code used for topic modelling in Gensim: https://radimrehurek.com/gensim/wiki.html

In [9]:
# Trains the LDA topic model on the bag-of-words corpus.
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(
    corpus=corpus,
    id2word=dictionary,
    num_topics=7,       # number of topics to fit
    chunksize=10000,    # documents per training chunk
    update_every=1,     # update the model after every chunk (online training)
    passes=1,           # a single pass over the supplied corpus
    iterations=200,     # iteration cap, as reported in the training log
    eval_every=None,    # no perplexity evaluation during training
    random_state=42,    # fixed seed so the run is repeatable
)

2023-08-23 04:52:37,373 : INFO : using symmetric alpha at 0.14285714285714285
2023-08-23 04:52:37,374 : INFO : using symmetric eta at 0.14285714285714285
2023-08-23 04:52:37,377 : INFO : using serial LDA version on this node
2023-08-23 04:52:37,399 : INFO : running online (single-pass) LDA training, 7 topics, 1 passes over the supplied corpus of 4600698 documents, updating model once every 10000 documents, evaluating perplexity every 0 documents, iterating 200x with a convergence threshold of 0.001000
2023-08-23 04:52:37,403 : INFO : PROGRESS: pass 0, at document #10000/4600698
2023-08-23 04:52:43,255 : INFO : merging changes from 10000 documents into a model of 4600698 documents
2023-08-23 04:52:43,302 : INFO : topic #3 (0.143): 0.013*"country" + 0.013*"world" + 0.012*"thing" + 0.011*"issue" + 0.009*"government" + 0.009*"year" + 0.008*"point" + 0.007*"way" + 0.007*"emission" + 0.007*"look"
2023-08-23 04:52:43,304 : INFO : topic #4 (0.143): 0.011*"time" + 0.010*"year" + 0.008*"problem"

In [10]:
# Prepares the pyLDAvis visualisation data for the trained topics.
# NOTE(review): pyLDAvis numbers topics from 1 and, by default, appears to
# order them by prevalence (sort_topics=True), so its topic numbers need not
# line up with gensim's 0-based topic ids - confirm before cross-referencing.
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)

In [11]:
# Displays the pyLDAvis visualisation as the cell output.
vis

In [12]:
# Saves lda.model 
#ldamodel.save('lda.ldamodel')

In [13]:
# Alias for the prepared pyLDAvis data; the relevance ranking below reads
# its topic_info table through this name.
topic_data = vis

In [14]:
# Shows the top terms for each topic, ranked by relevance at a given lambda:
#     relevance = lambd * logprob + (1 - lambd) * loglift
# so a lower lambda gives more weight to a term's lift within the topic and
# less to its raw probability.
# NOTE(review): the 'Topic<i>' categories are pyLDAvis's 1-based topic numbers;
# presumably they do not correspond one-to-one to gensim's 0-based topic ids -
# confirm before matching this table against per-document topic labels.
all_topics = {}
num_terms = 10
lambd = 0.35
# The topic count is taken from the model rather than hard-coded, so this cell
# stays correct if num_topics is changed in the training cell.
for i in range(1, ldamodel.num_topics + 1):
    category = f'Topic{i}'
    topic = topic_data.topic_info[topic_data.topic_info.Category == category].copy()
    topic['relevance'] = topic['loglift'] * (1 - lambd) + topic['logprob'] * lambd
    all_topics[f'Topic {i}'] = (
        topic.sort_values(by='relevance', ascending=False).Term[:num_terms].values
    )

The code used for implementing the lambda value comes from the following source: https://nicharuc.github.io/topic_modeling/ 

In [15]:
# Displaying the topic words: one row per topic, one column per term rank.
pd.DataFrame(all_topics).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic 1,world,population,food,life,society,war,meat,death,capitalism,humanity
Topic 2,energy,oil,cost,fuel,company,industry,market,carbon,coal,emission
Topic 3,party,vote,president,policy,trump,election,law,obama,woman,state
Topic 4,thing,argument,fact,point,man,isn,opinion,debate,way,problem
Topic 5,temperature,ice,weather,water,sea,year,model,heat,earth,trend
Topic 6,science,comment,reddit,article,consensus,scientist,news,study,paper,research
Topic 7,game,movie,day,week,hour,story,book,night,moon,film


In [16]:
#ldamodel = models.ldamodel.LdaModel.load('/home/ec22283/myenv01/lib/python3.9/site-packages/lda.ldamodel')

In [17]:

# Spot-check: topic distribution of a single document (corpus[1]).
# `t` is not referenced again in the cells shown here.
t = ldamodel.get_document_topics(bow = corpus[1])

In [18]:
# Generates the topic probabilities for each document in the corpus.
# NOTE: get_document_topics returns (topic_id, probability) pairs and leaves
# out topics whose probability is below the model's minimum_probability, so a
# document's list can be shorter than the number of topics and a pair's
# position in the list is not necessarily its topic id.
topic_probabilities = []
for bow in tqdm(corpus):
    topic_probabilities.append(ldamodel.get_document_topics(bow))

  0%|          | 0/4600698 [00:00<?, ?it/s]

In [19]:
# Get the dominant topic for each document: the topic id whose probability is
# the largest in that document's distribution.
#
# Each distribution is a list of (topic_id, probability) pairs, and gensim
# omits topics below the minimum probability. The position of the largest
# probability in the list is therefore NOT the topic id in general, e.g.
# [(1, 0.30), (4, 0.69)] must yield topic 4, not list position 1. The id is
# read from the pair itself. Ties resolve to the first pair listed.
# The stored values are gensim's 0-based topic ids.
topics = [
    max(distribution, key=lambda pair: pair[1])[0]
    for distribution in tqdm(topic_probabilities)
]

  0%|          | 0/4600698 [00:00<?, ?it/s]

In [20]:
# Token count of each of the first 1000 processed comments.
process_lengths = [len(nouns_processed[idx]) for idx in range(1000)]

In [21]:
# Token count of each of the first 1000 comments in the corpus format:
# the sum of the per-token counts in each bag-of-words document.
corpus_lengths = [
    sum(count for _, count in corpus[idx])
    for idx in range(1000)
]

In [22]:
# Compare the two lists as a sanity check that the corpus preserved the order of the comments in the processed list.
# Only the token counts of the first 1000 documents are compared, so this is a spot check rather than a full proof.
process_lengths == corpus_lengths

True

In [23]:
# Number of topic labels produced; there should be one per document in the corpus.
len(topics)

4600698

In [24]:
# Loads the comments table that the topic labels are attached to.
# NOTE(review): assumes comments.csv has one row per document, in the same order as nouns_processed - confirm.
comments_df = pd.read_csv("comments.csv")

In [25]:
# Add topics to the comments dataset. The list is assigned positionally
# (row i receives topics[i]), so the row order must match the corpus order.
comments_df["topics"] = topics

In [26]:
# Export to huggingface. 
#topic_subset = Dataset.from_pandas(comments_df)
#topic_subset.push_to_hub("bartoszmaj/topics_labelled")

In [27]:
#import_data = load_dataset("bartoszmaj/topics_labelled")

In [28]:
#import_data