In [133]:
# Code heavily referenced from TDS posts on LDA
# https://www.kaggle.com/ykhorramz/lda-and-t-sne-interactive-visualization/execution
# Gensim Sample Code

In [134]:
# Imports

from sklearn.feature_extraction.text import CountVectorizer
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from datetime import datetime
import nltk
nltk.download('stopwords')
import pandas as pd
import re
import math

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [135]:
!pip install tweet-preprocessor
!pip install icecream
import icecream as ic
import preprocessor as p



In [136]:
# Load the labelled tweets from disk.
tweets = pd.read_csv("labeled_tweets.csv")

# Lower-case and clean only the rows that actually contain text. Missing
# values are skipped here so they never reach p.clean (a text cleaner;
# presumably it does not accept NaN -- TODO confirm), and they still end up
# as the "NA" placeholder below.
has_text = tweets["text"].notna()
if has_text.any():
    tweets.loc[has_text, "text"] = (
        tweets.loc[has_text, "text"].str.lower().apply(p.clean)
    )

# Replace every remaining missing value (text and label columns alike) with
# the "NA" placeholder. Rebinding instead of inplace=True keeps the cell
# easy to re-run.
tweets = tweets.fillna("NA")

In [137]:
# tweets = tweets[tweets.label1 == "politics"] # Subsetting to check for change in output

In [138]:
# One entry per tweet, in row order, taken from the cleaned text column.
docs = list(tweets["text"])

In [139]:
docs[:5]  # preview the first few tweet texts instead of dumping the whole list

['while athletes and other attendees at this summers tokyo olympic games will not be required to be vaccinated or to quarantine on arrival, they will be subject to severe restrictions on movement and socializing.',
 'the family of a military veteran who died in after he was bitten all over his body by fire ants while at a veterans affairs facility in atlanta filed a wrongful-death lawsuit this week against the u.s. government and a pest control company.',
 'a state lawmaker in missouri was charged this week in connection with a fraud scheme in which she claimed she could use stem cells to treat covid-19 patients at her medical clinics, prosecutors said.',
 'u.s. defense secretary lloyd austin convened the military chiefs and civilian secretaries of the armed forces on wednesday to begin intensifying the pentagons efforts to combat white supremacy and right-wing extremism in the ranks.',
 'who knew that it could be so much fun to watch mandy patinkin and kathryn grody, married for years

In [140]:
# Gensim Sample Code that tokenizes text

from nltk.tokenize import RegexpTokenizer

# Tokenize every document on runs of word characters, then keep only the
# tokens that are longer than one character and are not purely numeric
# (words that merely contain digits are kept).
tokenizer = RegexpTokenizer(r'\w+')
docs = [
    [
        word
        for word in tokenizer.tokenize(str(text))  # str() guards non-string entries
        if not word.isnumeric() and len(word) > 1
    ]
    for text in docs
]

In [141]:
# Removes stopwords from the list of sentences
stoppers = stopwords.words('english')
# Frozen copy for constant-time membership tests; `stoppers` itself is kept
# as the list returned by NLTK.
_stopper_lookup = frozenset(stoppers)


def re_stop(tokens):
    """Return the tokens of one document with English stopwords removed.

    Order and duplicates of the remaining tokens are preserved.
    """
    return [item for item in tokens if item not in _stopper_lookup]


# Strip stopwords from every tokenized document.
docs = [re_stop(doc) for doc in docs]

In [142]:
docs[:5]  # first few documents after removing stopwords

[['athletes',
  'attendees',
  'summers',
  'tokyo',
  'olympic',
  'games',
  'required',
  'vaccinated',
  'quarantine',
  'arrival',
  'subject',
  'severe',
  'restrictions',
  'movement',
  'socializing'],
 ['family',
  'military',
  'veteran',
  'died',
  'bitten',
  'body',
  'fire',
  'ants',
  'veterans',
  'affairs',
  'facility',
  'atlanta',
  'filed',
  'wrongful',
  'death',
  'lawsuit',
  'week',
  'government',
  'pest',
  'control',
  'company'],
 ['state',
  'lawmaker',
  'missouri',
  'charged',
  'week',
  'connection',
  'fraud',
  'scheme',
  'claimed',
  'could',
  'use',
  'stem',
  'cells',
  'treat',
  'covid',
  'patients',
  'medical',
  'clinics',
  'prosecutors',
  'said'],
 ['defense',
  'secretary',
  'lloyd',
  'austin',
  'convened',
  'military',
  'chiefs',
  'civilian',
  'secretaries',
  'armed',
  'forces',
  'wednesday',
  'begin',
  'intensifying',
  'pentagons',
  'efforts',
  'combat',
  'white',
  'supremacy',
  'right',
  'wing',
  'extremis

In [143]:
# Lemmatize every token with NLTK's WordNet lemmatizer (adapted from the
# Gensim tutorial). The WordNet corpus has to be available locally first.
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
docs = [list(map(lemmatizer.lemmatize, doc)) for doc in docs]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [144]:
# Gensim code tweaked just a bit to help short texts

from gensim.models import Phrases

# Learn phrases over the corpus (min_count=1, so a pair seen once is already
# a candidate) and append each detected phrase token to its document.
# Phrase tokens are recognised by the '_' that joins their parts; only a
# single Phrases pass is applied here.
bigram = Phrases(docs, min_count=1)
for doc in docs:
    # The phrased view is fully built before the document is extended.
    doc.extend([token for token in bigram[doc] if '_' in token])


For a faster implementation, use the gensim.models.phrases.Phraser class



In [145]:
# Create a dictionary representation of the documents. And then BOW. Gensim sample code

from gensim.corpora import Dictionary

# Map every token to an integer id, then encode each document as a
# bag-of-words list of (token_id, count) pairs.
dictionary = Dictionary(docs)
corpus = list(map(dictionary.doc2bow, docs))

In [146]:
corpus[:5]  # first few bag-of-words documents instead of the full corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1)],
 [(15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 2),
  (34, 1),
  (35, 1),
  (36, 1)],
 [(35, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1)],
 [(30, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 2),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1)],
 [(42, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88

In [147]:
# Report the vocabulary size and the number of encoded documents.
n_unique_tokens = len(dictionary)
n_documents = len(corpus)
print('Number of unique tokens: %d' % n_unique_tokens)
print('Number of documents: %d' % n_documents)

Number of unique tokens: 3307
Number of documents: 545


In [148]:


from gensim.models import LdaModel

# Set training parameters.
# NOTE(review): with num_topics = 1 every document gets the same
# single-topic distribution, so the downstream t-SNE plots carry no
# topic structure -- confirm this value is intentional.
num_topics = 1
chunksize = 600
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

# LDA training is stochastic; a fixed random_state makes the fitted topics
# (and everything derived from them) reproducible across kernel restarts.
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=42,
)

In [149]:
# Gensim sample code to observe mix of top topics

from pprint import pprint

# Each entry of top_topics is (list of (weight, word) pairs, coherence).
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics,
# divided by the number of topics. The divisor is taken from the result
# itself so it cannot drift from a separately maintained num_topics value.
avg_topic_coherence = sum(topic[1] for topic in top_topics) / len(top_topics)
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

Average topic coherence: -9.9326.
[([(0.005179982, 'new'),
   (0.0047763893, 'trump'),
   (0.0041678436, 'vaccine'),
   (0.0040660193, 'year'),
   (0.003964068, 'said'),
   (0.0037597842, 'biden'),
   (0.0034523443, 'one'),
   (0.0032466552, 'say'),
   (0.0031435771, 'president'),
   (0.0031435771, 'capitol'),
   (0.003143577, 'coronavirus'),
   (0.0030403368, 'covid'),
   (0.0030403368, 'house'),
   (0.002729571, 'opinion'),
   (0.002521456, 'pandemic'),
   (0.0024170915, 'republican'),
   (0.0024170915, 'police'),
   (0.0022077002, 'city'),
   (0.0022077002, 'state'),
   (0.0021026519, 'senate')],
  -9.93261435961424)]


Setting up methods to visualize topics in 2D and 3D

In [150]:
from collections import OrderedDict
import numpy as np
def get_doc_topic_dist(model, corpus, kwords=False):
    '''
    Project every document of `corpus` into the topic space of `model`.

    For each bag-of-words document, `model[doc]` yields (topic_id, weight)
    pairs; topics the model does not report for a document are filled in
    with weight 0, so every row has one entry per topic, ordered by topic id.

    Parameters
    ----------
    model : topic model exposing `num_topics` and `model[bow]` lookup
    corpus : iterable of bag-of-words documents
    kwords : bool, default False
        When True, also collect the index of the highest-weighted topic of
        each document.

    Returns
    -------
    (numpy.ndarray, list)
        The stacked per-document topic weights, and the list of dominant
        topic indices (empty when `kwords` is False).
    '''
    # Take the topic count from the model that is actually passed in rather
    # than from a notebook-level global that may describe a different model.
    n_topics = model.num_topics

    top_dist = []
    keys = []

    for bow in corpus:
        # Start from an all-zero row, then overwrite the reported topics.
        weights = dict.fromkeys(range(n_topics), 0)
        weights.update(dict(model[bow]))
        row = np.array(list(weights.values()))
        top_dist.append(row)
        if kwords:
            keys.append(row.argmax())

    return np.array(top_dist), keys

In [151]:
# Topic-space matrix for the corpus plus the dominant topic of each document.
top_dist, lda_keys = get_doc_topic_dist(model, corpus, kwords=True)

In [152]:
# tSNE in 3D, change n_components=2 for 2d Viz

from sklearn.manifold import TSNE
# t-SNE is randomly initialised; fix the seed so the 3D embedding is the
# same on every run.
tsne = TSNE(n_components=3, random_state=42)
X_tsne = tsne.fit_transform(top_dist)

In [153]:
import plotly.express as px

In [154]:
# 3D scatter of the embedding: columns 0, 1, 2 of X_tsne are the axes and
# each point is coloured by the tweet's primary label.
fig = px.scatter_3d(X_tsne, x=0, y=1, z=2, color=tweets["label1"])
fig.update_traces(marker={"size": 8})
fig.show()

In [155]:
# 2D tSNE
# Re-embed in two dimensions; the fixed seed keeps the layout reproducible.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(top_dist)

In [156]:
# Store the two embedding coordinates next to each tweet.
tweets['X_tsne'], tweets['Y_tsne'] = X_tsne[:, 0], X_tsne[:, 1]

In [157]:
# Inspect the frame, which now carries the X_tsne / Y_tsne coordinate columns.
tweets

Unnamed: 0,text,label1,label2,X_tsne,Y_tsne
0,while athletes and other attendees at this sum...,sports,health,0.539540,-0.229945
1,the family of a military veteran who died in a...,politics,,0.539518,-0.229743
2,a state lawmaker in missouri was charged this ...,health,politics,0.539544,-0.229764
3,u.s. defense secretary lloyd austin convened t...,politics,,0.539540,-0.229945
4,who knew that it could be so much fun to watch...,politics,entertainment,0.539544,-0.229764
...,...,...,...,...,...
540,elon musks spacex announces a spaceflight inte...,business,,-0.123764,-0.555132
541,"as house gop faces decision on its future, mcc...",politics,,-0.622949,-0.287561
542,the gamestop stock situation isnt about populi...,business,,1.994967,0.316848
543,"opinion: confederate names are coming down, bu...",politics,,0.643599,-1.275312


In [158]:
# 2D scatter of the t-SNE coordinates, coloured by primary label.
# `labels` in plotly express is a dict that renames columns for display, so
# passing the tweet text there was a misuse; the text is shown on hover via
# `hover_name` instead. Columns are referenced by name since the frame is
# passed as the data source.
fig = px.scatter(
    tweets,
    x="X_tsne",
    y="Y_tsne",
    color="label1",
    hover_name="text",
)

In [159]:
# Render the 2D scatter figure built in the previous cell.
fig.show()