In [3]:
import numpy as np
import pandas as pd

import gensim
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS

from tqdm import tqdm_notebook as tqdm

In [7]:
from spacy.lang.en import English
nlp = spacy.load('en_core_web_sm')

In [8]:
# Jonathan Keller from https://towardsdatascience.com/building-a-topic-modeling-pipeline-with-spacy-and-gensim-c5dc03ffc619
def remove_stopwords(doc):
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [17]:
clean_tweets = pd.read_csv('cleaned_covid_tweets.csv')
clean_tweets.columns = ['index', 'tweet', 'user', 'id', 'symbols', 'url', 'date']
clean_tweets['tweet']

0         Iowa averaged a pandemic high new Covid infect...
1          TheAngel_Plays SweetPxtatoe kiararoyale Megan...
2          baxterberrie 4x this many americans died from...
3          tonim57601 MarlenaStell Covid-19 Id hate to t...
4         Is it covid effect Or something else is going on 
                                ...                        
106674     sibyllete Lol You can have a ballot mailed to...
106675     idea of a nonprofit taking a lead role in set...
106676     realDonaldTrump Regardless of the outcome of ...
106677     Walmart I love the way you have face covering...
106678     ARTUZ_teachers RMajongwe ProgressiveOf Nyombw...
Name: tweet, Length: 106679, dtype: object

In [21]:
doc_list = []
# Iterates through each article in the corpus.
for i in range(len(clean_tweets)):
    # Passes that article through the pipeline and adds to a new list.
    pr = nlp(clean_tweets.loc[i, 'tweet'])
    doc_list.append(pr)

In [22]:
doc_list[:5]

[['Iowa',
  'averaged',
  'pandemic',
  'high',
  'new',
  'Covid',
  'infections',
  'day',
  'past',
  'week',
  'Iowans',
  'hospitalized',
  'Covid',
  'tonight',
  'ICU'],
 [' ',
  'TheAngel_Plays',
  'SweetPxtatoe',
  'kiararoyale',
  'MeganPlays',
  'Change',
  'change',
  'going',
  's',
  'reelected',
  'Covid-19',
  'cases',
  'deaths'],
 [' ',
  'baxterberrie',
  '4x',
  'americans',
  'died',
  'covid',
  'people',
  'took',
  'time',
  'effort',
  'vote',
  'waste',
  'bastards'],
 [' ',
  'tonim57601',
  'MarlenaStell',
  'Covid-19',
  'd',
  'hate',
  'think',
  'outcome',
  'twins',
  'monitored',
  'closely',
  'easy',
  'Biden',
  've',
  'better',
  'expected',
  'wildest',
  'dreams'],
 ['covid', 'effect', 'going']]

In [23]:
import gensim.corpora as corpora
# Creates, which is a mapping of word IDs to words.
words = corpora.Dictionary(doc_list)
# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in doc_list]

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [25]:
lda_model.print_topics(num_words = 30)

[(0,
  '0.039*"mask" + 0.018*"wear" + 0.014*"masks" + 0.012*"wearing" + 0.011*"face" + 0.008*"🙏" + 0.008*"Dalton" + 0.007*"active" + 0.007*"mental" + 0.007*"Andy" + 0.006*"article" + 0.006*"miss" + 0.006*"ill" + 0.006*"cold" + 0.006*"beds" + 0.005*"common" + 0.005*"spike" + 0.005*"Cowboys" + 0.005*"amid" + 0.005*"experience" + 0.005*"distance" + 0.005*"teams" + 0.004*"child" + 0.004*"worked" + 0.004*"services" + 0.004*"5" + 0.004*"virtual" + 0.004*"updates" + 0.004*"Find" + 0.004*"PPE"'),
 (1,
  '0.017*"😂" + 0.014*"CDC" + 0.011*"🤣" + 0.009*"quarantine" + 0.009*"important" + 0.009*"piersmorgan" + 0.008*"highest" + 0.008*"low" + 0.007*"🇺" + 0.006*"🇸" + 0.006*"Canada" + 0.006*"Thank" + 0.006*"worst" + 0.006*"sense" + 0.006*"contact" + 0.005*"communities" + 0.005*"higher" + 0.005*"allowed" + 0.005*"England" + 0.005*"😷" + 0.004*"rates" + 0.004*"🏻" + 0.004*"vs" + 0.004*"lack" + 0.004*"poor" + 0.004*"guy" + 0.004*"experts" + 0.004*"Learn" + 0.004*"info" + 0.004*"poll"'),
 (2,
  '0.022*"U" + 0

It feels like there are a few topics in here that make sense, but I'm not sure I "see" what a lot of it is. This does very much feel like it's essentially a snapshot of the specific day when I pulled the tweets.

Next steps:
1. Grab another chunk of tweets using Tweepy; possibly grab a third chunk from another point or three in time.
2. Figure out model perplexity/coherence so I know when I'm improving things; implement pyLDAvis so I can see how things are looking.
3. Replace the letter icons used for flag emojis (🇺 and 🇸 with us_flag, since that's what they represent)
4. Lowercase words - seems unlikely that capitalization is going to make a difference
5. Leave ' @\w' in place; delete ' @ ' ( == ' at ' == stop word)? Seems possible that some highly relevant Twitter handles may mean different things as hastag/plaintext vs handle.
6. How does LDA train with this cleaning scheme perform pre-removal of stop words?
7. Pull out stop words, including days of the week and day referents (today, tomorrow, yesterday)
8. How does LDA perform after stop words are removed?
9. Compare lemmatized with non-lemmatized results - is avoiding lemmatization actually making any difference?