1. Set constants, and set up tweet scraping

In [21]:
import tweepy
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from bs4 import BeautifulSoup
import gensim
from gensim.utils import simple_preprocess

# "true tweets", "second filter", "no filter"
MODE = "no filter"

CONSUMER_KEY = "*********************"
CONSUMER_SECRET = "*********************"
ACCESS_TOKEN = "*********************"
ACCESS_TOKEN_SECRET = "*********************"

SEARCH_WORDS_LIST = ["covid vaccine unsafe", "covid vaccine already had covid", "covid vaccine choice", "covid vaccine freedom",
                     "covid vaccine ineffective", "covid vaccine not serious health risk", "covid vaccine no access", "covid vaccine dangerous",
                     "covid fake", "covid vaccine already infected", "covid vaccine unproven", "covid vaccine dangerous side effects", "covid vaccine natural immunity"]

SECOND_FILTER = ["vacci", "vax"] # One of these words needs to be in the tweet

STOP_WORDS = stopwords.words('english')


def initialClean(text):
  text = BeautifulSoup(text, "lxml").text
  
  # Get rid of numbers
  newText = ''.join([i for i in text if not i.isdigit()])
  newText = newText.replace("_", " ")
  
  # Tokenize string, and remove punctuation
  tokens = simple_preprocess(newText, deacc=True)

  # Get rid of stopwords
  for word in tokens:
    if word in STOP_WORDS:
      tokens.remove(word)

  #print("\n\nTokens: ", tokens)
  
  return tokens

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


2. Generate the allTweetsArray

In [22]:
import pandas as pd
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth)

allTweetsArray = []

try:
  api.verify_credentials()
  print("Authentication OK")
except:
  print("Error during authentication")


# -------- Get data from LiveStreamedTweets.csv --------
csvDf = pd.read_csv('LiveStreamedTweets.csv', delimiter=',')
csvNumpyArray = csvDf['Text'].to_numpy()

# ----------------- Attempting text cleaning -----------------

# Initialize counter
countTweets = 0

# How many tweets have the search words
trueTweetCounter = 0

# Set up the sentiment analyzer
analyser = SentimentIntensityAnalyzer()

while countTweets < len(csvNumpyArray):

  # Tweet positive sentiment score has to be equal or below 0.4
  score = analyser.polarity_scores(csvNumpyArray[countTweets])
  if score['pos'] > 0.4:
    countTweets += 1
    continue

  flag = 0
  for sw in SEARCH_WORDS_LIST:
    count = 0
    sw_words = sw.split()
    for word in sw_words:
      if word in csvNumpyArray[countTweets]:
        count += 1
        if count == len(sw_words):
          #print("\n\nSW: ", sw)
          #print("------------------------------")
          #print("This tweet contains all words in a search term.")
          #print(csvNumpyArray[countTweets])
          #print("Word: ", word)
          trueTweetCounter += 1
          #print("trueTweetCounter: ", trueTweetCounter)
          #print("------------------------------")
          
          if MODE == "true tweets":
            csvNumpyArray[countTweets] = initialClean(csvNumpyArray[countTweets])
            allTweetsArray.append(csvNumpyArray[countTweets])
          
          flag = 1
    if flag == 1:
      break

  if MODE == "second filter":
    for word in SECOND_FILTER:
      if word in csvNumpyArray[countTweets]:
        #print("\n\n{} in {}".format(word, csvNumpyArray[countTweets]))
        #print(word in csvNumpyArray[countTweets])
        csvNumpyArray[countTweets] = initialClean(csvNumpyArray[countTweets])
        allTweetsArray.append(csvNumpyArray[countTweets])
        break
  
  elif MODE == "no filter":
    csvNumpyArray[countTweets] = initialClean(csvNumpyArray[countTweets])
    allTweetsArray.append(csvNumpyArray[countTweets])
  
  countTweets += 1

print("\n\n----------------- allTweetsArray -----------------")
print(allTweetsArray)
print("\nLength of allTweetsArray: ", len(allTweetsArray))
print("\nOriginal Length: ", len(csvDf))
print("\ntrueTweetCounter = ", trueTweetCounter)

Authentication OK


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



3. More in depth text preparation (lemmatization, bigrams)

In [23]:
import spacy

bigram = gensim.models.Phrases(allTweetsArray, min_count=1, threshold=3)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Define functions bigrams and lemmatization
def make_bigrams(texts):
  return [bigram_mod[doc] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
  texts_out = []
  for sent in texts:
    doc = nlp(" ".join(sent))
    texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
  return texts_out

# Form Bigrams
bigram_tweets = make_bigrams(allTweetsArray)
print("\n\nbigram_tweets: ", bigram_tweets[:1][0])

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(bigram_tweets)

print("\n\ndata_lemmatized: ", data_lemmatized[:1][0])
print("\n\nallTweetsArray: ", allTweetsArray[:1][0])



bigram_tweets:  ['lizziethelawyer_adriennevose', 'joebiden_unvaccinated', 'already', 'covid', 'my', 'natural_immunity', 'better_your', 'https_co', 'mifsxcdkh']


data_lemmatized:  ['already', 'covid', 'mifsxcdkh']


allTweetsArray:  ['lizziethelawyer', 'adriennevose', 'joebiden', 'unvaccinated', 'already', 'covid', 'my', 'natural', 'immunity', 'better', 'your', 'https', 'co', 'mifsxcdkh']


4. Compute multiple LDA models and get their coherence models

In [24]:
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from pprint import pprint
import tqdm
import numpy as np

# Creates an LDA model, and build a coherence model
def compute_coherence_values(corpus, dictionary, k, b):
  lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                          id2word=dictionary,
                                          num_topics=k, 
                                          random_state=100,
                                          chunksize=100,
                                          passes=10,
                                          eta=b)
  
  coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
  
  return coherence_model_lda.get_coherence()

# -------- Find the ideal LDA model with the best parameters --------
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

# Topics range
min_topics = 7
max_topics = 13
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Beta parameter
beta = list(np.arange(0.01, 0.05, 0.1))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if True:
  pbar = tqdm.tqdm(total=(len(beta)*len(topics_range)*len(corpus_title)))
  
  # iterate through validation corpuses
  for i in range(len(corpus_sets)):
    # iterate through number of topics
    for k in topics_range:
      # iterare through beta values
      for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, k=k, b=b)
        # Save the model results
        model_results['Validation_Set'].append(corpus_title[i])
        model_results['Topics'].append(k)
        model_results['Beta'].append(b)
        model_results['Coherence'].append(cv)
            
        pbar.update(1)
  
  pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
  pbar.close()

  diff = np.log(self.expElogbeta)
100%|██████████| 24/24 [20:29<00:00, 51.23s/it]


5. Get the attributes of the best model and train it

In [25]:
# Get the attributes of the best model
max_index = model_results['Coherence'].index(max(model_results['Coherence']))

best_model_info = {'Validation_Set': model_results['Validation_Set'][max_index],
                   'Topics': model_results['Topics'][max_index],
                   'Beta': model_results['Beta'][max_index],
                   'Coherence': model_results['Coherence'][max_index]
                  }

print("\n\nBest Model Info: ", best_model_info)

# Train the model with the best parameters
best_lda_model_bigram = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=best_model_info['Topics'], 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           eta=best_model_info['Beta'])

pprint(best_lda_model_bigram.print_topics())
doc_lda = best_lda_model_bigram[corpus]



Best Model Info:  {'Validation_Set': '100% Corpus', 'Topics': 8, 'Beta': 'symmetric', 'Coherence': 0.4536343421622446}
[(0,
  '0.041*"vaccine" + 0.020*"life" + 0.020*"covid" + 0.019*"worker" + '
  '0.018*"will" + 0.016*"spread" + 0.015*"would" + 0.014*"dangerous" + '
  '0.014*"work" + 0.014*"mandate"'),
 (1,
  '0.175*"covid" + 0.131*"proof_natural" + 0.128*"coronavirus_had" + '
  '0.128*"drive_vaccine" + 0.033*"incredibly_dangerous" + 0.032*"shopper" + '
  '0.031*"listening_shopprsdrugmart" + 0.020*"collins_trucke" + '
  '0.020*"mkfmgdg" + 0.020*"never_mandate"'),
 (2,
  '0.198*"vaccine" + 0.121*"covid" + 0.074*"natural_immunity" + 0.062*"choice" '
  '+ 0.054*"get" + 0.011*"can" + 0.010*"so_thankful" + 0.010*"going_give" + '
  '0.010*"misinformation" + 0.010*"child"'),
 (3,
  '0.122*"tissues_include" + 0.122*"post_vaccination" + '
  '0.122*"accumulates_organ" + 0.122*"vaccine_gets" + 0.121*"txpqeaauz" + '
  '0.029*"vaccinate" + 0.010*"breast_milk" + 0.010*"yqptcxdlou" + 0.010*"risk" 