In [1]:
import sqlite3
import pandas as pd
import numpy as np
import re
import unicodedata
from datetime import datetime, timedelta

from string import punctuation

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

from gensim import corpora
# from gensim.models import LdaMulticore
from gensim.models.wrappers.dtmmodel import DtmModel
from gensim.test.utils import common_corpus
from gensim.models.coherencemodel import CoherenceModel

from tmtoolkit.topicmod.evaluate import metric_coherence_gensim

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import pickle

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\epicp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\epicp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\epicp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\epicp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Preprocessing

In [2]:
# Pickled DataFrame holding the sampled tweets (Windows-style relative path).
sample_path = r'dfs\2020-03-22-to-2020-11-18-1000-daily'
# The trained DTM model is stored under the same file name, with the leading
# 'dfs' folder name swapped for 'dtm'.
dtm_out_path = 'dtm' + sample_path[len('dfs'):]

  and should_run_async(code)


### Sample and save
**Do NOT run this block if the pickled DataFrame already exists** — it draws a new random sample from the database and overwrites the file at `sample_path`.

In [None]:
# Draw a random sample of tweets from the date window and pickle it to
# `sample_path`. Because of `order by random()`, each run selects a different
# sample and overwrites whatever was saved before.
conn = sqlite3.connect('database/tweets.db')
try:
    # sample
    df = pd.read_sql_query("select * from tweets where id_str in (select id_str from tweets where created_at between '2020-03-22' and '2020-11-18' order by random() limit 605000)", conn)
finally:
    # Release the database handle even if the query fails.
    conn.close()
df.to_pickle(sample_path)

### Preprocessing the sample

In [5]:
# regex from https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
# Matches http(s):// and bare www. URLs; preprocess() deletes every match.
url_re = re.compile(r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})")
# "@" followed by zero or more word characters (so a lone "@" matches too).
mentions_re = re.compile(r"@\w*")
# "#" followed by zero or more word characters (so a lone "#" matches too).
hashtag_re = re.compile(r"#\w*")
# NLTK's English stop-word list; preprocess() drops tokens found in it.
stopword_list = stopwords.words('english')
# Single lemmatizer instance reused by preprocess() for every token.
wnl = WordNetLemmatizer()

# From https://www.kaggle.com/alvations/basic-nlp-with-nltk#Stemming-and-Lemmatization
# Penn Treebank tag prefix -> WordNet POS letter. Built once instead of on
# every call, since penn2morphy runs once per token.
_PENN_TO_MORPHY = {'NN': 'n', 'JJ': 'a',
                   'VB': 'v', 'RB': 'r'}


def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag to a WordNet POS letter.

    Only the first two characters of the tag are considered, so e.g. 'NNS'
    and 'NNP' both map like 'NN'.

    Args:
        penntag: Penn Treebank tag string such as 'NN', 'VBD' or 'JJR'.

    Returns:
        'n', 'a', 'v' or 'r'. Falls back to 'n' (noun) when the tag prefix is
        not in the mapping or the argument cannot be sliced/looked up.
    """
    try:
        return _PENN_TO_MORPHY.get(penntag[:2], 'n')
    except TypeError:
        # Non-sliceable (e.g. None) or unhashable input: keep the noun
        # fallback, but no longer swallow unrelated exceptions such as
        # KeyboardInterrupt the way a bare `except:` did.
        return 'n'

def preprocess(text):
    """Turn one raw tweet into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. Delete URLs, @mentions and #hashtags.
      2. NFKD-normalize and drop every character that has no ASCII form.
      3. Tokenize, lowercase, and discard stop words, punctuation and tokens
         shorter than three characters.
      4. POS-tag the survivors and lemmatize each one with its tag.
      5. Discard lemmas that contain any punctuation character, are stop
         words, or are shorter than three characters.

    Args:
        text: the tweet text as a string.

    Returns:
        List of lemma strings, in their original order.
    """
    for pattern in (url_re, mentions_re, hashtag_re):
        text = pattern.sub('', text)
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    text = ascii_bytes.decode('utf-8', 'ignore')

    def keep(token):
        # Shared filter: not a stop word, not a punctuation string, length >= 3.
        return (token not in stopword_list
                and token not in punctuation
                and len(token) >= 3)

    # Tokenization and word removal
    tokens = [tok for tok in (raw.lower() for raw in word_tokenize(text)) if keep(tok)]

    # POS tagging and lemmatization
    lemmas = []
    for token, tag in pos_tag(tokens):
        lemma = wnl.lemmatize(token.lower(), pos=penn2morphy(tag))
        # Reject lemmas with embedded punctuation (e.g. contractions, dashes).
        if any(mark in lemma for mark in punctuation):
            continue
        if keep(lemma):
            lemmas.append(lemma)
    return lemmas

def timestamp():
    """Return the current local date and time as a '%x %X' formatted string."""
    now = datetime.now()
    return now.strftime('%x %X')

  and should_run_async(code)


In [3]:
# Load the previously pickled tweet sample from `sample_path`.
# NOTE(review): unpickling can execute arbitrary code — only load files you created yourself.
df = pd.read_pickle(sample_path)

  and should_run_async(code)


The code block below preprocesses the Tweets in `df`. It is slow — originally noted as ~25 minutes for 600,000 tweets, though the run recorded below took about 7 minutes.

In [6]:
# Run preprocess() over every tweet's full text and report the wall-clock time.
start = datetime.now()
texts = list(map(preprocess, df['full_text']))
print('Time to preprocess Tweets:', str(datetime.now() - start))

  and should_run_async(code)


Time to preprocess Tweets: 0:06:58.843480


In [7]:
# Persist the preprocessed token lists so the slow preprocessing pass can be skipped later.
with open(r'dtm\full-preprocessed-pickle', 'wb') as pickle_file:
    pickle.dump(texts, pickle_file)

  and should_run_async(code)


# Model training

### Setup

In [8]:
# get time slices (number of tweets each day)
# The first 10 characters of `created_at` are the 'YYYY-MM-DD' date, so counting
# them and sorting by date gives one count per day in chronological order.
# The vectorised `.str[:10]` replaces a per-row Python lambda.
# NOTE(review): a day with no sampled tweets produces no entry here, and these
# counts only line up with `corpus` if `df` is in chronological order — confirm.
time_slices = df['created_at'].str[:10].value_counts().sort_index().tolist()

  and should_run_async(code)


In [9]:
# Vocabulary built from all preprocessed tweets.
dictionary = corpora.Dictionary(texts)
# Bag-of-words representation of each tweet, in the same order as `texts`.
corpus = list(map(dictionary.doc2bow, texts))
# Location of the external DTM binary used by DtmModel.
dtm_exe_path = r'C:\Program Files\DTM\dtm-win64.exe'

  and should_run_async(code)


### Training

In [11]:
# Fit the dynamic topic model with the external DTM binary, log the wall-clock
# time, then save the fitted model to `dtm_out_path`.
print(f'Model started training at {timestamp()}')
start = datetime.now()
dtm_model = DtmModel(
    dtm_exe_path,
    corpus=corpus[:],
    time_slices=time_slices,
    num_topics=20,
    id2word=dictionary,
)
elapsed = datetime.now() - start
print(f'Model finished training at {timestamp()}')
print('Elapsed time:', elapsed)

print('Saving model...')
dtm_model.save(dtm_out_path)
print(f'Code block finished at {timestamp()}')

  and should_run_async(code)


Model started training at 11/21/20 18:21:38


CalledProcessError: Command '['C:\\Program Files\\DTM\\dtm-win64.exe', '--ntopics=20', '--model=dtm', '--mode=fit', '--initialize_lda=true', '--corpus_prefix=C:\\Users\\epicp\\AppData\\Local\\Temp\\8dbb68_train', '--outname=C:\\Users\\epicp\\AppData\\Local\\Temp\\8dbb68_train_out', '--alpha=0.01', '--lda_max_em_iter=10', '--lda_sequence_min_iter=6', '--lda_sequence_max_iter=20', '--top_chain_var=0.005', '--rng_seed=0']' returned non-zero exit status 3.

# Inspection of DTM topics
Using 20 topics, as determined by roughly maximizing $c_v$ coherence on the first 24 hours of Tweets.

At a minimum, you will need to run everything above except for "Sample and save" and "Training".

In [10]:
# Reload the previously trained model from `dtm_out_path` instead of retraining.
dtm_model = DtmModel.load(dtm_out_path)

  and should_run_async(code)


In [29]:
# Extract the pyLDAvis inputs from the model for time=240, then render them.
# sort_topics=False keeps the model's own topic numbering in the visualisation.
doc_topic, topic_term, doc_lengths, term_frequency, vocab = dtm_model.dtm_vis(
    time=240,
    corpus=corpus,
)
prepared = pyLDAvis.prepare(
    topic_term_dists=topic_term,
    doc_topic_dists=doc_topic,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency,
    sort_topics=False,
)
pyLDAvis.display(prepared)

  and should_run_async(code)


In [14]:
# Running totals of the per-slice counts with a leading 0: entry i is the number
# of tweets that come before slice i, so entries i and i+1 bracket slice i.
# np.cumsum does this in one pass instead of re-summing every prefix; the
# explicit integer dtype keeps an empty `time_slices` yielding [0].
cumulative_time_slices = [0] + np.cumsum(time_slices, dtype=np.int64).tolist()

  and should_run_async(code)


In [18]:
# look at coherence of each time slice
# For every slice, take the model's topics at that time and score them with
# c_v coherence against the training texts, logging one line per slice.
# NOTE(review): each slice is scored against ALL of `texts`, not only the
# tweets belonging to that slice — confirm this is intended.
for i in range(len(time_slices)):
    topics_dtm = dtm_model.dtm_coherence(time=i)
    cm_DTM = CoherenceModel(
        topics=topics_dtm,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    ).get_coherence()
    print(f'({timestamp()})', 'Time:', i, '\tCoherence:', cm_DTM)

  and should_run_async(code)


(11/25/20 16:41:20) Time: 0 	Coherence: 0.46164611869111305
(11/25/20 16:41:41) Time: 1 	Coherence: 0.4636053567144357
(11/25/20 16:42:06) Time: 2 	Coherence: 0.46328825049416233
(11/25/20 16:42:29) Time: 3 	Coherence: 0.46514226353674903
(11/25/20 16:42:55) Time: 4 	Coherence: 0.4663914387553626
(11/25/20 16:43:21) Time: 5 	Coherence: 0.46533493034256257
(11/25/20 16:43:47) Time: 6 	Coherence: 0.46496940594306874
(11/25/20 16:44:14) Time: 7 	Coherence: 0.46780130962520045
(11/25/20 16:44:41) Time: 8 	Coherence: 0.4668481621666163
(11/25/20 16:45:11) Time: 9 	Coherence: 0.46270295465017686
(11/25/20 16:45:39) Time: 10 	Coherence: 0.4612367910870871
(11/25/20 16:46:10) Time: 11 	Coherence: 0.4653035324322182
(11/25/20 16:46:40) Time: 12 	Coherence: 0.4612725688094475
(11/25/20 16:47:09) Time: 13 	Coherence: 0.46161005794351945
(11/25/20 16:47:35) Time: 14 	Coherence: 0.45848501578492307
(11/25/20 16:48:03) Time: 15 	Coherence: 0.46008975222756837
(11/25/20 16:48:34) Time: 16 	Coherence:

In [11]:
# pyLDAvis.save_html(prepared, r'dtm\2020-06-05-150-day-vis.html')

  and should_run_async(code)


# Testing coherence on held-out data

In [20]:
# Draw a second random sample from the same date window to use as held-out
# data. It may overlap with the training sample; the overlap is removed in a
# later cell.
conn = sqlite3.connect('database/tweets.db')
try:
    # sample over all days
    df_held_out = pd.read_sql_query("select * from tweets where id_str in (select id_str from tweets where created_at between '2020-03-22' and '2020-11-18' order by random() limit 241000)", conn)
finally:
    # Release the database handle even if the query fails.
    conn.close()

  and should_run_async(code)


In [21]:
# remove tweets in the sample that were also used for training
# A direct membership test on `id_str` selects the same rows as the previous
# outer merge + `_merge == 'both'` filter, without materialising a merged copy
# of both wide DataFrames.
in_training = df_held_out['id_str'].isin(df['id_str'])
# ids that appear in both the training sample and the held-out sample
training_ids = df_held_out.loc[in_training, 'id_str']
df_held_out = df_held_out[~in_training]

  and should_run_async(code)


In [22]:
# preprocess and generate relevant structures needed for coherence analysis
# Same pipeline as for the training sample, but with a vocabulary built from
# the held-out tweets only.
texts_test = list(map(preprocess, df_held_out['full_text']))
dictionary_test = corpora.Dictionary(texts_test)
corpus_test = list(map(dictionary_test.doc2bow, texts_test))

  and should_run_async(code)


In [23]:
# look at coherence of each time slice
# Same per-slice c_v scoring as on the training data, but measured against the
# held-out texts and their own dictionary.
for i in range(len(time_slices)):
    topics_dtm = dtm_model.dtm_coherence(time=i)
    cm_DTM = CoherenceModel(
        topics=topics_dtm,
        texts=texts_test,
        dictionary=dictionary_test,
        coherence='c_v',
    ).get_coherence()
    print(f'({timestamp()})', 'Time:', i, '\tCoherence:', cm_DTM)

  and should_run_async(code)


(11/25/20 23:38:39) Time: 0 	Coherence: 0.455290937577505
(11/25/20 23:39:04) Time: 1 	Coherence: 0.45699157534453966
(11/25/20 23:39:29) Time: 2 	Coherence: 0.4566204695448083
(11/25/20 23:39:53) Time: 3 	Coherence: 0.4585021199301161
(11/25/20 23:40:19) Time: 4 	Coherence: 0.4594913403881529
(11/25/20 23:40:43) Time: 5 	Coherence: 0.45855463628750914
(11/25/20 23:41:10) Time: 6 	Coherence: 0.45878715438307954
(11/25/20 23:41:35) Time: 7 	Coherence: 0.460738913714581
(11/25/20 23:42:01) Time: 8 	Coherence: 0.4605638540201439
(11/25/20 23:42:28) Time: 9 	Coherence: 0.4580258129734675
(11/25/20 23:42:53) Time: 10 	Coherence: 0.45723622974839395
(11/25/20 23:43:30) Time: 11 	Coherence: 0.46090338465891073
(11/25/20 23:43:59) Time: 12 	Coherence: 0.45726855047216564
(11/25/20 23:44:24) Time: 13 	Coherence: 0.4570522145251227
(11/25/20 23:44:51) Time: 14 	Coherence: 0.4555516033455148
(11/25/20 23:45:18) Time: 15 	Coherence: 0.4574808357743209
(11/25/20 23:45:45) Time: 16 	Coherence: 0.455

In [24]:
# Display the held-out sample after the training overlap has been removed.
# NOTE(review): the rendered rows include real screen names and tweet text — consider clearing this output before sharing.
df_held_out

  and should_run_async(code)


Unnamed: 0,index,created_at,id_str,full_text,user.id_str,user.followers_count,user.screen_name,user.verified,retweet_count,favorite_count,neg,neu,pos,compound
0,35,2020-03-22 04:01:55.000000,1241575812437962752,The new normal 💀 #CoronavirusPandemic #COVIDー1...,2159787812,245,chanel_nips,0,0,1,0.000,1.000,0.000,0.0000
1,299,2020-03-22 04:05:17.000000,1241576660471857153,@SeahawksBMX @Lrihendry @RyanAFournier @realDo...,729329119742283776,859,lilmotza,0,0,0,0.224,0.776,0.000,-0.6369
2,488,2020-03-22 04:07:02.000000,1241577100706013185,Whoooo boy.\n\n@TKeck44 check this out https:/...,202676583,1136,tathompsonKS,0,1,3,0.000,1.000,0.000,0.0000
3,734,2020-03-22 04:10:06.000000,1241577871916896257,march 23 :((( https://t.co/x0N6OPCNKJ,1238077439462572033,220,buttholechakra,0,0,0,0.000,1.000,0.000,0.0000
4,758,2020-03-22 04:10:24.000000,1241577947640877057,The Justice Department reportedly wants new em...,32792712,22267,EndGameWW3,0,12,14,0.318,0.569,0.114,-0.7269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241166,132892,2020-11-17 23:55:00.000000,1328849147307454465,"CASES SURGING: There are now a total of 3,143 ...",33988019,13673,23WIFR,1,1,0,0.000,0.859,0.141,0.5574
241167,133186,2020-11-17 23:56:52.000000,1328849619145482240,Dumbass. Can I say that? https://t.co/RUS3JkXKnE,48204988,2354,NatalieCGordon,0,0,34,0.474,0.526,0.000,-0.5574
241168,133489,2020-11-17 23:58:53.000000,1328850125347807232,This is just another reason why we ❤️❤️❤️ Doll...,36308751,844,nekesastraker,0,0,2,0.000,1.000,0.000,0.0000
241169,133531,2020-11-17 23:59:03.000000,1328850167517249536,This is fake news. We already know that's not ...,616797042,511,ZroHour,0,0,0,0.242,0.758,0.000,-0.6632


In [31]:
# Display the per-day tweet counts that were passed to DtmModel as `time_slices`.
time_slices

  and should_run_async(code)


[726,
 1030,
 1120,
 1073,
 1133,
 1203,
 930,
 930,
 1046,
 1131,
 1264,
 1234,
 1259,
 1087,
 1077,
 1269,
 1313,
 1313,
 1170,
 1298,
 1064,
 1059,
 1285,
 1378,
 1296,
 1443,
 1298,
 1078,
 1151,
 1262,
 1421,
 1503,
 1509,
 1384,
 1151,
 1207,
 1366,
 1500,
 1462,
 1384,
 1316,
 809,
 1153,
 1341,
 1370,
 1451,
 1554,
 1449,
 1185,
 1115,
 1314,
 1365,
 1255,
 1490,
 1475,
 1266,
 1059,
 1284,
 1595,
 1424,
 1443,
 1296,
 1119,
 1048,
 1054,
 1233,
 1196,
 1227,
 1132,
 806,
 747,
 913,
 909,
 910,
 899,
 872,
 715,
 624,
 976,
 925,
 972,
 1147,
 1030,
 717,
 724,
 1051,
 1190,
 1114,
 1148,
 1154,
 892,
 812,
 1008,
 1218,
 1174,
 1403,
 1329,
 1090,
 956,
 1207,
 1137,
 1308,
 1394,
 1276,
 940,
 800,
 1106,
 1322,
 1390,
 1246,
 1169,
 919,
 918,
 1298,
 1357,
 1292,
 1408,
 1431,
 925,
 920,
 1158,
 1157,
 1255,
 1166,
 1086,
 826,
 690,
 1079,
 1289,
 1330,
 1294,
 1187,
 884,
 801,
 1137,
 1093,
 1070,
 1144,
 1030,
 741,
 714,
 917,
 1087,
 844,
 1014,
 923,
 594,
 524,
 8