In [1]:
#All these packages need to be installed from pip
import gensim	#For word2vec, etc
import requests #For downloading our datasets
import lucem_illud_2020

import numpy as np #For arrays
import pandas as pd #Gives us DataFrames
import matplotlib.pyplot as plt #For graphics
import seaborn #Makes the graphics look nicer
import sklearn.metrics.pairwise #For cosine similarity
import sklearn.manifold #For T-SNE
import sklearn.decomposition #For PCA
import spacy
import copy
import nltk

#This 'magic' command makes the plots work better
#in the notebook, don't use it outside of a notebook.
#Also you can ignore the warning
%matplotlib inline

# Register `progress_apply` on pandas objects so long-running applies show a progress bar.
# `tqdm.notebook` is the public module; the private `tqdm._tqdm_notebook` path is a
# deprecated alias that only exists for backward compatibility.
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

import os #For looking through files
import os.path #For managing file paths
import pickle
import dill

# Load the spaCy pipeline once; the tokenizer customisation and word_tokenize_cust
# further down both work on this shared `nlp` object.
SPACY_MODEL_NAME = 'en_core_web_lg'
nlp = spacy.load(SPACY_MODEL_NAME)

In [2]:
# Restore the previously pickled `donald_full` frame from the working directory.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you produced yourself.
with open('donald_full.pkl', 'rb') as pkl_in:
    donald_full = pickle.load(pkl_in)

In [6]:
# Load the submissions export.
donald_sub = pd.read_csv('../project_data/sub_The_Donald-2015-06-16-2018-11-08.csv')

# Prepend the submission prefix to every id and use the result as the row index.
# Source: https://www.reddit.com/dev/api/
donald_sub['id'] = "t3_" + donald_sub['id']
donald_sub = donald_sub.set_index('id')

# The text to analyse is the title followed by the self-text; missing parts count as ''.
title_text = donald_sub['title'].fillna('')
self_text = donald_sub['selftext'].fillna('')
donald_sub['body'] = title_text + ' ' + self_text
# donald_sub.head()

# Reading in the Comment files
# donald_comm = pd.read_csv('../project_data/comm_The_Donald-2015-06-16-2018-11-08.csv')

# # Adding the prefix for comments. Source: https://www.reddit.com/dev/api/
# donald_comm['id'] = "t1_" + donald_comm['id']
# donald_comm.set_index('id', inplace = True)
# # donald_comm.head()

# # Appending the two sets of files
# donald_full = donald_sub.append(donald_comm)
# # donald_full.head()

In [7]:
import math

# Width of one tranche: `nweeks` weeks expressed in seconds
# (presumably `created_utc` holds epoch seconds -- TODO confirm).
nweeks = 6
tstamp_win = nweeks*7*24*60*60

# Start one unit before the earliest timestamp: pd.cut intervals exclude their left edge
# by default, so this keeps the earliest submission inside the first tranche.
first_edge = min(donald_sub['created_utc']) - 1
last_stamp = max(donald_sub['created_utc'])

nbins = math.ceil((last_stamp - first_edge)/tstamp_win)
bin_list = range(first_edge, first_edge + ((nbins+1)*tstamp_win), tstamp_win)

# Label tranches 0 .. nbins-1 and show how many submissions fall into each, in tranche order.
donald_sub['created_tranche'] = pd.cut(donald_sub['created_utc'], bin_list, labels = list(range(nbins)))
donald_sub['created_tranche'].value_counts(sort=False)

0        14
1        28
2       178
3       875
4      6826
5     12380
6     10172
7      7919
8     10802
9     11673
10    14139
11    13947
12    13896
13    14000
14    13907
15    14093
16    13938
17    13987
18    14007
19    14000
20    14000
21    14000
22    13674
23    13957
24    14048
25    13974
26    14003
27    14000
28     3000
Name: created_tranche, dtype: int64

In [8]:
from spacy.attrs import LIKE_URL

# Treat '[' and ']' as infixes so markdown-style hyperlinks are split apart by the tokenizer.
bracket_infixes = ["\\[", "\\]"]
infixes = list(nlp.Defaults.infixes) + bracket_infixes

# Compile the extended infix set and install it on the shared pipeline's tokenizer.
infix_regex = spacy.util.compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_regex.finditer

# Customized word_tokenize function from lucem_illud: URL-like tokens are kept out of the
# word list and can optionally be returned separately (e.g. to store them in a new column).
def word_tokenize_cust(word_list, model=nlp, MAX_LEN=1500000, return_urls=False):
    """Tokenize text with `model`, dropping punctuation, blank tokens and URL-like tokens.

    Parameters
    ----------
    word_list : str or list
        Text to tokenize. A one-element list is unwrapped to its only element;
        any other list has its elements converted with str() and joined by spaces.
    model : callable spaCy pipeline
        Defaults to the notebook-wide `nlp`. Its `max_length` attribute is
        overwritten with `MAX_LEN` as a side effect.
    MAX_LEN : int
        Value assigned to `model.max_length` before the text is processed.
    return_urls : bool
        When False (default) only the word tokens are returned, exactly as before.
        When True a `(tokens, urls)` tuple is returned so the URL tokens that were
        filtered out are not lost.

    Returns
    -------
    list of str, or (list of str, list of str) when `return_urls` is True.
    """
    # A single-item list is replaced by that item (which may itself be a list).
    if isinstance(word_list, list) and len(word_list) == 1:
        word_list = word_list[0]

    if isinstance(word_list, list):
        word_list = ' '.join(str(elem) for elem in word_list)

    # since we're only tokenizing, the RAM intensive components are disabled and the
    # maximum text size is raised
    model.max_length = MAX_LEN
    doc = model(word_list, disable=["parser", "tagger", "ner"])

    tokenized = []
    url_list = []
    for token in doc:
        if token.is_punct or not token.text.strip():
            continue
        if token.like_url:
            url_list.append(token.text)
        else:
            tokenized.append(token.text)

    if return_urls:
        return tokenized, url_list
    return tokenized

In [9]:
def _normalize_body(body):
    """Split `body` into sentences and return one normalized token list per sentence."""
    sentences = lucem_illud_2020.sent_tokenize(str(body))
    return [lucem_illud_2020.normalizeTokens(word_tokenize_cust(sent)) for sent in sentences]

donald_sub['normalized_sents'] = donald_sub['body'].progress_apply(_normalize_body)

# Persist the frame so the tokenization pass does not have to be repeated.
with open('donald_sub.pkl', 'wb') as pkl_out:
    dill.dump(donald_sub, pkl_out)

HBox(children=(FloatProgress(value=0.0, max=315437.0), HTML(value='')))




In [68]:
from nltk.lm.preprocessing import padded_everygram_pipeline
import itertools 

sub_ngram_dict3 = {}

def snapModelsVec3(groupdf, ngram_dict, sort = True, n = 2):
    """Fit, save and register one Kneser-Ney interpolated n-gram model for a tranche.

    Parameters
    ----------
    groupdf : pandas.DataFrame
        Rows of a single `created_tranche` group. Each entry of its
        'normalized_sents' column is a list of sentences (lists of tokens).
    ngram_dict : dict
        Updated in place with `{(tranche, n): fitted model}`.
    sort : bool
        Unused; kept so existing callers keep working.
    n : int
        Highest n-gram order of the model (default 2, the value that used to be
        hard-coded in the body).

    Side effects
    ------------
    Prints the model's vocabulary size and dills the model to
    'sub_ngram_<n>_tranche_<tranche>_KNModel.pkl' in the working directory.
    """
    # Nothing to fit (and no tranche label to read) for an empty group.
    if groupdf.empty:
        return None

    # The frame is indexed by submission id, so take the first row by position;
    # `series[0]` on a non-integer index relies on a deprecated positional fallback.
    tranche = groupdf['created_tranche'].iloc[0]

    # Concatenate the per-row sentence lists in linear time (summing lists is quadratic).
    # Materialised as a list because the padding pipeline walks the sentences more than once.
    sentences = list(itertools.chain.from_iterable(groupdf['normalized_sents']))

    train_data, padded_sents = padded_everygram_pipeline(n, sentences)
    model = nltk.lm.models.KneserNeyInterpolated(order = n)
    model.fit(train_data, padded_sents)
    print("Model Vocab size: {}".format(len(model.vocab)), end = '\n')

    model_name = 'sub_ngram_' + str(n) + '_tranche_' + str(tranche) + '_KNModel.pkl'
    with open(model_name, 'wb') as fout:
        dill.dump(model, fout)
    ngram_dict[(tranche, n)] = model

In [69]:
# Fit one bigram model per tranche; snapModelsVec3 stores each model in sub_ngram_dict3.
tranche_groups = donald_sub.groupby('created_tranche')
tranche_groups.progress_apply(snapModelsVec3, ngram_dict = sub_ngram_dict3)

# Save the whole {(tranche, n): model} mapping in one file.
with open('sub_2gram_model_dict.pkl', 'wb') as dict_out:
    dill.dump(sub_ngram_dict3, dict_out)

HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))

Model Vocab size: 342
Model Vocab size: 310
Model Vocab size: 1812
Model Vocab size: 4301
Model Vocab size: 13229
Model Vocab size: 15918
Model Vocab size: 13030
Model Vocab size: 13986
Model Vocab size: 16249
Model Vocab size: 15802
Model Vocab size: 22331
Model Vocab size: 19207
Model Vocab size: 18604
Model Vocab size: 21218
Model Vocab size: 19275
Model Vocab size: 19132
Model Vocab size: 19992
Model Vocab size: 19789
Model Vocab size: 18985
Model Vocab size: 19230
Model Vocab size: 19034
Model Vocab size: 18745
Model Vocab size: 17760
Model Vocab size: 18584
Model Vocab size: 18099
Model Vocab size: 17005
Model Vocab size: 15699
Model Vocab size: 16337
Model Vocab size: 7105



In [169]:
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams, bigrams

def ce_calc(tranche_val, bigram_list):
    """Return the entropy the tranche's bigram model assigns to `bigram_list`.

    The model is looked up in the notebook-level `sub_ngram_dict3` under the key
    `(tranche_val, 2)`. If the entropy computation raises ZeroDivisionError
    (presumably when `bigram_list` is empty -- TODO confirm), NaN is returned instead.
    """
    tranche_model = sub_ngram_dict3[(tranche_val, 2)]
    try:
        score = tranche_model.entropy(bigram_list)
    except ZeroDivisionError:
        score = float('nan')
    return score

In [53]:
# Keep submissions with at least 10 comments and a score of at least 100, from tranche 5 onwards.
has_enough_comments = donald_sub['num_comments'] >= 10
has_enough_score = donald_sub['score'] >= 100
after_early_tranches = donald_sub['created_tranche'] > 4

# .copy() makes the selection an independent frame: new columns are assigned to it in
# later cells, which on a plain boolean-mask slice triggers pandas' SettingWithCopyWarning.
donald_sub_select = donald_sub[has_enough_comments & has_enough_score & after_early_tranches].copy()
len(donald_sub_select)
# donald_sub[(donald_sub['num_comments'] >= 10) & (donald_sub['score'] >= 100)]['created_tranche'].value_counts(sort=False)

234574

In [99]:
# Submissions left in each tranche after filtering, listed in tranche order (not by frequency).
select_tranche_counts = donald_sub_select['created_tranche'].value_counts(sort=False)
select_tranche_counts

0         0
1         0
2         0
3         0
4         0
5      8931
6      8509
7      6180
8      7930
9      9481
10    10382
11    10909
12    10956
13    11468
14    11063
15    11547
16    11849
17    11238
18    11443
19    11297
20    10972
21    10829
22    11369
23    11756
24    11326
25    11306
26    11331
27     2501
28        1
Name: created_tranche, dtype: int64

In [None]:
# Number of leading tokens kept per submission.
MAX_IN_TOKS = 30

def _leading_tokens(sents):
    """Re-tokenize and re-normalize `sents`, keeping only the first MAX_IN_TOKS tokens.

    NOTE(review): `normalized_sents` appears to hold one token list per sentence, so
    word_tokenize_cust joins the str() of those lists before tokenizing again --
    confirm this round trip is intended.
    """
    return lucem_illud_2020.normalizeTokens(word_tokenize_cust(sents))[:MAX_IN_TOKS]

donald_sub_select['in_toks'] = donald_sub_select['normalized_sents'].progress_apply(_leading_tokens)

In [176]:
def _row_cross_entropy(row):
    """Score a row's `in_toks`, as bigrams, with the model of the row's own tranche."""
    row_bigrams = list(bigrams(row['in_toks']))
    return ce_calc(row['created_tranche'], row_bigrams)

donald_sub_select['ce_score'] = donald_sub_select.progress_apply(_row_cross_entropy, axis = 1)

In [None]:
# Save the selected submissions together with their token and score columns.
with open('donald_sub_select.pkl', 'wb') as select_out:
    dill.dump(donald_sub_select, select_out)

## (Abandoned full-length SLM, with submissions and comments)

In [6]:
# snapModelsVec3 requires the dict that collects the fitted models; the original call
# omitted it and the dump below referenced a name that was never defined. Create the
# dict for the full corpus here and pass it explicitly.
# NOTE(review): snapModelsVec3 writes per-tranche files named 'sub_ngram_..._KNModel.pkl',
# so re-running this cell writes to the same file names as the submissions-only run.
ngram_dict3 = {}
donald_full.groupby('created_tranche').progress_apply(snapModelsVec3, ngram_dict = ngram_dict3)

with open('2gram_model_dict.pkl', 'wb') as fout:
    dill.dump(ngram_dict3, fout)

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Model Vocab size: 738
Model Vocab size: 2811
Model Vocab size: 4012
Model Vocab size: 10566
Model Vocab size: 21558
Model Vocab size: 36548
Model Vocab size: 40361
Model Vocab size: 38636
Model Vocab size: 38824
Model Vocab size: 38875
Model Vocab size: 39239
Model Vocab size: 38188
Model Vocab size: 40464
Model Vocab size: 39463
Model Vocab size: 39567
Model Vocab size: 40882
Model Vocab size: 39375
Model Vocab size: 40443
Model Vocab size: 39901
Model Vocab size: 40265
Model Vocab size: 41206
Model Vocab size: 40816
Model Vocab size: 39905
Model Vocab size: 39601
Model Vocab size: 40111
Model Vocab size: 40309
Model Vocab size: 39356
Model Vocab size: 39283
Model Vocab size: 38002
Model Vocab size: 16403



![Calling quits after ~12 hours](heatdeath.png)
Ran the SLM for ~60 hours before terminating it.

In [None]:
# NOTE(review): abandoned cell. This call passes three arguments (tranche, 2, tokens),
# but ce_calc as defined in this notebook takes two (tranche_val, bigram_list) and looks
# models up in sub_ngram_dict3 -- presumably it targeted an earlier version of ce_calc.
# Reconcile the signature and the model dict before re-running.
donald_full['ce_2gram'] = np.vectorize(ce_calc)(donald_full['created_tranche'], 2, donald_full['tok_select'])