Oct 30, 2019

Feasibility test for Sandy Hook in /r/politics. 

In [8]:
from tqdm import tqdm
import os
os.chdir('../../')
from convokit import Corpus, User, Utterance
import nltk
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')
from collections import defaultdict



In [None]:

# Every observation window has the same length: 15 days, in seconds.
_EVENT_WINDOW_SECONDS = 15 * 24 * 60 * 60

# Timestamp (in seconds; the values look like Unix epoch times) at which each
# event's window opens.
_event_window_starts = {
    'Geneva County massacre': 1236657600,
    'Binghamton shootings': 1238731200,
    'Fort Hood shooting': 1257397200,
    'Aurora theater shooting': 1342756800,
    'Sandy Hook Elementary School shooting': 1355461200,
    'Washington Navy Yard shooting': 1379304000,
    'San Bernardino attack': 1449032400,
    'Orlando nightclub shooting': 1465704000,
    'Las Vegas shooting': 1506830400,
    'Sutherland Springs church shooting': 1509854400,
    'Stoneman Douglas High School shooting': 1518584400,
    'Santa Fe High School shooting': 1526616000,
}

# Event name -> (window start, window end), both bounds in seconds.
shooting_timestamps = {
    event_name: (window_start, window_start + _EVENT_WINDOW_SECONDS)
    for event_name, window_start in _event_window_starts.items()
}

# 'Virginia Tech shooting': (1176696000, 1177387200),
#  'Thousand Oaks shooting': (1541566800, 1542258000),
# 'Pittsburgh synagogue shooting': (1540612800, 1541304000)

In [None]:
def identify_timestamp_category(timestamp, windows=None):
    """Return the name of the event whose time window contains ``timestamp``.

    Args:
        timestamp: Value to classify; compared directly against the window bounds.
        windows: Optional mapping of event name -> ``(start, end)`` pair. When
            omitted, the module-level ``shooting_timestamps`` table is used.

    Returns:
        The key of the first window (in mapping order) for which
        ``start <= timestamp <= end`` holds, or ``None`` when no window matches.
        Both bounds are inclusive.
    """
    if windows is None:
        windows = shooting_timestamps
    for event_name, (start, end) in windows.items():
        if start <= timestamp <= end:
            return event_name
    # No window contains the timestamp.
    return None

In [4]:
# Subreddits whose filtered corpora are processed below.
subreddits = [
    'news',
    'politics',
    'worldnews',
    'Liberal',
    'progressive',
    'democrats',
    'Conservative',
    'The_Donald',
    'Republican',
]

In [None]:
def tokenize_and_stem(sentence):
    """Return the set of distinct stems found in ``sentence``.

    The sentence is split with ``nltk.word_tokenize`` and every token is run
    through the module-level ``stemmer``; duplicates collapse because the
    result is a set.
    """
    tokens = nltk.word_tokenize(sentence)
    return {stemmer.stem(token) for token in tokens}

## Generate specific words dictionaries for each event

In [None]:
# Stems of words that are not tied to any single event; stemmed with the same
# stemmer as the titles so the two can be intersected directly.
generic_words = set(map(stemmer.stem, (
    'shooting', 'survivor', 'wounded', 'shot', 'deaths',
    'died', 'injured', 'guns', 'killing', 'attack',
    'massacre', 'victim',
)))

In [None]:
# Hand-picked phrases (names and places) for each event, keyed by the same
# event names used in `shooting_timestamps`. Phrases are split into single
# words and stemmed further down.
specific_words_raw = {
    'Fort Hood shooting': ["nidal hasan", "killeen", "texas"],
    'Binghamton shootings': ["jiverly antares wong"],
    'Geneva County massacre': ['michael kenneth mclendon', 'kinston', 'samson', 'alabama'],
    # The town is spelled "Newtown"; the previous "newton" never matched titles naming it.
    'Sandy Hook Elementary School shooting': ["adam lanza", "newtown", "connecticut"],
    'Aurora theater shooting': ['james eagan holmes', 'colorado'],
    'Washington Navy Yard shooting': ['aaron alexis'],
    'San Bernardino attack': ['syed rizwan farook', 'tashfeen malik'],
    'Orlando nightclub shooting': ['omar mateen'],
    'Las Vegas shooting': ['stephen paddock'],
    'Sutherland Springs church shooting': ['devin patrick kelley', "baptist", "texas"],
    'Stoneman Douglas High School shooting': ['Marjory', 'Parkland', 'Florida'],
    'Santa Fe High School shooting': ['dimitrios pagourtzis', "texas"]
}

In [None]:
# Event name -> set of stemmed keywords for that event (the values are sets,
# despite the variable's name).
specific_words_list = {}
for event_name, phrases in specific_words_raw.items():
    # The words of the event's own name count as keywords alongside the phrases.
    raw_words = {word for phrase in [*phrases, event_name] for word in phrase.split()}
    stemmed_words = {stemmer.stem(word) for word in raw_words}
    # Drop anything already covered by the generic vocabulary.
    specific_words_list[event_name] = stemmed_words - generic_words

In [None]:
def get_utt_convo_counts(corpus):
    """Tally valid conversations and valid utterances per event label.

    Args:
        corpus: Object exposing ``iter_conversations()`` and ``iter_utterances()``;
            every yielded item must have a ``meta`` mapping with ``'valid'`` and
            ``'event'`` entries.

    Returns:
        A ``(convo_counts, utt_counts)`` pair of ``defaultdict(int)`` objects
        keyed by the ``'event'`` value (which may be ``None``). Only items whose
        ``meta['valid']`` is truthy are counted.

    Example:
        convo_counts, utt_counts = get_utt_convo_counts(corpus)
    """
    convo_counts = defaultdict(int)
    utt_counts = defaultdict(int)

    for convo in corpus.iter_conversations():
        if convo.meta['valid']:
            convo_counts[convo.meta['event']] += 1

    for utt in corpus.iter_utterances():
        if utt.meta['valid']:
            utt_counts[utt.meta['event']] += 1

    return convo_counts, utt_counts
    

In [None]:
# Load each subreddit's filtered corpus and label its conversations/utterances.
for subreddit in subreddits:
    # NOTE(review): `corpus` is rebound on every pass, so once the loop finishes
    # only the last subreddit's labelled corpus is still in scope.
    corpus = Corpus(filename='/Users/calebchiam/Documents/{}-filtered-corpus'.format(subreddit))

    for convo in corpus.iter_conversations():
        # Label the conversation by event time category (None if outside every window).
        event = identify_timestamp_category(convo.meta['timestamp'])
        convo.meta['event'] = event

        # Tokenize and stem the title.
        tokens = tokenize_and_stem(convo.meta['title'])
        convo.meta['stem_tokens'] = tokens

        # A conversation is "valid" (actually associated with the event) when its
        # title shares a stem with the generic vocabulary or with the keywords of
        # its own event. Events without a keyword entry fall back to an empty set.
        event_words = specific_words_list.get(event, set())
        is_valid = bool(tokens.intersection(generic_words)) or bool(tokens.intersection(event_words))
        convo.meta['valid'] = is_valid

        # Utterances inherit both labels from their conversation.
        for utt in convo.iter_utterances():
            utt.meta['event'] = event
            utt.meta['valid'] = is_valid
    

## Let's see a distribution of the counts

In [24]:
# Display the per-event conversation tallies; the None key collects
# conversations whose event label is None.
convo_counts

defaultdict(int,
            {'San Bernardino attack': 887,
             'Orlando nightclub shooting': 1770,
             'Sandy Hook Elementary School shooting': 3167,
             'Stoneman Douglas High School shooting': 2674,
             'Las Vegas shooting': 1729,
             'Sutherland Springs church shooting': 811,
             'Geneva County massacre': 181,
             'Washington Navy Yard shooting': 483,
             'Fort Hood shooting': 513,
             'Aurora theater shooting': 1461,
             None: 1055,
             'Binghamton shootings': 186,
             'Santa Fe High School shooting': 808})

In [25]:
# Display the per-event utterance tallies; the None key collects utterances
# whose event label is None.
utt_counts

defaultdict(int,
            {'San Bernardino attack': 42854,
             'Orlando nightclub shooting': 87024,
             'Sandy Hook Elementary School shooting': 84810,
             'Stoneman Douglas High School shooting': 218585,
             'Las Vegas shooting': 97896,
             'Sutherland Springs church shooting': 48353,
             'Geneva County massacre': 1523,
             'Washington Navy Yard shooting': 15286,
             'Fort Hood shooting': 6154,
             'Aurora theater shooting': 44642,
             None: 37173,
             'Binghamton shootings': 2047,
             'Santa Fe High School shooting': 80168})

In [None]:
from tqdm import tqdm

In [None]:
# Attach stemmed tokens to every utterance; the utterances are materialised
# into a list before being handed to tqdm.
for utt in tqdm(list(corpus.iter_utterances())):
    # Utterances that already carry a 'stem_tokens' entry are left untouched.
    if "stem_tokens" not in utt.meta:
        # Only valid utterances are tokenized; the rest get an explicit None.
        utt.meta['stem_tokens'] = tokenize_and_stem(utt.text) if utt.meta['valid'] else None

In [None]:
# Write the labelled corpus to disk under the name "politics-filtered-labelled".
# NOTE(review): `corpus` is whatever was bound last; if the subreddit loop ran
# immediately before this cell, that is the final entry of `subreddits` rather
# than politics — confirm the right corpus is loaded before dumping.
corpus.dump("politics-filtered-labelled", base_path="/Users/calebchiam/Documents")

In [3]:
# Load the labelled corpus; the path matches the name/base_path used by the dump above.
corpus = Corpus(filename='/Users/calebchiam/Documents/politics-filtered-labelled')