Oct 30, 2019

Feasibility test for Sandy Hook in /r/politics.

In [7]:
from tqdm import tqdm
import os
os.chdir('../../')
from convokit import Corpus, User, Utterance

In [10]:
# Load the filtered politics corpus from disk.
# NOTE(review): the path is absolute and machine-specific; adjust it before running elsewhere.
corpus = Corpus(filename='/Users/calebchiam/Documents/politics-filtered-corpus') #1.96 GB, don't duplicate.

In [12]:
# Sanity check after loading: prints the number of users, utterances and conversations.
corpus.print_summary_stats()

Number of Users: 289890
Number of Utterances: 3237456
Number of Conversations: 94768


In [13]:
import nltk

In [29]:
# Observation window for each shooting event, keyed by event name.
# Each value is a (start, end) pair that is compared inclusively against
# conversation timestamps; every window below spans 691200 units
# (8 days if these are Unix epoch seconds — presumably they are; TODO confirm
# the timezone used to pick the start of each window).
shooting_timestamps = {
 'Fort Hood shooting': (1257397200, 1258088400),
 'Binghamton shootings': (1238731200, 1239422400),
 'Geneva County massacre': (1236657600, 1237348800),
 'Sandy Hook Elementary School shooting': (1355461200, 1356152400),
 'Aurora theater shooting': (1342756800, 1343448000),
 'Washington Navy Yard shooting': (1379304000, 1379995200),
 'San Bernardino attack': (1449032400, 1449723600),
 'Orlando nightclub shooting': (1465704000, 1466395200),
 'Las Vegas shooting': (1506830400, 1507521600),
 'Sutherland Springs church shooting': (1509854400, 1510545600),
 'Stoneman Douglas High School shooting': (1518584400, 1519275600),
 'Santa Fe High School shooting': (1526616000, 1527307200)}

# Events currently left out of the table (kept here, commented out, for reference):
# 'Virginia Tech shooting': (1176696000, 1177387200),
#  'Thousand Oaks shooting': (1541566800, 1542258000),
# 'Pittsburgh synagogue shooting': (1540612800, 1541304000)

In [32]:
def identify_timestamp_category(timestamp, windows=None):
    """Return the name of the event whose time window contains ``timestamp``.

    Args:
        timestamp: Value to classify; it is compared against the window bounds,
            so it must be orderable with them.
        windows: Optional mapping of event name -> (start, end). When omitted,
            the module-level ``shooting_timestamps`` table is used.

    Returns:
        The key of the first window (in mapping order) for which
        ``start <= timestamp <= end`` holds, or ``None`` when the timestamp
        lies outside every window.
    """
    # Compare against None explicitly so an empty mapping passed by the caller
    # is honoured instead of silently falling back to the global table.
    if windows is None:
        windows = shooting_timestamps
    for event_name, (start, end) in windows.items():
        # Both bounds are inclusive.
        if start <= timestamp <= end:
            return event_name
    return None

## Label conversations and utterances by event time category

In [33]:
# Tag every conversation with the event whose window contains its timestamp
# (None when it falls outside all windows), then copy that tag onto each of
# the conversation's utterances so both levels can be filtered by event.
for convo in corpus.iter_conversations():
    convo.meta['event'] = identify_timestamp_category(convo.meta['timestamp'])
    for utt in convo.iter_utterances():
        utt.meta['event'] = convo.meta['event']

## Tokenize titles with stemming

In [26]:
from nltk.stem import SnowballStemmer
# Single English Snowball stemmer instance, reused for title tokens and for the keyword lists.
stemmer = SnowballStemmer('english')

In [21]:
# Grab the first conversation yielded by the corpus for ad-hoc inspection.
# NOTE(review): this binds a module-level `convo`; the loops in other cells rebind the same name.
convo = next(corpus.iter_conversations())

In [27]:
def tokenize_and_stem(sentence):
    """Return the set of stemmed word tokens found in ``sentence``.

    Args:
        sentence: Text to process (e.g. a conversation title).

    Returns:
        A set containing the Snowball stem of every token produced by
        ``nltk.word_tokenize(sentence)``; duplicates collapse because the
        result is a set.
    """
    # Tokenize the argument itself. The previous version ignored `sentence`
    # and read the global `convo.meta['title']`, so it only returned the right
    # answer when the caller's loop variable happened to be named `convo`.
    return {stemmer.stem(word) for word in nltk.word_tokenize(sentence)}

In [28]:
# Cache each conversation's stemmed title tokens in its metadata under
# 'stem_tokens' so the keyword matching further down does not have to
# re-tokenize the titles.
for convo in corpus.iter_conversations():
    convo.meta['stem_tokens'] = tokenize_and_stem(convo.meta['title'])

In [58]:
# Stems of words that signal a shooting-related title regardless of the event.
generic_words = set(map(stemmer.stem, (
    'shooting', 'shot', 'deaths', 'died', 'injured',
    'guns', 'killing', 'attack', 'massacre', 'victim',
)))

## Generate specific words dictionaries for each event

In [74]:
# Hand-picked phrases (places, perpetrator names) for each event, keyed by the
# same event names as shooting_timestamps. The phrases are split into single
# words and stemmed in the next cell, so multi-word entries contribute each of
# their words individually.
# NOTE(review): broad words such as 'county', 'colorado' and 'Florida' will
# match many unrelated titles inside the window — confirm that is acceptable.
specific_words_raw = {
    'Fort Hood shooting': ['fort hood', 'nidal hasan'],
    'Binghamton shootings': ['binghamton', 'jiverly', 'antares', 'wong'],
    'Geneva County massacre': ['geneva', 'county', 'mclendon'],
    'Sandy Hook Elementary School shooting': ['sandy hook', 'adam lanza'],
    'Aurora theater shooting': ['aurora theater', 'james eagan holmes', 'colorado'],
    'Washington Navy Yard shooting': ['washington navy yard', 'aaron alexis', 'navy yard'],
    # 'san bernandino' was previously listed twice and the correct spelling was
    # missing; keep the misspelling once (it may occur in titles) and add the
    # correct 'san bernardino'.
    'San Bernardino attack': ['syed rizwan farook', 'tashfeen malik', 'san bernardino', 'san bernandino'],
    'Orlando nightclub shooting': ['omar mateen'],
    'Las Vegas shooting': ['las vegas shooting', 'stephen paddock'],
    'Sutherland Springs church shooting': ['sutherland springs', 'sutherland church', 'devin patrick kelley'],
    'Stoneman Douglas High School shooting': ['Marjory Stoneman Douglas High School', 'Parkland', 'Florida'],
    'Santa Fe High School shooting': ['santa fe high school', 'dimitrios pagourtzis'],
}


In [60]:
# Build, for every event, the set of stemmed single words drawn from its
# hand-picked phrases plus the event name itself, with the generic shooting
# vocabulary removed so only event-specific stems remain.
specific_words_list = {}
for k, v in specific_words_raw.items():
    # The event name (k) is treated as one more phrase alongside the listed ones.
    words = [w for phrase in v + [k] for w in phrase.split()]
    specific_words_list[k] = {stemmer.stem(w) for w in words} - generic_words

## Label conversations and utterances with whether they are *actually* associated with the event

In [76]:
# Flag a conversation as valid when its stemmed title shares at least one token
# with the generic shooting vocabulary or with the keyword set of the event
# whose window it falls in; every utterance inherits its conversation's flag.
# NOTE(review): conversations whose event is None (outside every window) are
# still flagged valid when the title contains a generic word — confirm that
# this is intended.
for convo in corpus.iter_conversations():
    event = convo.meta['event']
    tokens = convo.meta['stem_tokens']

    # .get(event, {}) yields an empty collection for events without a keyword
    # set (such as None), so only the generic vocabulary can match then.
    has_generic = not tokens.isdisjoint(generic_words)
    has_specific = not tokens.isdisjoint(specific_words_list.get(event, {}))
    convo.meta['valid'] = has_generic or has_specific

    for utt in convo.iter_utterances():
        utt.meta['valid'] = convo.meta['valid']

## Let's see a distribution of the counts

In [80]:
from collections import defaultdict
# Tally, per event label, how many conversations were flagged valid.
convo_counts = defaultdict(int)
for convo in corpus.iter_conversations():
    if convo.meta['valid']:
        convo_counts[convo.meta['event']] += 1

# Same tally at the utterance level.
utt_counts = defaultdict(int)
for utt in corpus.iter_utterances():
    if utt.meta['valid']:
        utt_counts[utt.meta['event']] += 1
    

In [81]:
# Valid conversations per event. The None key collects valid conversations
# whose timestamp matched no window in shooting_timestamps.
convo_counts

defaultdict(int,
            {'San Bernardino attack': 887,
             'Orlando nightclub shooting': 1770,
             'Sandy Hook Elementary School shooting': 3167,
             'Stoneman Douglas High School shooting': 2674,
             'Las Vegas shooting': 1729,
             'Sutherland Springs church shooting': 811,
             'Geneva County massacre': 181,
             'Washington Navy Yard shooting': 483,
             'Fort Hood shooting': 513,
             'Aurora theater shooting': 1461,
             None: 1055,
             'Binghamton shootings': 186,
             'Santa Fe High School shooting': 808})

In [82]:
# Valid utterances per event. The None key collects valid utterances whose
# conversation timestamp matched no window in shooting_timestamps.
utt_counts

defaultdict(int,
            {'San Bernardino attack': 42854,
             'Orlando nightclub shooting': 87024,
             'Sandy Hook Elementary School shooting': 84810,
             'Stoneman Douglas High School shooting': 218585,
             'Las Vegas shooting': 97896,
             'Sutherland Springs church shooting': 48353,
             'Geneva County massacre': 1523,
             'Washington Navy Yard shooting': 15286,
             'Fort Hood shooting': 6154,
             'Aurora theater shooting': 44642,
             None: 37173,
             'Binghamton shootings': 2047,
             'Santa Fe High School shooting': 80168})

In [83]:
# Save the labelled corpus under the name "politics-filtered-labelled" inside base_path.
# NOTE(review): base_path is an absolute, machine-specific directory; adjust it before running elsewhere.
corpus.dump("politics-filtered-labelled", base_path="/Users/calebchiam/Documents")