In [1]:
import os
os.chdir('../../..')

In [2]:
from convokit import Corpus, LanguageModel, download

In [4]:
forum_corpus = Corpus(filename="/Users/calebchiam/Documents/GitHub/cs6742-fork/datasets/gun_debate_forum_corpus_full")

In [5]:
forum_corpus.print_summary_stats()

Number of Users: 2010
Number of Utterances: 457973
Number of Conversations: 4104


In [6]:
next(forum_corpus.iter_conversations())

Conversation({'_owner': <convokit.model.corpus.Corpus object at 0x12b0ff588>, '_id': 1, '_utterance_ids': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52], '_usernames': None, '_meta': {}})

In [7]:
for convo in forum_corpus.iter_conversations():
    convo._utterance_ids = [str(i) for i in convo._utterance_ids]
    convo._id = str(convo.id)

In [8]:
from cleantext import clean

clean_str = lambda s: clean(s,
    fix_unicode=True,               # fix various unicode errors
    to_ascii=True,                  # transliterate to closest ASCII representation
    lower=True,                     # lowercase text
    no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
    no_urls=True,                  # replace all URLs with a special token
    no_emails=True,                # replace all email addresses with a special token
    no_phone_numbers=True,         # replace all phone numbers with a special token
    no_numbers=True,               # replace all numbers with a special token
    no_digits=False,                # replace all digits with a special token
    no_currency_symbols=True,      # replace all currency symbols with a special token
    no_punct=False,                 # fully remove punctuation
    replace_with_url="<URL>",
    replace_with_email="<EMAIL>",
    replace_with_phone_number="<PHONE>",
    replace_with_number="<NUMBER>",
    replace_with_digit="0",
    replace_with_currency_symbol="<CUR>",
    lang="en"                    
)

In [11]:
for utt in forum_corpus.iter_utterances():
    utt.id = str(utt.id)
    utt.text = clean_str(utt.text)
    utt.root = str(utt.root)

In [12]:
for user in forum_corpus.iter_users():
    if 'lean' in user.meta:
        user.meta['lean'] = user.meta['lean'].strip()
    else:
        user.meta['lean'] = None

In [13]:
forum_corpus.dump("gun_debate_forum_corpus_fixed", base_path="/Users/calebchiam/Documents/GitHub/cs6742-fork/datasets")

## Load the fixed corpus

In [14]:
forum_corpus = Corpus(filename="/Users/calebchiam/Documents/GitHub/cs6742-fork/datasets/gun_debate_forum_corpus_fixed")

In [15]:
forum_corpus.print_summary_stats()

Number of Users: 2010
Number of Utterances: 457973
Number of Conversations: 4104


In [16]:
for convo in forum_corpus.iter_conversations():
    year = '20' + convo.meta['posted_date'].split('-')[-1]
    if year == '20Yesterday':
        year = '2019'
    for utt in convo.iter_utterances():
        utt.meta['year'] = year

In [17]:
from collections import defaultdict
leanings = defaultdict(int)
for user in forum_corpus.iter_users():
    if 'lean' in user.meta:
        leanings[user.meta['lean']] += 1 

In [18]:
leanings

defaultdict(int,
            {'Libertarian': 117,
             'Slightly Conservative': 49,
             'Progressive': 77,
             'Undisclosed': 428,
             'Liberal': 105,
             'Libertarian - Right': 83,
             'Conservative': 215,
             'Very Conservative': 87,
             'Slightly Liberal': 56,
             'Independent': 240,
             'Centrist': 76,
             'Other': 182,
             'Socialist': 50,
             'Libertarian - Left': 39,
             None: 61,
             'Private': 19,
             'Moderate': 73,
             'Very Liberal': 44,
             'Communist': 9})

In [19]:
# Liberal, Very Liberal, Progressive
# Libertarian - Right, Conservative, Very Conservative
# Independent, Centrist, Moderate

In [20]:
from collections import defaultdict
year_to_utts = defaultdict(list)
for utt in forum_corpus.iter_utterances():
    year_to_utts[utt.meta['year']].append(utt)

In [21]:
for year, utts in year_to_utts.items():
    print((year, len(utts)))

('2019', 54714)
('2018', 72113)
('2017', 54629)
('2015', 50248)
('2014', 52090)
('2013', 103768)
('2016', 49123)
('2012', 21288)


In [22]:
# for the set of liberal users
# sample up to 3 utts per user per year
# sum the utterances from all users for each year
# use the minimum - some, to get a sample for every year

In [23]:
antigun_users = []
progun_users = []
for user in forum_corpus.iter_users():
    lean = user.meta.get('lean', None)
    if lean in {'Liberal', 'Very Liberal', 'Progressive'}:
        antigun_users.append(user)
    elif lean in {'Libertarian - Right', 'Conservative', 'Very Conservative'}:
        progun_users.append(user)

In [24]:
from random import sample

In [25]:
for user in antigun_users + progun_users:
    user.meta['year_utts'] = defaultdict(list)
    for utt in user.iter_utterances():
        user.meta['year_utts'][utt.meta['year']].append(utt)
    
    for year in user.meta['year_utts']:
        if len(user.meta['year_utts'][year]) > 3: 
            user.meta['year_utts'][year] = sample(user.meta['year_utts'][year], 3)

In [26]:
antigun_year_utts = defaultdict(list)
progun_year_utts = defaultdict(list)

In [27]:
for user in antigun_users:
    for year, utts in user.meta['year_utts'].items():
        antigun_year_utts[year].extend(utts)

In [28]:
for year, utts in antigun_year_utts.items():
    print((year, len(utts)))

('2019', 177)
('2016', 159)
('2015', 163)
('2014', 166)
('2013', 237)
('2012', 129)
('2018', 189)
('2017', 143)


In [29]:
## Minimum is 129 utts for antigun, so take 100

In [30]:
for user in progun_users:
    for year, utts in user.meta['year_utts'].items():
        progun_year_utts[year].extend(utts)

In [31]:
for year, utts in progun_year_utts.items():
    print((year, len(utts)))

('2019', 240)
('2018', 319)
('2017', 276)
('2015', 261)
('2014', 299)
('2013', 392)
('2016', 278)
('2012', 225)


In [None]:
# minimum is 225 for progun, so take 200

In [35]:
antigun_sampled_utts = []
for year, utts in antigun_year_utts.items():
    antigun_sampled_utts.extend(sample(utts, 100))
print(len(antigun_sampled_utts))

800


In [37]:
progun_sampled_utts = []
for year, utts in progun_year_utts.items():
    progun_sampled_utts.extend(sample(utts, 200))
print(len(progun_sampled_utts))

1600


In [40]:
from nltk import sent_tokenize

In [41]:
with open('progun_forum_corpus_full.txt', 'w') as f:
    for utt in progun_sampled_utts:
        for sentence in sent_tokenize(utt.text):
            if len(sentence) > 5:
                f.write(sentence)
                f.write("\n")

In [42]:
with open('antigun_forum_corpus_full.txt', 'w') as f:
    for utt in progun_sampled_utts:
        for sentence in sent_tokenize(utt.text):
            if len(sentence) > 5:
                f.write(sentence)
                f.write("\n")

## whole corpus training

In [65]:
for user in forum_corpus.iter_users():
    user.meta['year_utts'] = defaultdict(list)
    for utt in user.iter_utterances():
        user.meta['year_utts'][utt.meta['year']].append(utt)
    
    for year in user.meta['year_utts']:
        if len(user.meta['year_utts'][year]) > 3: 
            user.meta['year_utts'][year] = sample(user.meta['year_utts'][year], 3)

In [66]:
year_utts = defaultdict(list)

In [67]:
for user in forum_corpus.iter_users():
    for year, utts in user.meta['year_utts'].items():
        year_utts[year].extend(utts)

In [69]:
for year, utts in year_utts.items():
    print((year, len(utts)))

('2019', 1392)
('2017', 1340)
('2018', 1660)
('2015', 1437)
('2016', 1369)
('2013', 2112)
('2014', 1464)
('2012', 1146)


In [70]:
sampled_utts = []
for year, utts in year_utts.items():
    sampled_utts.extend(sample(utts, 1000))

In [72]:
with open('whole_forum_corpus_full.txt', 'w') as f:
    for utt in sampled_utts:
        for sentence in sent_tokenize(utt.text):
            if len(sentence) > 5:
                f.write(sentence)
                f.write("\n")

## Distribution of utterances

In [None]:
import matplotlib.pyplot as plt

In [None]:
num_utts_antigun_users = [len(list(user.iter_utterances())) for user in antigun_users]
num_utts_progun_users = [len(list(user.iter_utterances())) for user in progun_users]
num_utts_moderate_users = [len(list(user.iter_utterances())) for user in moderate_users]

In [None]:
plt.hist(num_utts_antigun_users, bins=20)

Some people say a LOT.

In [None]:
plt.hist(num_utts_progun_users, bins=20) # holy shit that guy at 50000

In [None]:
plt.hist(num_utts_moderate_users, bins=20) # holy shit that guy at 7500

In [None]:
import numpy as np

In [None]:
print(np.median(num_utts_antigun_users))
print(np.mean(num_utts_antigun_users))

In [None]:
print(np.median(num_utts_progun_users))
print(np.mean(num_utts_progun_users))

To normalize, we take up to 5 utterances per user. If the user has <= 5 utterances, take all of them.

In [None]:
from random import sample

In [None]:
progun_utts = []
for user in progun_users:
    user_utts = list(user.iter_utterances())
    if len(user_utts) <= 5:
        progun_utts.extend(user_utts)
    else:
        progun_utts.extend(sample(user_utts, 5))

In [None]:
antigun_utts = []
for user in antigun_users:
    user_utts = list(user.iter_utterances())
    if len(user_utts) <= 5:
        antigun_utts.extend(user_utts)
    else:
        antigun_utts.extend(sample(user_utts, 5))

In [None]:
moderate_utts = []
for user in moderate_users:
    user_utts = list(user.iter_utterances())
    if len(user_utts) <= 5:
        moderate_utts.extend(user_utts)
    else:
        moderate_utts.extend(sample(user_utts, 5))

In [None]:
len(antigun_utts)

In [None]:
len(progun_utts)

In [None]:
len(moderate_utts)

In [None]:
[utt.text for utt in antigun_utts[:10]]

In [None]:
[utt.text for utt in progun_utts[:10]]

In [None]:
def clean_text(txt):
    return txt.replace("\n", " ").replace('\x92', "'")

In [None]:
progun_text = ''
for utt in progun_utts:
    if len(utt.text) >= 25:
        if utt.text.endswith('.'):
            progun_text += clean_text(utt.text) + ' '
        else:
            progun_text += clean_text(utt.text) + '. '

In [None]:
antigun_text = ''
for utt in antigun_utts:
    if len(utt.text) >= 25:
        if utt.text.endswith('.'):
            antigun_text += clean_text(utt.text) + ' '
        else:
            antigun_text += clean_text(utt.text) + '. '

In [None]:
moderate_text = ''
for utt in moderate_utts:
    if len(utt.text) >= 25:
        if utt.text.endswith('.'):
            moderate_text += clean_text(utt.text) + ' '
        else:
            moderate_text += clean_text(utt.text) + '. '

In [None]:
from random import sample
all_text = ''
for user in forum_corpus.iter_users():
    user_utts = list(user.iter_utterances())
    user_utts = [utt for utt in user_utts if len(utt.text) >= 25]
    if len(user_utts) >= 3:
        selected_utts = sample(user_utts, 3)
    else:
        selected_utts = user_utts
    
    for utt in selected_utts:
        if utt.text.endswith('.'):
            all_text += clean_text(utt.text) + ' '
        else:
            all_text += clean_text(utt.text) + '. '

In [None]:
len(progun_text)

In [None]:
len(antigun_text)

In [None]:
len(moderate_text)

In [None]:
from nltk import sent_tokenize

In [None]:
with open('progun_forum_corpus_full.txt', 'w') as f:
    for sentence in sent_tokenize(progun_text):
        if len(sentence) > 5:
            f.write(sentence)
            f.write("\n")

In [None]:
with open('antigun_forum_corpus_full.txt', 'w') as f:
    for sentence in sent_tokenize(antigun_text):
        if len(sentence) > 5:
            f.write(sentence)
            f.write("\n")

In [None]:
with open('moderate_forum_corpus_full.txt', 'w') as f:
    for sentence in sent_tokenize(moderate_text):
        if len(sentence) > 5:
            f.write(sentence)
            f.write("\n")

In [None]:
with open('whole_forum_corpus_full.txt', 'w') as f:
    for sentence in sent_tokenize(all_text):
        if len(sentence) > 5:
            f.write(sentence)
            f.write("\n")

## Training step

In [5]:
lm = LanguageModel(SRILM_path='/Users/calebchiam/Documents/GitHub/cs6742-fork/convokit/SRILM/srilm-1.7.3',
                  working_dir='/Users/calebchiam/Documents/GitHub/cs6742-fork/convokit/SRILM/dump/',
                  lm_output_path='whole_forum_full_new.lm',
                  lm_type='laplace',
                  count_output_path='whole_forum_counts.txt',
                  order=2,
                  verbose=True)

In [6]:
lm.train('whole_forum_corpus_full.txt')




In [7]:
lm = LanguageModel(SRILM_path='/Users/calebchiam/Documents/GitHub/cs6742-fork/convokit/SRILM/srilm-1.7.3',
                  working_dir='/Users/calebchiam/Documents/GitHub/cs6742-fork/convokit/SRILM/dump/',
                  lm_output_path='progun_forum_full_new.lm',
                  lm_type='laplace',
                  count_output_path='progun_forum_counts.txt',
                  order=2,
                  verbose=True)

In [8]:
lm.train('progun_forum_corpus_full.txt')




In [12]:
lm.str_perplexity("i love guns and freedom.")

file /Users/calebchiam/Documents/GitHub/cs6742-fork/convokit/SRILM/dump/temp/1961692460021076566.txt: 1 sentences, 5 words, 0 OOVs
0 zeroprobs, logprob= -18.79049 ppl= 1354.406 ppl1= 5729.26


1354.406

In [10]:
lm.str_perplexity("i love muffins and xlotl.")

file /Users/calebchiam/Documents/GitHub/cs6742-fork/convokit/SRILM/dump/temp/7130116306463723547.txt: 1 sentences, 5 words, 0 OOVs
0 zeroprobs, logprob= -17.89655 ppl= 961.0782 ppl1= 3795.863


961.0782

In [60]:
lm.str_perplexity("i hate guns.")

file /Users/calebchiam/Documents/GitHub/cs6742-fork/convokit/SRILM/dump/temp/3495910162948954265.txt: 1 sentences, 3 words, 0 OOVs
0 zeroprobs, logprob= -10.92274 ppl= 537.8788 ppl1= 4374.341


537.8788

In [63]:
lm.str_perplexity("we ought to have a right to arm ourselves")

file /Users/calebchiam/Documents/GitHub/cs6742-fork/convokit/SRILM/dump/temp/4482285361943563228.txt: 1 sentences, 9 words, 0 OOVs
0 zeroprobs, logprob= -28.12978 ppl= 650.0974 ppl1= 1335.155


650.0974

In [64]:
lm.str_perplexity("we ought not have a right to arm ourselves")

file /Users/calebchiam/Documents/GitHub/cs6742-fork/convokit/SRILM/dump/temp/6963016778896762089.txt: 1 sentences, 9 words, 0 OOVs
0 zeroprobs, logprob= -26.91023 ppl= 490.9343 ppl1= 977.2957


490.9343

In [14]:
lm2 = LanguageModel(SRILM_path='/Users/calebchiam/Documents/GitHub/cs6742-fork/convokit/SRILM/srilm-1.7.3',
                  working_dir='/Users/calebchiam/Documents/GitHub/cs6742-fork/convokit/SRILM/dump/',
                  lm_output_path='antigun_forum_full_new.lm',
                  lm_type='laplace',
                  count_output_path='antigun_forum_counts.txt',
                  order=2,
                  verbose=False)

In [15]:
lm2.train('antigun_forum_corpus_full.txt')




In [18]:
lm2.str_perplexity("i love guns.")

406.6405

In [19]:
lm2.str_perplexity("i hate guns.")

537.8788

In [None]:
lm2.str_perplexity("We ought to have a right to arm ourselves.")

In [None]:
lm2.str_perplexity("We ought not have a right to arm ourselves.")