In [2]:
import pandas as pd
import glob, os, sys
import xml.etree.ElementTree as ET
from tqdm import tqdm_notebook as tqdm
sys.path.append('../scripts/')
import util
import parsing_util
from importlib import reload
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import consensus_score
import json
from collections import defaultdict

# Parse text

In [251]:
# Input directory of parse files and output directory for the per-document JSON dumps.
data_dir = "../data/news-article-flatlist/"
stanford_dir = os.path.join(data_dir, 'stanford-parses')
text_dir = os.path.join(data_dir, 'sources-and-docs-for-tm')

stanford_parses = glob.glob(os.path.join(stanford_dir, '*'))

# Parse every file; keep and persist only the documents with a non-empty
# `source_sentences` entry.
parsed_texts = []
for xml_file in tqdm(stanford_parses):
    t = parsing_util.parse_people_and_docs(
        xml_file,
        include_all_mentions=False,
        include_all_sentences_in_doc=False
    )
    if len(t['source_sentences']) > 0:
        doc_id = t['doc_id']
        parsed_texts.append(t)
        # The context manager guarantees the JSON is flushed and the handle is closed
        # (the bare `open(...)` previously passed to json.dump was never closed explicitly).
        with open(os.path.join(text_dir, doc_id + '.json'), 'w') as f:
            json.dump(t, f)

In [None]:
# NOTE(review): this cell repeats the parsing loop of the preceding cell and depends on
# `stanford_dir` / `text_dir` being defined there; consider deleting one of the two.
stanford_parses = glob.glob(os.path.join(stanford_dir, '*'))

# Parse every file; keep and persist only the documents with a non-empty
# `source_sentences` entry.
parsed_texts = []
for xml_file in tqdm(stanford_parses):
    t = parsing_util.parse_people_and_docs(
        xml_file,
        include_all_mentions=False,
        include_all_sentences_in_doc=False
    )
    if len(t['source_sentences']) > 0:
        doc_id = t['doc_id']
        parsed_texts.append(t)
        # Close (and flush) each output file deterministically instead of leaking the handle.
        with open(os.path.join(text_dir, doc_id + '.json'), 'w') as f:
            json.dump(t, f)

# Vectorize text

In [4]:
# Paths of every entry directly inside `text_dir` (the per-document JSON dumps written above).
text_jsons = glob.glob(os.path.join(text_dir, '*'))

In [5]:
# Load every per-document JSON file back into memory, closing each file after reading.
parsed_texts = []
for text_json in text_jsons:
    with open(text_json) as f:
        parsed_texts.append(json.load(f))

In [14]:
# Preview the first rows; each record carries `doc_id`, `doc_sentences` and `source_sentences`.
pd.DataFrame(parsed_texts).head()

Unnamed: 0,doc_id,doc_sentences,source_sentences
0,1dab5d59-c916-11e9-a6c2-b831b5755f6c,lead : President Reagan 's new budget propose ...,{'David R. Obey': 'Representative David R. Obe...
1,1dab5d62-c916-11e9-ba39-b831b5755f6c,lead : John Cardinal O'Connor of New York toda...,{'John Cardinal O'Connor': 'lead : John Cardin...
2,1dab8413-c916-11e9-bcb3-b831b5755f6c,"lead : for four and a half hour Wednesday , Do...",{'Dominick P. Pannunzio': 'lead : for four and...
3,1dab8421-c916-11e9-8afa-b831b5755f6c,"lead : a winter storm , combine with unusually...","{'Clifford Crowley': 'Stay at home , if at all..."
4,1dace32f-c916-11e9-8d7a-b831b5755f6c,lead : four small polyp that appear to be nonc...,{'John Randolph Beahrs': 'small and ` Benign a...


# Build count vectorizers

In [6]:
# Flatten the parsed documents into two corpora: one string per source (the values of each
# document's `source_sentences` dict) and one string per document.
source_sentences = []
doc_sentences = []
for text in parsed_texts:
    source_sentences.extend(text['source_sentences'].values())
    doc_sentences.append(text['doc_sentences'])

# Separate vectorizers so documents and sources get their own document-frequency cut-offs.
doc_cv = CountVectorizer(min_df=.01, max_df=.5, stop_words='english')
source_cv = CountVectorizer(min_df=.001, max_df=.5, stop_words='english')
###
doc_cv.fit(doc_sentences)
source_cv.fit(source_sentences)
###
# Ordered union of both vocabularies (source words first, then unseen document words),
# renumbered 0..n-1. `dict.fromkeys` de-duplicates while keeping first-seen order, which is
# what the previous pandas concat/drop_duplicates/reset_index chain produced.
combined_vocab = {
    word: idx
    for idx, word in enumerate(dict.fromkeys([*source_cv.vocabulary_, *doc_cv.vocabulary_]))
}

full_cv = CountVectorizer(vocabulary=combined_vocab)

In [244]:
# NOTE(review): this cell repeats the vectorizer fitting already done in the previous cell.
# Separate vectorizers so documents and sources get their own document-frequency cut-offs.
doc_cv = CountVectorizer(min_df=.01, max_df=.5, stop_words='english')
source_cv = CountVectorizer(min_df=.001, max_df=.5, stop_words='english')
###
doc_cv.fit(doc_sentences)
source_cv.fit(source_sentences)
###
# Ordered union of both vocabularies (source words first, then unseen document words),
# renumbered 0..n-1. `dict.fromkeys` de-duplicates while keeping first-seen order, which is
# what the previous pandas concat/drop_duplicates/reset_index chain produced.
combined_vocab = {
    word: idx
    for idx, word in enumerate(dict.fromkeys([*source_cv.vocabulary_, *doc_cv.vocabulary_]))
}

In [245]:
# Vectorizer with the fixed, combined word -> index vocabulary; `map_words` below reads
# `full_cv.vocabulary`.
full_cv = CountVectorizer(vocabulary=combined_vocab)

# Transform sentences, write data output

In [311]:
# Scratch check: list the contents of the current working directory.
os.listdir('.')

['.ipynb_checkpoints',
 '1dab8413-c916-11e9-bcb3-b831b5755f6c.txt.xml',
 '1dab8421-c916-11e9-8afa-b831b5755f6c.txt.xml',
 '2019-09-022__allennlp.ipynb',
 '2019-09-15__examine-sources.ipynb',
 '2019-09-21__explore-persona-model.html',
 '2019-09-21__explore-persona-model.ipynb',
 '2019-09-21__explore-persona-model.pdf',
 '2019-09-27__allen-nlp-on-sources.ipynb',
 '2019-09-27__explore-tags-in-nyt-corpus.ipynb',
 '2019-10-05__annotation-frames.ipynb',
 '2019-10-08__anonymous-sources.ipynb',
 '2019-10-08__yago.ipynb',
 '2019-10-12__biclustering.ipynb',
 '2019-10-18__topic-model.ipynb']

In [326]:
# Scratch check: number of entries under the `3/` sub-directory of the parse folder.
len(glob.glob('../data/news-article-flatlist/stanford-parses/3/*'))

2018

In [314]:
import shutil

In [246]:
def map_words(input_text, cutoff=None, vocabulary=None):
    """Map the whitespace-separated tokens of ``input_text`` to vocabulary indices.

    Tokens that are not keys of the vocabulary are silently skipped; the relative order of
    the remaining tokens is preserved.

    Parameters
    ----------
    input_text : str
        Text to convert; it is split with ``str.split()`` (no lower-casing or other
        normalisation is applied).
    cutoff : int or None, optional
        Used as the stop of a slice over the mapped indices, so ``None`` keeps everything
        and a positive ``k`` keeps the first ``k`` in-vocabulary tokens.
    vocabulary : mapping or None, optional
        Token -> index mapping. When ``None`` (the default) the notebook-level
        ``full_cv.vocabulary`` is used, which is the original behaviour.

    Returns
    -------
    list
        Vocabulary indices of the in-vocabulary tokens, truncated by ``cutoff``.
    """
    # NOTE(review): tokens are matched verbatim, so capitalised tokens will only match if
    # the vocabulary contains them in that exact form -- confirm this is intended.
    if vocabulary is None:
        vocabulary = full_cv.vocabulary
    output = [vocabulary[word] for word in input_text.split() if word in vocabulary]
    return output[:cutoff]

# Build one JSON-serialisable record per parsed document.
# NOTE(review): `s_id2label` is only defined in a later cell of this notebook, so this cell
# depends on that cell having been run first.
text_output = []
for doc_num, text in enumerate(parsed_texts):
    doc_id = text['doc_id']
    sources = text['source_sentences']

    ## configure sources: ids have the form S_<document position>_<source position>
    source_map = {
        'S_%s_%s' % (doc_num, source_num): name
        for source_num, name in enumerate(sources)
    }
    source_vecs = {
        source_id: map_words(sources[name], cutoff=100)
        for source_id, name in source_map.items()
    }

    doc_chunk = {
        'doc_vec': map_words(text['doc_sentences'], cutoff=200),
        'doc_id': doc_id,
        'source_map': source_map,
        'source_vecs': source_vecs,
        # Labels exist only for annotated documents; everything else gets an empty dict.
        'source_labels': s_id2label[doc_id] if doc_id in s_id2label else {},
    }
    text_output.append(doc_chunk)

# NOTE(review): this total number of sources is computed but neither displayed nor stored,
# because it is not the last expression of the cell.
sum(list(map(lambda x: len(x['source_map']), text_output)))

# One JSON document per line.
with open('../models/topic_model/input_data/doc_source.json', 'w') as f:
    for doc_chunk in text_output:
        f.write(json.dumps(doc_chunk))
        f.write('\n')

# Write the vocabulary ordered explicitly by index, so that line i of vocab.txt is always the
# word whose index is i (previously this relied on dict insertion order matching the indices).
with open('../models/topic_model/input_data/vocab.txt', 'w') as f:
    for word in sorted(full_cv.vocabulary, key=full_cv.vocabulary.get):
        f.write(word)
        f.write('\n')

# Read in data

In [1]:
import json

In [6]:
# Read the JSON-lines file written above, one string per line; the handle is closed on exit.
with open('../models/topic_model/input_data/doc_source.json') as f:
    input_json_strs = f.read().split('\n')

In [9]:
# Decode every non-empty line (the split leaves an empty string after the final newline).
input_jsons = [json.loads(line) for line in input_json_strs if line]

In [37]:
# Read the vocabulary, one word per line; the handle is closed on exit.
# NOTE(review): vocab.txt is written with a newline after every word, so this split leaves a
# trailing empty string as the last entry. It is kept as-is because later cells use `vocab`
# as the index for the sampler's word counts -- confirm before trimming it.
with open('../models/topic_model/input_data/vocab.txt') as f:
    vocab = f.read().split('\n')

# Topic Model

In [134]:
import sys

In [135]:
sys.path.append('../models/topic_model/')

In [136]:
import sampler
from importlib import reload

In [137]:
# Re-import the `sampler` module so that edits to sampler.py are picked up by this kernel.
reload(sampler)

<module 'sampler' from '../models/topic_model\\sampler.py'>

In [33]:
# Instantiate the sampler on the decoded documents and the vocabulary read from vocab.txt.
gibbs = sampler.BOW_Source_GibbsSampler(docs=input_jsons, vocab=vocab)

In [34]:
# FIXME: the saved output of this cell is `IndexError: invalid index to scalar variable.`;
# the failure has to be resolved in sampler.py before the notebook can be run top to bottom.
gibbs.initialize()

IndexError: invalid index to scalar variable.

In [28]:
import numpy as np 

In [71]:
import os
import glob

In [None]:
# Load every annotation file matching the glob and concatenate their records.
tagged_files = glob.glob('../app/data/*-marke*')
tags = []
for tagged_file in tagged_files:
    # Close each file after reading instead of leaking the handle.
    with open(tagged_file) as fh:
        tags.extend(json.load(fh))

tags_df = pd.DataFrame(tags)
# Pivot to one row per (doc_id, person): each group's (question_class, label) pairs become a
# dict, which is expanded into columns. Only rows whose `error` answer is 'legit' are kept.
legit_tagged_sources = (
    tags_df
        .groupby(['doc_id', 'person'])[['question_class', 'label']]
        .apply(lambda df: dict(df.itertuples(index=False)))
        .apply(pd.Series)
        .fillna('')
        .loc[lambda df: df['error'] == 'legit']
)

# Replace hyphens in person names with spaces.
# NOTE(review): presumably this makes them match the names in `source_map` -- confirm.
legit_tagged_sources = (
    legit_tagged_sources
        .reset_index()
        .assign(person=lambda df: df['person'].str.replace('-', ' '))
        .set_index(['doc_id', 'person'])
)

# For every source take the first non-empty value among the '*-role*' columns.
# `.iloc[0]` makes the positional lookup explicit; plain `[0]` on a label-indexed Series
# relies on a deprecated positional fallback. A row without any role still raises IndexError.
role = (
    legit_tagged_sources
        [[col for col in legit_tagged_sources.columns if '-role' in col]]
        .apply(lambda x: x[x != ''].iloc[0], axis=1)
       )

affil = legit_tagged_sources['affiliation']
# Final label string: '<affiliation>-<role>'.
legit_tags = affil + '-' + role



# Distinct label strings, in order of first appearance; computed once and reused below so the
# file order and the integer ids are guaranteed to agree.
unique_tags = legit_tags.unique()

with open('../models/topic_model/input_data/roles.txt', 'w') as f:
    for tag in unique_tags:
        f.write(tag)
        f.write('\n')

# label string -> integer id (its position in roles.txt).
label2l_id_map = {v:k for k,v in enumerate(unique_tags)}

# doc_id -> {source id -> label id} for every tagged person that appears in the document's
# `source_map`.
# NOTE(review): `doc_idx_to_chunks` is not defined in any visible cell of this notebook; it is
# presumably a doc_id -> doc_chunk lookup built from `text_output` -- confirm and define it.
s_id2label = defaultdict(dict)
# The loop variable is named `tag_label` (not `role`) so it no longer overwrites the `role`
# Series defined earlier in this cell.
for doc_id, person, tag_label in legit_tags.reset_index().itertuples(index=False):
    s_id2source = doc_idx_to_chunks[doc_id]['source_map']
    source2s_id = {v:k for k,v in s_id2source.items()}
    if person in source2s_id:
        s_id = source2s_id[person]
        s_id2label[doc_id][s_id] = label2l_id_map[tag_label]

In [165]:
# Persist the labels as a flat CSV: the index levels become columns next to a `label` column.
legit_tags.to_frame('label').reset_index().to_csv('../models/topic_model/input_data/labels.csv', index=False)

In [113]:
# Preview the pivoted annotations (one row per doc_id/person pair).
legit_tagged_sources.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,error,affiliation,gov-branch,gov-person-role,victim-person-role,academic-person-role,witness-person-role,actor-affiliation-type,actor-individual-person-role,actor-group-person-role,corporate-person-role,lawyer-role,ngo-person-role,stance,current
doc_id,person,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1dc4d8b2-c916-11e9-a719-b831b5755f6c,Alan Cranston,legit,government,legislative,decision-maker,,,,,,,,,,opposing,
1dc4d8b2-c916-11e9-a719-b831b5755f6c,Albert R. Brashear,legit,government,executive,spokesman,,,,,,,,,,supporting,
1dc4d8b2-c916-11e9-a719-b831b5755f6c,Bob Dole,legit,government,legislative,decision-maker,,,,,,,,,,informational,
1dc4d8b2-c916-11e9-a719-b831b5755f6c,Byrd,legit,government,legislative,decision-maker,,,,,,,,,,supporting,
1dc4d8b2-c916-11e9-a719-b831b5755f6c,Edmund S. Muskie,legit,government,legislative,decision-maker,,,,,,,,,,opposing,


# Explore Early Results

In [125]:
import pickle

In [139]:
from sampler import BOW_Source_GibbsSampler

In [141]:
# NOTE(review): this rebinds the name `sampler` from the imported module to the unpickled
# object, so `reload(sampler)` / `sampler.BOW_Source_GibbsSampler` no longer work after this
# cell; later cells rely on `sampler` being the trained object, hence the name is kept.
# pickle.load can execute arbitrary code -- only load files produced by this project.
with open('../models/topic_model/trained-sampler.pkl', 'rb') as f:
    sampler = pickle.load(f)

In [204]:
# Display the trained sampler's `sourcetype_by_wordtopic__wordtopic_counts` array as a table.
pd.DataFrame(sampler.sourcetype_by_wordtopic__wordtopic_counts)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1193.0,917.0,846.0,1261.0,1103.0,1014.0,1690.0,1236.0,1513.0,1001.0,...,1201.0,1056.0,1018.0,1545.0,778.0,889.0,990.0,908.0,927.0,650.0
1,5582.0,6687.0,5931.0,6785.0,8520.0,6537.0,6423.0,6512.0,6396.0,4974.0,...,5972.0,7806.0,6232.0,7073.0,7233.0,7756.0,7211.0,5671.0,6672.0,7868.0
2,4510.0,6494.0,4916.0,5365.0,5136.0,4776.0,6791.0,3971.0,4826.0,3780.0,...,5699.0,6049.0,5430.0,4700.0,5214.0,3931.0,6282.0,4973.0,5121.0,5967.0
3,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,5.0
4,2674.0,4399.0,2869.0,2786.0,3028.0,2653.0,2531.0,2644.0,4938.0,2065.0,...,2993.0,3456.0,3287.0,2039.0,3754.0,2318.0,3256.0,2701.0,3579.0,4167.0
5,0.0,0.0,0.0,2.0,1.0,4.0,1.0,0.0,0.0,2.0,...,1.0,1.0,5.0,0.0,0.0,1.0,2.0,3.0,1.0,0.0
6,6.0,0.0,10.0,12.0,6.0,10.0,18.0,2.0,4.0,0.0,...,12.0,19.0,1.0,8.0,43.0,5.0,3.0,20.0,11.0,15.0
7,1958.0,1302.0,1149.0,1588.0,1056.0,1595.0,1015.0,1486.0,1338.0,1288.0,...,1557.0,1164.0,1589.0,1479.0,1436.0,1474.0,1164.0,1064.0,1351.0,1054.0
8,4774.0,6315.0,6920.0,7924.0,7930.0,6835.0,6405.0,6745.0,6609.0,4890.0,...,6867.0,6395.0,7053.0,7040.0,7655.0,8492.0,6537.0,7122.0,6613.0,7377.0
9,2555.0,2123.0,3132.0,2244.0,2353.0,2615.0,2565.0,2693.0,2440.0,3035.0,...,2935.0,3094.0,2317.0,3448.0,4268.0,2599.0,4098.0,2960.0,2276.0,3802.0


In [239]:
# Word counts from the trained sampler, with rows labelled by `vocab`.
# NOTE(review): presumably rows are words and columns are word-topics -- confirm in sampler.py.
topic_df = pd.DataFrame(sampler.vocab_by_wordtopic__word_counts, index=vocab)

In [240]:
# Divide every column by its own total so that each column sums to 1.
topic_df = topic_df/topic_df.sum(axis=0)

In [241]:
import numpy as np 
from scipy.stats import entropy
from IPython.display import HTML

## get top words by topic
# NOTE(review): the slice keeps the words ranked 11th-20th in each column, i.e. it skips the
# ten highest-weighted words -- presumably to hide stopwords; confirm this is intended.
top_words_by_topic = [
    list(topic_df[i].sort_values(ascending=False).iloc[10:20].index)
    for i in topic_df.columns
]

## reformat into an HTML table for easier reading
ncols = 10
# One small table per topic: first word upper-cased as the header, remaining words as rows.
subtables = [
    '<table><tr>%s</tr>%s</table>' % (
        '<th>%s</th>' % words[0].upper(),
        ''.join('<tr><td>%s</td></tr>' % elem for elem in words[1:]),
    )
    for words in top_words_by_topic
]
# Lay the per-topic tables out `ncols` to a row; each entry of `table` is one row's cells.
table = [
    '\n'.join('<td>%s</td>' % cell for cell in subtables[start:start + ncols])
    for start in range(0, len(subtables), ncols)
]

In [242]:
# Transpose so every column is one topic, then promote the first row of words to the column
# labels and keep the remaining rows as the body.
ranked_words = pd.DataFrame(top_words_by_topic).T
header = ranked_words.loc[0].values
sorted_topic_df = ranked_words.iloc[1:, :]
sorted_topic_df.columns = header

In [243]:
# Wrap each pre-built row of cells from `table` in <tr> tags and render them as one HTML table.
HTML('<table>' + ''.join(list(map(lambda row: '<tr>%s</tr>' % row, table))) + '</table>')

0,1,2,3,4,5,6,7,8,9
BUTdobywhatthanwhichshemoremostthis,BUTwouldfromdoaboutmorewemosttelllike,ITbutmoreonemakewhichyearwhatwhetherfrom,BYbutdoabouttellsomewouldcampaignwhichtime,BUTatdowilloneyearwhoothermakethis,WOULDbyonedofrommeetingmoreallofficialwe,ATwouldmakedoaboutbutwhenaftermorewe,BUTitaboutwedowouldyearafterorattack,DOfromwhenbutafterthisyearweorwould,FROMbutdotodayaboutoneorwouldthanhere
BYdoitwhenwillmorebutthisofficialpeople,AFTERyearwouldbybutoverfromalsowilldo,YEARbydobutwecouldwouldwhenaboutthis,WHOdobyotherwhenafterorbutabouttime,WEbyitafterbutdowouldonetellhelp,WHOwoulddofromonebutaboutcallwhatwhen,FROMweaboutonethistheyyearthinktellafter,ABOUTcouldwhowouldatdomorewhichofficialwhen,WOULDwhobutwillwefromstateaboutoneget,WOULDdofromotheryearweitwhenafterwill
ITSaboutdothiswhosheofficialupbutmake,ITwouldaswhobeforeyearlastthisthereany,BYthisatwouldoverwewhatwhenmoremake,FROMtheybywhenabouttakethistodaymaketwo,THEYbutdowouldyearthiswhenoutafterwe,,,,,

BUT
do
by
what
than
which
she
more
most
this

BUT
would
from
do
about
more
we
most
tell
like

IT
but
more
one
make
which
year
what
whether
from

BY
but
do
about
tell
some
would
campaign
which
time

BUT
at
do
will
one
year
who
other
make
this

WOULD
by
one
do
from
meeting
more
all
official
we

AT
would
make
do
about
but
when
after
more
we

BUT
it
about
we
do
would
year
after
or
attack

DO
from
when
but
after
this
year
we
or
would

FROM
but
do
today
about
one
or
would
than
here

BY
do
it
when
will
more
but
this
official
people

AFTER
year
would
by
but
over
from
also
will
do

YEAR
by
do
but
we
could
would
when
about
this

WHO
do
by
other
when
after
or
but
about
time

WE
by
it
after
but
do
would
one
tell
help

WHO
would
do
from
one
but
about
call
what
when

FROM
we
about
one
this
they
year
think
tell
after

ABOUT
could
who
would
at
do
more
which
official
when

WOULD
who
but
will
we
from
state
about
one
get

WOULD
do
from
other
year
we
it
when
after
will

ITS
about
do
this
who
she
official
up
but
make

IT
would
as
who
before
year
last
this
there
any

BY
this
at
would
over
we
what
when
more
make

FROM
they
by
when
about
take
this
today
make
two

THEY
but
do
would
year
this
when
out
after
we


In [201]:
# Tabulate the sampler's `source_to_source_type` mapping: the two index levels are renamed to
# `doc_idx` / `source_id` and the value column to `source_type`.
(pd.Series(sampler.source_to_source_type)
 .reset_index()
 .rename(columns={'level_0': 'doc_idx', 'level_1': 'source_id', 0: 'source_type'})
)

Unnamed: 0,doc_idx,source_id,source_type
0,0,S_0_0,8
1,1,S_1_0,18
2,1,S_1_1,18
3,1,S_1_2,15
4,1,S_1_3,15
5,1,S_1_4,15
6,2,S_2_0,10
7,2,S_2_1,10
8,3,S_3_0,0
9,3,S_3_1,0


# Labels

In [254]:
# List the pickled samplers available in the trained-models directory.
os.listdir('../models/topic_model/trained-models/')

['trained-sampler-with-labels-with-stopwords.pkl',
 'trained-sampler-with-stopwords.pkl']

In [256]:
# Load the sampler trained with labels; the file handle is closed as soon as it is read.
# pickle.load can execute arbitrary code -- only load files produced by this project.
with open('../models/topic_model/trained-models/trained-sampler-with-labels-with-stopwords.pkl', 'rb') as f:
    labeled_sampler = pickle.load(f)

In [262]:
# Read the role labels, one per line; the handle is closed on exit.
# NOTE(review): roles.txt is written with a newline after every label, so this split leaves a
# trailing empty string as the last entry. It is kept as-is because the cells below use
# `roles` to label the labeled sampler's count matrices -- confirm before trimming it.
with open('../models/topic_model/input_data/roles.txt') as f:
    roles = f.read().split('\n')

In [270]:
# Source counts from the labeled sampler, with the role names as column labels.
# NOTE(review): the variable is called doctype_by_sourcetype while the attribute is
# sourcetype_by_doctype__source_counts -- confirm which axis is which. The last column label
# is the empty string left at the end of `roles`.
doctype_by_sourcetype = pd.DataFrame(labeled_sampler.sourcetype_by_doctype__source_counts, columns=roles)

In [272]:
# Preview the first rows of the role-labelled source counts.
doctype_by_sourcetype.head()

Unnamed: 0,government-decision-maker,government-spokesman,government-advisor,academic-other,actor-doctor,corporate-lawyer,corporate-spokesman,ngo-expert,academic-actor,actor-member,...,actor-leader,government-lawyer,witness-casual,actor-lawyer,actor-individual,corporate-decision-maker,victim-relative,government-witness,corporate-victim,Unnamed: 21
0,21.0,2.0,3.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,529.0,5.0,2.0,2.0,1.0,1.0,0.0,545.0,1.0,0.0,...,0.0,16.0,1.0,1.0,0.0,0.0,1.0,3.0,1.0,0.0
3,12.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0


In [274]:
# Label the rows of the labeled sampler's word-topic counts with the role names, then
# transpose so that each role becomes a column.
sourcetype_by_topic = pd.DataFrame(labeled_sampler.sourcetype_by_wordtopic__wordtopic_counts, index=roles).T

In [283]:
# For every role column, print its five largest counts together with their row labels.
for role_name in sourcetype_by_topic.columns:
    top_topics = sourcetype_by_topic[role_name].sort_values(ascending=False).head()
    print(top_topics)

15    5586.0
7     3746.0
20    3466.0
23    2784.0
24    2584.0
Name: government-decision-maker, dtype: float64
15    2144.0
7     1554.0
20    1009.0
23     960.0
14     889.0
Name: government-spokesman, dtype: float64
15    3101.0
23    2608.0
20    2597.0
18    2335.0
7     2310.0
Name: government-advisor, dtype: float64
18    85.0
23    66.0
15    51.0
14    50.0
16    45.0
Name: academic-other, dtype: float64
4     41.0
15    25.0
20    21.0
23    18.0
2     16.0
Name: actor-doctor, dtype: float64
24    1082.0
15    1010.0
23    1010.0
20     974.0
18     972.0
Name: corporate-lawyer, dtype: float64
21    1370.0
0     1154.0
23    1060.0
7     1014.0
15     953.0
Name: corporate-spokesman, dtype: float64
15    5019.0
20    3120.0
12    2514.0
4     2479.0
0     2187.0
Name: ngo-expert, dtype: float64
15    2783.0
24    2386.0
7     2205.0
14    1905.0
2     1800.0
Name: academic-actor, dtype: float64
20    59.0
16    55.0
8     23.0
15    18.0
5     13.0
Name: actor-member, dtype

In [281]:
# Map every source's assigned source type to its role name.
# The role names in `roles` belong to the *labeled* sampler, whose count matrices are indexed
# by them in the cells above. The previous code applied them to the unlabeled `sampler`,
# whose source-type ids do not appear to correspond to these roles.
# NOTE(review): assumes `labeled_sampler` exposes `source_to_source_type` like `sampler`
# does -- confirm.
source_to_sourcetype_df = (pd.Series(labeled_sampler.source_to_source_type)
     .apply(lambda x: roles[x])
     .reset_index()
     .rename(columns={'level_0':'doc', 'level_1':'source_id', 0:'source_role'})
)

In [303]:
# Scratch check: number of entries under the `0/` sub-directory of the parse folder.
len(os.listdir('../data/news-article-flatlist/stanford-parses/0'))

2573