In [3]:
import pandas as pd
import glob, os, sys
import xml.etree.ElementTree as ET
from tqdm import tqdm_notebook as tqdm
from importlib import reload
import numpy as np
from matplotlib import pyplot as plt
import json
from collections import defaultdict

In [5]:
# Add the topic-model directory to the import path so modules stored there can be imported
# (presumably including `sampler`, imported in the next cell — confirm its location).
sys.path.append('../models/topic_model/')

In [6]:
import pickle
from sampler import BOW_Source_GibbsSampler

# Explore Early Results

In [396]:
# Load the pickled sampler checkpoint (iteration 40, per the filename).
# NOTE: pickle.load can execute arbitrary code; only load checkpoints from a trusted source.
# The context manager closes the file handle, which the bare open() call never did.
with open('../models/topic_model/trained-sampled-iter-40.pkl', 'rb') as checkpoint_file:
    sampler = pickle.load(checkpoint_file)

In [397]:
# Read the vocabulary, one word per line, closing the file once it has been read.
# split('\n') is kept as-is (rather than splitlines) so the number of entries,
# and therefore the index length, is exactly what it was before.
with open('../models/topic_model/input_data/vocab.txt') as vocab_file:
    vocab = vocab_file.read().split('\n')

# Word-by-topic count table, then normalise each column so that it sums to 1.
topic_df = pd.DataFrame(sampler.vocab_by_wordtopic__word_counts, index=vocab)
topic_df = topic_df/topic_df.sum(axis=0)

In [398]:
import numpy as np 
from scipy.stats import entropy
from IPython.display import HTML

## Collect, for every topic (column), the words ranked 11th-20th by weight.
## NOTE(review): the slice is [10:20], so the ten highest-weighted words are
## skipped — confirm this is intentional.
top_words_by_topic = [
    list(topic_df[topic_id].sort_values(ascending=False).iloc[10:20].index)
    for topic_id in topic_df.columns
]

## Lay the topics out as a grid of small HTML tables, `ncols` topics per grid row.
ncols = 10
table = []
row = []
last_topic = len(top_words_by_topic) - 1
for topic, words in enumerate(top_words_by_topic):
    # the first word becomes the (upper-cased) header of the mini table
    header_html = '<th>%s</th>' % words[0].upper()
    body_html = ''.join('<tr><td>%s</td></tr>' % word for word in words[1:])
    row.append('<table><tr>%s</tr>%s</table>' % (header_html, body_html))
    # flush the row when it is full or when the final topic has been added
    row_is_full = (topic + 1) % ncols == 0
    if row_is_full or topic == last_topic:
        table.append('\n'.join('<td>%s</td>' % cell for cell in row))
        row = []

In [399]:
# One column per topic; the first listed word of each topic is promoted to the
# column label and the remaining words stay as rows.
topic_words_wide = pd.DataFrame(top_words_by_topic).T
header = topic_words_wide.loc[0].values
sorted_topic_df = topic_words_wide.iloc[1:, :]
sorted_topic_df.columns = header

# Top Topics

In [400]:
# Wrap every pre-built grid row from `table` in <tr>...</tr> and render the whole grid inline.
HTML('<table>' + ''.join(list(map(lambda row: '<tr>%s</tr>' % row, table))) + '</table>')

0,1,2,3,4,5,6,7,8,9
WORKproposalschoolissueattackcourtsecurityneedlinehospital,JUSTpoliticalleadermeetingthinkheadofficeprogramhealthbelieve,LIKEmoneypowerchangeproblemjustweekpeoplerequireexpect,KNOWmakemonthpricetimeseniorchangeneedsettlementlegislation,TIMEdecisionpublictellvoteteamgovernmentevidencecityhelp,INTERVIEWworktimechiefspokesmanchairmangroupstatecompanyway,COMEmeetforceplanofficeschoolchildkillcountrygovernment,COMPANYforceissuegovernmentagreeallylawyerdemandmeasureday,OFFICIALtodaybaseusecomeknowhomecompanythinkmake,TODAYforcemakegovernmentpoliticalleadbeginhelpchildpeople
TIMEofficewomantodaygovernmentappearmanneedforcelaw,SPOKESMANexecutivepercentlawyerjusttodaycampaignrecallcasething,DRUGamericandaypoliticalmakeincludegroupplandecisionincrease,LIKEtimewaywarjuststatebusinessincludelocalremain,STATEmillionnuclearcomelikeofficialdeathreportthinksell,MEMBERpercenttimecompanyfamilyhomeleadermonthstatecome,WORKfaceruleincluderateplanmillionreportmakeprovide,BEGINindustrymarkettodaymilitaryofferpeoplereportmonthlead,WEEKclosegovernmentdecisionworkervotepeopleuseleavegroup,PEOPLEplanleaveknowthinkbelieveincludegovernmentissuetime
LAWmillionvotewaycourteffortbeginplannationbillion,LEAVEmakeleadpaycasepercentbillioncityreportrecent,MILITARYpolicyschoolplanpoliceattackreportcasepoliticalend,AMERICANissuetodaydayworkcountrycityagencyaddcompany,PLANtodayleavetowngovernmentreportcostleaderpoliticaljust,,,,,

WORK
proposal
school
issue
attack
court
security
need
line
hospital

JUST
political
leader
meeting
think
head
office
program
health
believe

LIKE
money
power
change
problem
just
week
people
require
expect

KNOW
make
month
price
time
senior
change
need
settlement
legislation

TIME
decision
public
tell
vote
team
government
evidence
city
help

INTERVIEW
work
time
chief
spokesman
chairman
group
state
company
way

COME
meet
force
plan
office
school
child
kill
country
government

COMPANY
force
issue
government
agree
ally
lawyer
demand
measure
day

OFFICIAL
today
base
use
come
know
home
company
think
make

TODAY
force
make
government
political
lead
begin
help
child
people

TIME
office
woman
today
government
appear
man
need
force
law

SPOKESMAN
executive
percent
lawyer
just
today
campaign
recall
case
thing

DRUG
american
day
political
make
include
group
plan
decision
increase

LIKE
time
way
war
just
state
business
include
local
remain

STATE
million
nuclear
come
like
official
death
report
think
sell

MEMBER
percent
time
company
family
home
leader
month
state
come

WORK
face
rule
include
rate
plan
million
report
make
provide

BEGIN
industry
market
today
military
offer
people
report
month
lead

WEEK
close
government
decision
worker
vote
people
use
leave
group

PEOPLE
plan
leave
know
think
believe
include
government
issue
time

LAW
million
vote
way
court
effort
begin
plan
nation
billion

LEAVE
make
lead
pay
case
percent
billion
city
report
recent

MILITARY
policy
school
plan
police
attack
report
case
political
end

AMERICAN
issue
today
day
work
country
city
agency
add
company

PLAN
today
leave
town
government
report
cost
leader
political
just


In [401]:
# Read the role names, one per line, closing the file once it has been read.
with open('../models/topic_model/input_data/roles.txt') as roles_file:
    roles = roles_file.read().split('\n')
# Overwrite the final entry with a catch-all label.
# NOTE(review): presumably the last entry is the empty string left by a trailing
# newline in roles.txt — confirm, otherwise a real role name is being replaced.
roles[-1] = 'other'

# Topics per Source-Type

In [402]:
# The count matrix is labelled by role on its rows and then transposed, so in the
# resulting frame the columns are source roles and the rows are the other axis (topics).
sourcetype_by_topic = pd.DataFrame(sampler.sourcetype_by_wordtopic__wordtopic_counts, index=roles).T

Pointwise mutual information (PMI) between a topic $x$ and a source type $y$:

$$\mathrm{pmi}(x, y) = \log \frac{p(x, y)}{p(x)\,p(y)}$$

In [403]:
# Marginal distributions of the count table: row totals give p(topic),
# column totals give p(source type); each is normalised to sum to 1.
topic_totals = sourcetype_by_topic.sum(axis=1)
source_totals = sourcetype_by_topic.sum(axis=0)
p_topic = topic_totals / topic_totals.sum()
p_source = source_totals / source_totals.sum()

In [404]:
# Joint distribution: every cell of the count table divided by the grand total.
p_source_topic = sourcetype_by_topic / sourcetype_by_topic.sum().sum()

In [405]:
# Placeholder frame carrying the same row/column labels as the count table and no data;
# the PMI values themselves are computed in a later cell.
pmi_sourcetype = pd.DataFrame(columns=sourcetype_by_topic.columns, index=sourcetype_by_topic.index)

In [406]:
# Scratch check of a single joint probability.
# NOTE: `source` is not assigned anywhere above this cell, so on a fresh kernel
# (Restart & Run All) this raises NameError; it relies on kernel state left over
# from out-of-order execution.
p_source_topic[source][topic]

0.0003829704249442658

In [407]:
# PMI for every (topic, source-type) pair at once:
#     log p(topic, source) - log p(topic) - log p(source)
# axis=0 aligns p_topic with the row labels, axis=1 aligns p_source with the
# column labels. This replaces the scalar-by-scalar Python loop and yields a
# float frame instead of an object-dtype one; the values are the same.
pmi_sourcetype = (
    np.log(p_source_topic)
      .sub(np.log(p_topic), axis=0)
      .sub(np.log(p_source), axis=1)
)

In [408]:
# For the first eight source types, print the three topics with the highest PMI.
# Each topic is summarised by its column label in sorted_topic_df plus its first two listed words.
for col in sourcetype_by_topic.columns[:8]:
    print(col)
    best_topics = pmi_sourcetype[col].sort_values(ascending=False).head(3).index
    for topic in best_topics:
        lead_words = [sorted_topic_df.columns[topic]] + sorted_topic_df.iloc[:, topic].tolist()[:2]
        print('    topic ' + str(topic) + ': ' + ', '.join(lead_words) + '...')
    print()
    print()
print('...')

government-decision-maker
    topic 7: company, force, issue...
    topic 16: work, face, rule...
    topic 18: week, close, government...


government-spokesman
    topic 9: today, force, make...
    topic 21: leave, make, lead...
    topic 22: military, policy, school...


government-advisor
    topic 16: work, face, rule...
    topic 3: know, make, month...
    topic 20: law, million, vote...


academic-other
    topic 16: work, face, rule...
    topic 21: leave, make, lead...
    topic 22: military, policy, school...


actor-doctor
    topic 20: law, million, vote...
    topic 12: drug, american, day...
    topic 6: come, meet, force...


corporate-lawyer
    topic 13: like, time, way...
    topic 18: week, close, government...
    topic 16: work, face, rule...


corporate-spokesman
    topic 22: military, policy, school...
    topic 16: work, face, rule...
    topic 2: like, money, power...


ngo-expert
    topic 13: like, time, way...
    topic 22: military, policy, school...
   

# Source-types per document

In [409]:
# Position in sampler.docs -> doc_id (despite the name, the lookup goes index -> id).
docid2idx_map = [doc['doc_id'] for doc in sampler.docs]

# doc_id -> that document's source_map; the first document seen for a doc_id wins.
doc_to_source_map = {}
for doc in sampler.docs:
    current_id = doc['doc_id']
    if current_id in doc_to_source_map:
        continue
    doc_to_source_map[current_id] = doc['source_map']

# One row per sampled source: translate the role index to its name, attach the
# doc_id and the source's display name, and keep only sources whose name resolves.
source_roles = pd.Series(sampler.source_to_source_type).apply(lambda role_idx: roles[role_idx])
source_to_sourcetype_df = (
    source_roles
     .reset_index()
     .rename(columns={'level_0':'doc_idx', 'level_1':'source_id', 0:'source_role'})
     .assign(doc_id=lambda frame: frame['doc_idx'].apply(lambda idx: docid2idx_map[idx]))
     .assign(source_name=lambda frame: frame.apply(
         lambda rec: doc_to_source_map[rec['doc_id']].get(rec['source_id']), axis=1))
     .loc[lambda frame: frame['source_name'].notnull()]
)

In [412]:
# Preview the first rows of the source/role table.
source_to_sourcetype_df.head()

Unnamed: 0,doc_idx,source_id,source_role,doc_id,source_name
0,0,S_0_0,academic-actor,1dab5d59-c916-11e9-a6c2-b831b5755f6c,David R. Obey
1,1,S_1_0,actor-doctor,1dab5d62-c916-11e9-ba39-b831b5755f6c,John Cardinal O'Connor
2,1,S_1_1,ngo-actor,1dab5d62-c916-11e9-ba39-b831b5755f6c,Chaim Herzog
3,1,S_1_2,government-advisor,1dab5d62-c916-11e9-ba39-b831b5755f6c,Yitzhak Shamir
4,1,S_1_3,corporate-decision-maker,1dab5d62-c916-11e9-ba39-b831b5755f6c,Teddy Kollek


In [421]:
# Pair each model-assigned source role with the manual label for the same
# (doc_id, person) via a merge, keeping only the two columns being compared.
# NOTE: `legit_tags` is only built in a cell further down this notebook, so this
# cell depends on out-of-order execution and fails on a fresh Restart & Run All.
samples_to_labels = (source_to_sourcetype_df
 .merge(legit_tags.to_frame('label').reset_index(), left_on=['doc_id', 'source_name'], right_on=['doc_id', 'person'])
)[['source_role', 'label']]

# Doc-Type per Source-Type

In [35]:
# Count table with one column per role; the rows are presumably document types
# (inferred from the attribute name — confirm against the sampler).
doctype_by_sourcetype = pd.DataFrame(sampler.sourcetype_by_doctype__source_counts, columns=roles)
doctype_by_sourcetype.head()

Unnamed: 0,government-decision-maker,government-spokesman,government-advisor,academic-other,actor-doctor,corporate-lawyer,corporate-spokesman,ngo-expert,academic-actor,actor-member,...,actor-leader,government-lawyer,witness-casual,actor-lawyer,actor-individual,corporate-decision-maker,victim-relative,government-witness,corporate-victim,other
0,487.0,237.0,229.0,333.0,203.0,210.0,164.0,153.0,125.0,260.0,...,284.0,167.0,281.0,172.0,183.0,158.0,147.0,200.0,156.0,391.0
1,334.0,209.0,218.0,273.0,279.0,302.0,244.0,333.0,289.0,258.0,...,454.0,213.0,210.0,216.0,179.0,255.0,202.0,183.0,158.0,395.0
2,239.0,132.0,187.0,329.0,364.0,242.0,153.0,283.0,403.0,326.0,...,354.0,229.0,272.0,188.0,289.0,181.0,224.0,213.0,157.0,284.0
3,471.0,188.0,316.0,353.0,317.0,285.0,263.0,616.0,433.0,378.0,...,416.0,450.0,297.0,212.0,424.0,267.0,196.0,240.0,310.0,471.0
4,358.0,209.0,230.0,271.0,333.0,369.0,174.0,261.0,203.0,219.0,...,455.0,285.0,292.0,246.0,397.0,218.0,253.0,209.0,195.0,470.0


# Label for Validation

In [204]:
# The file is JSON-lines: every line holds one JSON object.
with open('../data/news-article-flatlist/html-for-sources/doc_html.json') as f:
    doc_html = [json.loads(raw_line.strip()) for raw_line in f.readlines()]

In [205]:
# Index the records by doc_id; when a doc_id repeats, the last record wins.
doc_id2html = {entry['doc_id']: entry for entry in doc_html}

In [206]:
from IPython.display import HTML

In [207]:
# Render one record's HTML inline as a spot check (index 13 is an arbitrary example).
HTML(doc_html[13]['html'])

# Manual Check

In [213]:
# Collect every file under ../app/data whose name starts with 'batch-marked'.
marked_files = glob.glob('../app/data/batch-marked*')

In [222]:
# Concatenate the JSON lists stored in every marked batch file.
# Each file is opened in a context manager so its handle is closed promptly.
tags = []
for marked_file in marked_files:
    with open(marked_file) as batch_file:
        tags.extend(json.load(batch_file))

In [223]:
## load them into a dataframe and parse out nonlegit sources.
tags_df = pd.DataFrame(tags)
legit_tagged_sources = (
    tags_df
        .groupby(['doc_id', 'person'])[['question_class', 'label']]
        # collapse each group's (question_class, label) pairs into one dict ...
        .apply(lambda df: dict(df.itertuples(index=False)))
        # ... and expand that dict into one column per question_class
        .apply(pd.Series)
        .fillna('')
        # keep only sources whose 'error' answer marks them as legitimate
        .loc[lambda df: df['error'] == 'legit']
)
## normalise person names: hyphens become spaces.
legit_tagged_sources = (
    legit_tagged_sources
        .reset_index()
        # regex=False: '-' is meant literally, independent of the pandas default
        .assign(person=lambda df: df['person'].str.replace('-', ' ', regex=False))
        .set_index(['doc_id', 'person'])
)
## the role is the first non-empty answer among the '*-role' columns.
## .iloc[0] selects by position explicitly; the previous `[0]` relied on the
## deprecated positional fallback for integer keys on a string-labelled Series.
## NOTE: a row with no non-empty role answer still raises IndexError, as before.
role = (
    legit_tagged_sources
        [list(filter(lambda x: '-role' in x, legit_tagged_sources.columns))]
        .apply(lambda x: x[x != ''].iloc[0], axis=1)
)
affil = legit_tagged_sources['affiliation']
## final label has the form '<affiliation>-<role>'.
legit_tags = affil + '-' + role

In [237]:
# Read the model-labelled documents (JSON-lines) into a dict keyed by doc_id,
# discarding each record's 'doc_sentences' entry.
model_labeled = {}
with open('../data/news-article-flatlist/model-labeled-sources/doc_html.json') as f:
    for raw_line in f:
        doc = json.loads(raw_line.strip())
        del doc['doc_sentences']
        model_labeled[doc['doc_id']] = doc

In [265]:
# Move the (doc_id, person) index levels into ordinary columns; the label values end up in a column named 0.
legit_tags_df = legit_tags.reset_index()

In [267]:
# Spot check: rows whose person name contains the substring 'David'.
legit_tags_df.loc[lambda df: df['person'].str.contains('David')]

Unnamed: 0,doc_id,person,0
58,2afdb68a-c916-11e9-a2b5-b831b5755f6c,David E. Skaggs,government-decision-maker
107,4dceffe9-c916-11e9-b704-b831b5755f6c,Raighne Davidson,witness-casual
128,5bafe4bc-c916-11e9-b06e-b831b5755f6c,David Bar Illan,government-spokesman
172,8b1609d8-c916-11e9-b5c1-b831b5755f6c,David Ritz,academic-other


In [269]:
# One row per entry of sampler.docs.
input_data_df = pd.DataFrame(sampler.docs)

In [279]:
# Distribution of how many source labels each document carries.
input_data_df['source_labels'].apply(len).value_counts()

0    27428
Name: source_labels, dtype: int64

# Load new processed data

In [288]:
import sys

In [289]:
# NOTE: the same directory is already appended near the top of the notebook; running
# this again only adds a duplicate entry to sys.path.
sys.path.append('../models/topic_model/')

In [295]:
import process_data_for_tm

In [353]:
# Input locations passed to process_data_for_tm.read_roles in a later cell.
label_dir = '../data/news-article-flatlist/labels'
role_file = '../models/topic_model/input_data_with_labels/roles.txt'

In [None]:
# NOTE(review): never-executed scratch cell (no execution count). It calls read_roles with
# no arguments, whereas a later cell calls it with (label_dir, role_file) — likely safe to delete.
process_data_for_tm.read_roles()

In [371]:
# Load the processed documents; the file is JSON-lines (one object per line).
with open('../models/topic_model/input_data_with_labels/doc_source.json') as f:
    processed_texts = [json.loads(raw_line.strip()) for raw_line in f.readlines()]

In [356]:
# Result is used below as a mapping doc_id -> {source name -> label id}
# (shape inferred from how it is consumed; confirm against read_roles).
s_2_label_id = process_data_for_tm.read_roles(label_dir, role_file)

In [363]:
# Attach manual labels to every document, keyed by source id.
for text in processed_texts:
    doc_id = text['doc_id']
    # every document gets a (possibly empty) label mapping
    labels_by_source_id = {}
    text['source_labels'] = labels_by_source_id
    if doc_id not in s_2_label_id:
        continue
    # source_map goes source id -> name; invert it to look ids up by name
    id_by_name = {name: source_id for source_id, name in text['source_map'].items()}
    for source_name, label_id in s_2_label_id[doc_id].items():
        # labelled names that do not occur in this document are ignored
        if source_name in id_by_name:
            labels_by_source_id[id_by_name[source_name]] = label_id

In [370]:
# Write the labelled documents back as JSON-lines.
# NOTE: this overwrites the same file that processed_texts was loaded from.
with open('../models/topic_model/input_data_with_labels/doc_source.json', 'w') as f:
    for text in processed_texts:
        f.write(json.dumps(text) + '\n')

In [464]:
from tqdm import tqdm_notebook as tqdm

In [524]:
# Upload every HTML record whose doc_id is in `final_docs` to Datastore as an
# unmarked annotation task, and keep a local copy of what was uploaded.
# NOTE: `client` and `datastore` are created in cells further down, and `final_docs`
# is not defined in the visible part of this notebook — this cell depends on
# out-of-order execution / hidden kernel state.
parsed_htmls = []
with open('../data/news-article-flatlist/html-for-sources/doc_html.json') as f:
    for line in tqdm(f.readlines()):
        parsed_html = json.loads(line.strip())
        if parsed_html['doc_id'] in final_docs:
            # new tasks start out not done
            parsed_html['done'] = False
            # entity name is '<doc_id>-<person>' under kind 'source-annotation-unmarked'
            k = client.key('source-annotation-unmarked', parsed_html['doc_id'] + '-' + parsed_html['person'])
            # 'sentence_ids' and 'html' are stored but not indexed
            e = datastore.Entity(k, exclude_from_indexes=['sentence_ids', 'html'])
            e.update(parsed_html)
            # one write request per record
            client.put(e)
            
            parsed_htmls.append(parsed_html)

HBox(children=(IntProgress(value=0, max=8028), HTML(value='')))

In [459]:
# Scratch cell: builds a single entity from whatever `k` currently is in the kernel
# (`k` is not assigned in this cell).
e = datastore.Entity(k, exclude_from_indexes=['sentence_ids', 'html'])

In [460]:
# Scratch cell: copies the fields of the current `parsed_html` into `e` (nothing is written to Datastore here).
e.update(parsed_html)

In [453]:
from google.cloud import datastore

In [455]:
import os
# Point the Google client libraries at a service-account key file.
# setdefault keeps a value that the environment already provides, so the notebook
# is no longer tied to one machine's absolute path; the hard-coded path is only a
# fallback for the original author's setup.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    'C:/Users/alexa/google-cloud/usc-research-c087445cf499.json',
)

client = datastore.Client()

In [None]:
# NOTE(review): never-executed scratch cell (no execution count); elsewhere in this notebook
# client.key is always called with a kind and a name — likely safe to delete.
client.key()

In [532]:
# Query for annotation tasks of kind 'source-annotation-unmarked' whose 'done' flag is False.
q = client.query(kind='source-annotation-unmarked')
q.add_filter('done', '=', False)

In [533]:
# Materialise the query results into a list.
unfinished = list(q.fetch())

In [536]:
# Fetch every entity of kind 'source-annotation-marked' (no filter applied).
# Note that `q` is rebound here, replacing the 'unmarked' query from the earlier cell.
q = client.query(kind='source-annotation-marked')
marked = list(q.fetch())

In [566]:
# Empty frame with t_df's index and columns, with only 'question_class' copied over from t_df.
# NOTE: `t_df` is not defined in the visible part of this notebook (hidden kernel state).
template_df = pd.DataFrame(index=t_df.index, columns=t_df.columns).assign(question_class=t_df['question_class'])

In [603]:
# For each affiliation, the template row whose 'label' holds that affiliation's role answer.
# NOTE(review): the numbers presumably follow the row order of `template_df` — confirm before reuse.
affil_role_mapper = {'government': 2,
 'victim': 3,
 'academic': 4,
 'witness': 5,
 'corporate': 9,
 'ngo': 10}

def make_datapoint(doc_id, person, affiliation, role):
    """Build the annotation records for one (doc_id, person) pair.

    Copies ``template_df``, stamps ``doc_id`` and ``person`` on every row and
    fills the ``label`` column: row 0 gets ``'legit'``, row 1 gets
    ``affiliation`` and row ``affil_role_mapper[affiliation]`` gets ``role``.

    Returns:
        A list with one dict per template row. Keys whose value is a float
        NaN (unanswered cells) are left out.

    Raises:
        KeyError: if ``affiliation`` is not a key of ``affil_role_mapper``.
    """
    output_df = template_df.copy()
    output_df['doc_id'] = doc_id
    output_df['person'] = person

    output_df.at[0, 'label'] = 'legit'
    output_df.at[1, 'label'] = affiliation
    output_df.at[affil_role_mapper[affiliation], 'label'] = role

    # 'records' is the documented orient name. The previous 'rows' only worked
    # through pandas' prefix matching of orient names, which has been removed.
    records = output_df.to_dict(orient='records')

    # drop NaN-valued keys so only answered fields remain
    return [
        {key: value for key, value in record.items()
         if not (isinstance(value, float) and np.isnan(value))}
        for record in records
    ]

In [631]:
# Keep the uploaded records that look like academic experts and render the third match.
# NOTE: `is_academic_expert` is only defined in a later cell of this notebook, so this
# cell depends on out-of-order execution.
t1 = list(filter(is_academic_expert, parsed_htmls))
# '$' characters are stripped before rendering (presumably to stop the notebook treating them as math delimiters).
display(HTML(t1[2]['html'].replace('$', '')))

In [None]:
def is_academic_expert(x):
    """Return True when the record's HTML mentions both 'university' and 'professor' (case-insensitive)."""
    text = x['html'].lower()
    return 'university' in text and 'professor' in text

In [646]:
def is_government_decision_maker(x):
    """Keyword heuristic: does the record's HTML look like a government decision maker?

    The check is case-insensitive and returns True when any of these holds:
      * the text mentions 'senator';
      * the text mentions 'representative' together with 'democrat' or 'republican';
      * the text mentions 'prime minister', 'white house' or 'president' and none of
        'advisor', 'spokesman', 'spokeswoman', 'company' or 'board'.
    """
    text = x['html'].lower()

    if 'senator' in text:
        return True

    if 'representative' in text and ('democrat' in text or 'republican' in text):
        return True

    executive_terms = ('prime minister', 'white house', 'president')
    excluded_terms = ('advisor', 'spokesman', 'spokeswoman', 'company', 'board')
    mentions_executive = any(term in text for term in executive_terms)
    mentions_excluded = any(term in text for term in excluded_terms)
    return mentions_executive and not mentions_excluded

In [None]:
# NOTE(review): never-executed cell (no execution count) whose code duplicates an earlier
# cell that builds `t1` the same way — likely safe to delete one of the two.
t1 = list(filter(is_academic_expert, parsed_htmls))
display(HTML(t1[2]['html'].replace('$', '')))

In [648]:
# Unfinished annotation tasks whose HTML matches the government-decision-maker heuristic.
t2 = list(filter(is_government_decision_maker, unfinished))

In [650]:
# Render the fifth match with '$' characters stripped (index 4 is an arbitrary example).
display(HTML(t2[4]['html'].replace('$', '')))

# some clean up

In [468]:
# Delete every entity whose key path has more than two elements, i.e. more than
# one (kind, id-or-name) pair — presumably keys created with a parent by mistake.
# WARNING: destructive. The query names no kind, so (per the Datastore API) it is
# not restricted to the annotation kinds — review `long_keys` before running.
all_entities = list(client.query().fetch())
# The unused `k = all_entities[0].key` was dropped: it raised IndexError when the
# store was empty. The name `t` is also no longer reused for two different things.
long_keys = [entity.key for entity in all_entities if len(entity.key.flat_path) > 2]
for key in long_keys:
    client.delete(key)