In [2]:
import pandas as pd
import glob, os, sys
import xml.etree.ElementTree as ET
from tqdm import tqdm_notebook as tqdm
sys.path.append('../scripts/')
import util
import parsing_util
from importlib import reload
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import consensus_score
import json
from collections import defaultdict

# Explore Early Results

In [329]:
import pickle

In [330]:
from sampler import BOW_Source_GibbsSampler

In [331]:
# Load the trained sampler from disk. The context manager closes the file
# handle as soon as unpickling finishes instead of leaving it open for the
# lifetime of the kernel.
# NOTE(review): pickle.load can execute arbitrary code -- only point this at
# model files produced by this project.
with open('../models/topic_model/trained-sampler-with-labels.pkl', 'rb') as model_file:
    sampler = pickle.load(model_file)

In [335]:
# Read the vocabulary, one entry per line. split('\n') is kept on purpose (rather
# than splitlines()) so a trailing newline still yields a final empty entry and
# the list length is unchanged for anything indexed by it downstream.
with open('../models/topic_model/input_data/vocab.txt') as vocab_file:
    vocab = vocab_file.read().split('\n')

In [336]:
# Wrap the sampler's word counts in a DataFrame whose rows are labelled with the
# vocabulary entries; the columns are treated as topics by the cells below.
topic_df = pd.DataFrame(
    data=sampler.vocab_by_wordtopic__word_counts,
    index=vocab,
)

In [337]:
# Divide every column by its own total so each column sums to 1.
topic_df = topic_df.div(topic_df.sum(axis=0), axis='columns')

In [338]:
import numpy as np 
from scipy.stats import entropy
from IPython.display import HTML

## get top words by topic
## NOTE(review): the slice keeps ranks 10-19 of each column, i.e. it skips the ten
## highest-weighted words -- confirm this is intentional and not meant to be [:10].
top_words_by_topic = [
    list(topic_df[topic_id].sort_values(ascending=False).iloc[10:20].index)
    for topic_id in topic_df.columns
]

## reformat into an HTML table for easier reading
ncols = 10

## one small sub-table per topic: first word upper-cased as the header,
## remaining words one per row
subtables = []
for words in top_words_by_topic:
    header_cell = '<th>%s</th>' % words[0].upper()
    body_rows = ''.join('<tr><td>%s</td></tr>' % word for word in words[1:])
    subtables.append('<table><tr>%s</tr>%s</table>' % (header_cell, body_rows))

## lay the sub-tables out in grid rows of at most `ncols` cells each
table = [
    '\n'.join('<td>%s</td>' % cell for cell in subtables[start:start + ncols])
    for start in range(0, len(subtables), ncols)
]

In [339]:
# One column per topic, one row per word rank.
words_by_rank = pd.DataFrame(top_words_by_topic).T

# Promote the first word of every topic to the column label and keep the
# remaining words as the body of the frame.
header = words_by_rank.loc[0].values
sorted_topic_df = words_by_rank.iloc[1:, :]
sorted_topic_df.columns = header

# Top Topics

In [340]:
# Wrap each pre-built row of cells in <tr> tags and render the grid inline.
table_rows = ''.join('<tr>%s</tr>' % cells for cells in table)
HTML('<table>' + table_rows + '</table>')

0,1,2,3,4,5,6,7,8,9
WORKlikeofficialdirectorthinkleadchiefpresidentaskpercent,NEWdirectorpresidentsupportknowamericantodaystategroupway,CITYleaddayneedaddquestionforcepoliticalreportcountry,KNOWofficialcityagencyneedgroupleadtodaylawyerjust,PERCENTissuemilitarysovietcountrydaynumberrecenthelpamerican,LEADERstatereportissuecomenewchargedaythinksoviet,CASEcityworknewcompanycomepercentthinkaddtime,AMERICANcitytalkplanpolicyusestateleadermilliondemocratic,MILLIONtimepartyplanworkforcecountryreportbeginday,COMEtimeusejustcasegovernmentgrouppoliticalforcework
DIRECTORthinktellpresidentstatetodaytimeweekusepeople,CITYissuereportpercenttalkcountrymembergrouprunpeople,INCREASEpeopleuseweekcomecampaignresultadministrationleaderissue,OFFICIALlikecountryplanprogramchairmanstatepoliticalpresidentweek,LAWstatemembergovernmentnewuseleadbusinessdayend,TODAYpeopleofficepoliticalvoteissueamericanknowweekmonth,ATTACKcontrolstateworkamericanwayasklawgovernmentpercent,LIKEusepeoplelawplanbudgetpublicincreaseaccordcontinue,MAKEpeopleleadpowerbillioncitymilitaryaccordcountrygovernment,LIKEmakecompanypresidentpercentdecisionthinktryinterviewchief
AMERICANknowcountrycomegrouppeoplereportnewtaxmember,KNOWusedirectorwantleadthinkcompanyissueaddwork,COMEpercentsovietmeetingmillionamericanplangovernmenttrywoman,LAWleadusepoliticalmannewstatesupportweekdirector,PEOPLEstateleadpaymakepoliticalforcechargejustfar,,,,,

WORK
like
official
director
think
lead
chief
president
ask
percent

NEW
director
president
support
know
american
today
state
group
way

CITY
lead
day
need
add
question
force
political
report
country

KNOW
official
city
agency
need
group
lead
today
lawyer
just

PERCENT
issue
military
soviet
country
day
number
recent
help
american

LEADER
state
report
issue
come
new
charge
day
think
soviet

CASE
city
work
new
company
come
percent
think
add
time

AMERICAN
city
talk
plan
policy
use
state
leader
million
democratic

MILLION
time
party
plan
work
force
country
report
begin
day

COME
time
use
just
case
government
group
political
force
work

DIRECTOR
think
tell
president
state
today
time
week
use
people

CITY
issue
report
percent
talk
country
member
group
run
people

INCREASE
people
use
week
come
campaign
result
administration
leader
issue

OFFICIAL
like
country
plan
program
chairman
state
political
president
week

LAW
state
member
government
new
use
lead
business
day
end

TODAY
people
office
political
vote
issue
american
know
week
month

ATTACK
control
state
work
american
way
ask
law
government
percent

LIKE
use
people
law
plan
budget
public
increase
accord
continue

MAKE
people
lead
power
billion
city
military
accord
country
government

LIKE
make
company
president
percent
decision
think
try
interview
chief

AMERICAN
know
country
come
group
people
report
new
tax
member

KNOW
use
director
want
lead
think
company
issue
add
work

COME
percent
soviet
meeting
million
american
plan
government
try
woman

LAW
lead
use
political
man
new
state
support
week
director

PEOPLE
state
lead
pay
make
political
force
charge
just
far


In [344]:
# Read the role labels, one per line; the context manager closes the file handle
# once the contents are read. split('\n') is kept as-is so the number of entries
# (including any trailing empty one) does not change.
with open('../models/topic_model/input_data/roles.txt') as roles_file:
    roles = roles_file.read().split('\n')

In [352]:
# Overwrite the final entry of the role list with the catch-all label 'other'.
# NOTE(review): presumably roles.txt ends with a trailing newline, which leaves an
# empty last element after split('\n') -- confirm; otherwise a real role name is
# being replaced here.
roles[-1] = 'other'

# Topics per Source-Type

In [355]:
# Label the rows of the sampler's count matrix with the role names, then
# transpose so each role becomes a column.
sourcetype_counts = pd.DataFrame(
    sampler.sourcetype_by_wordtopic__wordtopic_counts,
    index=roles,
)
sourcetype_by_topic = sourcetype_counts.transpose()

In [382]:
# For the first eight roles, print the three topics with the largest counts,
# each summarised by its header word plus the next two words of that topic.
for role_name in sourcetype_by_topic.columns[:8]:
    print(role_name)
    leading_topics = sourcetype_by_topic[role_name].sort_values(ascending=False).head(3)
    for topic_idx in leading_topics.index:
        # topic_idx is used positionally against sorted_topic_df
        label_words = [sorted_topic_df.columns[topic_idx]]
        label_words.extend(sorted_topic_df.iloc[:, topic_idx].tolist()[:2])
        print('    topic %s: %s...' % (topic_idx, ', '.join(label_words)))
    print()
    print()
print('...')

government-decision-maker
    topic 9: come, time, use...
    topic 3: know, official, city...
    topic 10: director, think, tell...


government-spokesman
    topic 19: like, make, company...
    topic 3: know, official, city...
    topic 0: work, like, official...


government-advisor
    topic 0: work, like, official...
    topic 19: like, make, company...
    topic 3: know, official, city...


academic-other
    topic 7: american, city, talk...
    topic 19: like, make, company...
    topic 13: official, like, country...


actor-doctor
    topic 9: come, time, use...
    topic 20: american, know, country...
    topic 6: case, city, work...


corporate-lawyer
    topic 0: work, like, official...
    topic 10: director, think, tell...
    topic 19: like, make, company...


corporate-spokesman
    topic 0: work, like, official...
    topic 13: official, like, country...
    topic 9: come, time, use...


ngo-expert
    topic 20: american, know, country...
    topic 0: work, like, offi

# Source-types per document

In [358]:
# Translate each source's role index into its role name.
# NOTE(review): presumably source_to_source_type is keyed by (doc, source_id)
# pairs, which is what produces the level_0 / level_1 columns renamed below.
role_per_source = pd.Series(sampler.source_to_source_type).apply(
    lambda role_idx: roles[role_idx]
)

# Flatten the index into ordinary columns and give them readable names.
column_names = {'level_0': 'doc', 'level_1': 'source_id', 0: 'source_role'}
source_to_sourcetype_df = role_per_source.reset_index().rename(columns=column_names)

In [359]:
# Preview only the first rows instead of dumping the whole frame into the output.
source_to_sourcetype_df.head(10)

Unnamed: 0,doc,source_id,source_role
0,0,S_0_0,government-spokesman
1,1,S_1_0,other
2,1,S_1_1,academic-actor
3,1,S_1_2,academic-actor
4,1,S_1_3,witness-casual
5,1,S_1_4,other
6,2,S_2_0,actor-member
7,3,S_3_0,witness-casual
8,3,S_3_1,actor-leader
9,4,S_4_0,actor-lawyer


# Doc-Type per Source-Type

In [376]:
# `labeled_sampler` is not defined anywhere in this notebook, so the original line
# fails on a fresh kernel. The only model loaded above is the "with-labels" pickle
# bound to `sampler`, so read the counts from that object.
# NOTE(review): confirm `sampler` is the object that was previously called
# `labeled_sampler`.
doctype_by_sourcetype = pd.DataFrame(
    sampler.sourcetype_by_doctype__source_counts,
    columns=roles,
)
doctype_by_sourcetype.head()

Unnamed: 0,government-decision-maker,government-spokesman,government-advisor,academic-other,actor-doctor,corporate-lawyer,corporate-spokesman,ngo-expert,academic-actor,actor-member,...,actor-leader,government-lawyer,witness-casual,actor-lawyer,actor-individual,corporate-decision-maker,victim-relative,government-witness,corporate-victim,other
0,0.0,0.0,0.0,3.0,2.0,250.0,287.0,0.0,0.0,277.0,...,1.0,0.0,1.0,1.0,3.0,238.0,0.0,270.0,244.0,0.0
1,94.0,93.0,0.0,0.0,88.0,1.0,92.0,1.0,97.0,0.0,...,75.0,99.0,1.0,0.0,1.0,0.0,82.0,78.0,95.0,0.0
2,147.0,0.0,153.0,151.0,1.0,0.0,0.0,0.0,0.0,168.0,...,1.0,0.0,174.0,149.0,165.0,153.0,0.0,155.0,0.0,0.0
3,0.0,0.0,0.0,159.0,0.0,167.0,149.0,0.0,0.0,0.0,...,145.0,169.0,173.0,156.0,0.0,2.0,0.0,2.0,0.0,140.0
4,125.0,101.0,120.0,114.0,1.0,0.0,0.0,111.0,0.0,124.0,...,137.0,1.0,0.0,1.0,122.0,89.0,1.0,0.0,1.0,0.0
