In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
import random
import unicodedata
from string import punctuation
from string import digits
from nltk.stem import WordNetLemmatizer
import joblib

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score
from sklearn.decomposition import LatentDirichletAllocation

from tqdm import tqdm

import spacy
from spacy.matcher import Matcher

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Modern\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Modern\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
pd.set_option('max_colwidth',200)
### Reading data
data_dir = 'data/'
filename_prefix = 'koreaherald_1517_'
df = []

for i in range(8):
    df.append(pd.read_json(os.path.join(data_dir, filename_prefix + str(i) + '.json')))
df = pd.concat(df)
df.reset_index(inplace=True)
df = df.rename(columns=dict(zip(df.columns,[df.columns[i].strip() for i in range(len(df.columns))])))
df.drop('index', inplace=True, axis=1)

df['agg_title_body'] = df['title'] + df['body']

df.shape

(23769, 7)

In [3]:
nlp = spacy.load('en_core_web_lg')

In [4]:
### Lemmatization tool
stemmer = WordNetLemmatizer()
### Change similar words to the same word
UN_WORD = "The United Nations"
US_WORD = "The United States"
NK_WORD = "North Korea"
SK_WORD = "South Korea"

similar_words = {
    # Change to "The United States"
    "U.S.": US_WORD,
    "US": US_WORD,
    "USA": US_WORD,
    "United States": US_WORD,
    "United States'": US_WORD,
    "The United States'": US_WORD,
    
    # Change to "North Korea"
    "NK": NK_WORD,
    "NK's": NK_WORD,
    "N. Korea": NK_WORD,
    "N. Korea's": NK_WORD,
    "North Korea's": NK_WORD,
    
    # Change to "South Korea"
    "SK": SK_WORD,
    "SK's": SK_WORD,
    "S. Korea": SK_WORD,
    "S. Korea's": SK_WORD,
    "South Korea's": SK_WORD,
    
    # Change to "The United Nations"
    "United Nations": UN_WORD,
    "United Nations'": UN_WORD,
    "The United Nations'": UN_WORD,
    "UN": UN_WORD,
}

### Transform function
def text_cleaning(s: str):
        
    def replace_strange_char(s: str):
        non_en_chars = {
            "’": "'",
            "‘": "'"
        }

        def remove_non_en_chars(txt):
            # remove non english characters
            txt = convert_latin_chars(txt)
            for char in non_en_chars.keys():
                txt = re.sub(char, non_en_chars[char], txt)
            txt = re.sub(r'[^\x00-\x7F]+', ' ', txt)
            return txt

        def convert_latin_chars(txt):
            # convert latin characters
            return ''.join(char for char in unicodedata.normalize('NFKD', txt) if unicodedata.category(char) != 'Mn')

        s = remove_non_en_chars(s)
        s = convert_latin_chars(s)
        return s
    s = replace_strange_char(s)
    for key,value in similar_words.items():
        s = re.sub(key, value, s)
    return s

def spacy_tokenizer(s: str):
    # Change similar terms to the same term
    new_str = text_cleaning(s)
    doc = nlp(s)
    # Group tokens
    matcher = Matcher(nlp.vocab)
    token_groupup_pattern = [
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "nations"}],
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "states"}],
        [{"LOWER": "north"}, {"LOWER": "korea"}],
        [{"LOWER": "south"}, {"LOWER": "korea"}],
    ]
    matcher.add("TermGroup",token_groupup_pattern)
    matches = matcher(doc)
    merge_doc = []
    for nid, start, end in matches:
        merge_doc.append((start,end))
    with doc.retokenize() as retokenizer:
        for i in range(len(merge_doc)-1,-1,-1):
            retokenizer.merge(doc[merge_doc[i][0]:merge_doc[i][1]])
        
    # Remove all stopword, punctuation, number
    tokens = [ token.lemma_.lower() for token in doc \
              if not token.is_stop and not token.is_punct and not token.like_num and token.lemma_.strip()!= '']
    return tokens


In [5]:
def print_top_words(model, vectorizer, n_top_words):
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        message = "\nTopic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [6]:
def most_related_docs(df, doc_topic_dist, topic_index, num_docs=5):
    sorted_doc = doc_topic_dist.sort_values(by=['{}'.format(topic_index)], ascending=False)
    return df.iloc[sorted_doc[:num_docs].index]

In [7]:
df_nk = df[df['section'] == 'North Korea']

In [8]:
# # Find all doc contains "North Korea"
# nk_vectorizer = CountVectorizer(tokenizer=spacy_tokenizer)
# nk_data_vectorized = nk_vectorizer.fit_transform(df_nk['agg_title_body'])
# joblib.dump(nk_vectorizer, '../data/nk_vectorizer.csv')
# joblib.dump(nk_data_vectorized, '../data/nk_data_vectorized.csv')

In [9]:
nk_vectorizer = joblib.load('../data/nk_vectorizer.csv')
nk_data_vectorized = joblib.load('../data/nk_data_vectorized.csv')

In [10]:
nk_lda_components = 50
# nk_lda = LatentDirichletAllocation(n_components=nk_lda_components, random_state=0)
# nk_lda.fit(nk_data_vectorized)
# joblib.dump(nk_lda, '../data/nk_{}lda.csv'.format(nk_lda_components))

In [11]:
nk_lda = joblib.load('../data/nk_{}lda.csv'.format(nk_lda_components))

In [12]:
print_top_words(nk_lda, nk_vectorizer, n_top_words=25)


Topic #0: tour tourist north travel foreign tourism zone north korea pyongyang border visit say resort ebola korea city year agency ban chinese country wonsan report virus international

Topic #1: missile launch north ballistic test say range north korea rocket korea fire long security council korean nuclear u.n provocation satellite pyongyang military resolution late international japan

Topic #2: north wage korean south firm industrial gaeseong worker complex say park pay government joint ministry seoul unilateral cash $ set month committee korea percent inter

Topic #3: right human north u.n north korea say resolution call international korean people korea un situation violation abuse country report council year issue regime assembly hold political

Topic #4: say korean north korea war resolution north shin d ri report country lawmaker book committee flight airspace intelligence bomber house force merkel royce veteran address pyongyang

Topic #5: right human north korean office say

In [13]:
# nk_doc_topic_dist = pd.DataFrame(nk_lda.transform(nk_data_vectorized))
# nk_doc_topic_dist.to_csv('../data/nk_doc_topic_dist.csv', index=False)

In [14]:
nk_doc_topic_dist = pd.read_csv('../data/nk_doc_topic_dist.csv')

In [32]:
# Tour
most_related_docs(df_nk, nk_doc_topic_dist, 0, num_docs=20)['title']

22226                 Chinese tours to N. Korea back on track after Ebola ban lifted
16605                First batch of Chinese tourists visit new tour zone in N. Korea
14315                    Foreigners continue to visit N.K. despite sanctions: report
22435        Tourists to arrive in N. Korea next week after lifting Ebola travel ban
17385                     Pyongyang to launch 'large-scale' ski tours for foreigners
22571                     N. Korea may lift Ebola restrictions on April 1: tour firm
23443                         N. Korea seen to lift Ebola travel ban soon: tour firm
22493                             N. Korea set to lift Ebola restrictions: tour firm
22470                                        N. Korea ends Ebola travel restrictions
22641                N. Korea bans foreigners from marathon race over Ebola concerns
16754        N. Korea rebukes Canada for taking issue with its ruling against pastor
6448                               Chinese tour operators halt tr

In [33]:
# Human right
most_related_docs(df_nk, nk_doc_topic_dist, 5, num_docs=20)['title']

14090         Time names N.K. leader among 100 influential people, calls him 'exploiter of fears'
23564                                                   N. Korea has about 2,500 armored vehicles
599                                   US weekly Time shortlists Kim, Trump for Person of the Year
8045                                                                    The murder as it happened
23239                                          N. Korea all out to discredit rights abuse charges
12545         Aid supply to N.K. should be 'quid pro quo' for human rights improvement: professor
22180                                UN expert demands international action on N.Korea abductions
22855                          S. Korean watchdog to hold forum on N.K. human rights in Indonesia
18742                                 N. Korea's all-female music band disappears from broadcasts
6500                                   S. Korea seeks to set up museum on NK human rights by 2019
11698               

In [35]:
# Reactor
most_related_docs(df_nk, nk_doc_topic_dist, 8, num_docs=20)['title']

12948                          N. Korea has begun nuclear reprocessing for plutonium production: report
12920               U.S. views N.K. with 'great concern' amid reports of reprocessing: State Department
13193                                 N. Korea continues to show signs of nuclear reprocessing activity
8542                                      Satellite image shows NK restarts plutonium-producing reactor
13100                                N. Korea showing possible signs of reprocessing activity: 38 North
16324                N.K. nuclear reactor running intermittently, uranium enrichment plant in operation
14506         N. Korea could have already started harvesting plutonium from spent fuel: U.S. think tank
12973                                            IAEA: N. Korea could have started nuclear reprocessing
14241                 Satellite imagery shows strong signs of N. Korea's nuclear reprocessing: 38 North
12803           N. Korea believed to have produced additional 4-

In [36]:
# Olympics game
most_related_docs(df_nk, nk_doc_topic_dist, 41, num_docs=20)['title']

19477                     U.S. puts N. Korea among world's worst countries for human trafficking
12426                  U.S. puts North Korea among world's worst countries for human trafficking
4456                     US labels N. Korea one of world's worst countries for human trafficking
17239                              Hyundai, Kia likely to see SUV sales exceed 100,000 in Europe
17973                                   Obama blacklists North Korea again for human trafficking
10474                                   Obama blacklists North Korea again for human trafficking
17241                            Park's memoir tops bestseller chart in China's online bookstore
11607                        North Korean army sergeant kills company commander's family: report
4379                                                     NK condemns US human trafficking report
16517                                                         N.K. quake magnitude raised to 4.8
7639                          

In [31]:
# Olympics game
most_related_docs(df_nk, nk_doc_topic_dist, 43, num_docs=20)['title']

6146                                            Trump: Xi should be rewarded for lowering tensions with NK
6143     US experts call for 'secondary sanctions' on China to get Beijing to exercise real pressure on NK
17003                                      Beijing vows to improve relations with Pyongyang to 'new level'
16068                                                           China won't take N. Korea off life support
4249                                            Trump questions sincerity of China's efforts to rein in NK
6109                                        McCain: China should suffer 'penalties' if it won't rein in NK
5886                                    China's handling of NK 'litmus test' of US-China relations: Snyder
5229                                             Trump says NK missile launch 'great disrespect' for China
16879                                              China still mum on N. Korean band's abrupt cancellation
22386                                

In [34]:
# Game, attack
most_related_docs(df_nk, nk_doc_topic_dist, 29, num_docs=20)['title']

3558                                                          North Korea launches own “FIFA” PC game
11779                                              N. Korea behind latest online mall hacking: police
11901                                       Gov't bolsters countermeasures to deter N.K. cyberattacks
11900                                       Gov't bolsters countermeasures to deter N.K. cyberattacks
12173                                      More N.Korean defectors’ food trucks coming to Busan, Jeju
502      [PyeongChang 2018] IOC chief apparently seeks to visit NK over PyeongChang Olympics: sources
402                                                NK defector soldier given free choco pies for life
12                                                Number of N. Koreans seeking asylum in Europe drops
662          [Newsmaker] NK’s PyeongChang participation looks bleak with figure-skating deadline miss
482                                  [PyeongChang 2018] IOC chief seeking to visit

In [22]:
df_nk_missile = df[(df['section'] == 'North Korea') & (df['agg_title_body'].str.contains('issile', flags=re.IGNORECASE, regex=True))]

In [23]:
# # Find all doc contains "North Korea" with Missile
# nk_missile_vectorizer = CountVectorizer(tokenizer=spacy_tokenizer)
# nk_missile_data_vectorized = nk_missile_vectorizer.fit_transform(df_nk_missile['agg_title_body'])
# joblib.dump(nk_missile_vectorizer, '../data/nk_missile_vectorizer.csv')
# joblib.dump(nk_missile_data_vectorized, '../data/nk_missile_data_vectorized.csv')

In [24]:
nk_missile_vectorizer = joblib.load('../data/nk_missile_vectorizer.csv')
nk_missile_data_vectorized = joblib.load('../data/nk_missile_data_vectorized.csv')

In [25]:
nk_missile_lda_components = 25
# nk_missile_lda = LatentDirichletAllocation(n_components=25, random_state=0)
# nk_missile_lda.fit(nk_missile_data_vectorized)
# joblib.dump(nk_missile_lda, '../data/nk_missile_{}lda.csv'.format(nk_missile_lda_components))

In [26]:
nk_missile_lda = joblib.load('../data/nk_missile_{}lda.csv'.format(nk_missile_lda_components))

In [27]:
print_top_words(nk_missile_lda, nk_missile_vectorizer, n_top_words=25)


Topic #0: trump north korea president say moon donald missile leader nuclear korean house north un administration pressure china threat security regime sanction white wa pyongyang cheong military

Topic #1: sanction north north korea resolution korean say country council export nuclear ban china foreign pyongyang security source un include trade new missile oil $ bank report

Topic #2: nuclear north weapon say bomb plutonium test produce institute develop uranium reactor expert north korea missile program u.s material kilogram report facility fuel pyongyang believe year

Topic #3: number broadcast page cell order chairperson agent socialist encrypt radio executive book broadcasting announcer spy day resume party conference operate address kind north korea mysterious message


Topic #5: north council un say kim resolution korean security north korea jong member dprk sanction state u.n international official pyongyang nuclear statement human right report missile unsc

Topic #6: nuclear 

In [28]:
# nk_missile_doc_topic_dist = pd.DataFrame(nk_missile_lda.transform(nk_missile_data_vectorized))
# nk_missile_doc_topic_dist.to_csv('../data/nk_missile_doc_topic_dist.csv', index=False)

In [29]:
nk_missile_doc_topic_dist = pd.read_csv('../data/nk_missile_doc_topic_dist.csv')

In [39]:
# Defense missile
most_related_docs(df_nk_missile, nk_missile_doc_topic_dist, 17, num_docs=20)['title']

12754       U.S. think tank report lays out military strike scenario against North Korea
10567                        Seoul buys more Taurus missiles amid Pyongyang nuke threats
13014         S. Korea has undoubtedly clear intention to deploy THAAD: defense minister
3676           S. Korea to upgrade Patriot missiles against N. Korea's threats: minister
20650                   THAAD is 'excellent deterrent' against N. Korean threats: McCain
12213                                THAAD can intercept North's SLBMs: defense minister
983                South Korea refuses to share military intelligence with Japan: report
16913                          Seoul to beef up anti-terrorism posture against Pyongyang
8870                    CIA chief nominee picks N. Korea as one of biggest threats to US
2821           US vows close consultation with S. Korea over military options against NK
6071           PACOM commander: It's matter of time before NK perfects ICBM capabilities
1957                 

In [None]:
'''
22641                N. Korea bans foreigners from marathon race over Ebola concerns
18742                                 N. Korea's all-female music band disappears from broadcasts
9502                    Suspected NK attackers hack into S. Korea's cyber command through main server
19477                     U.S. puts N. Korea among world's worst countries for human trafficking
13014         S. Korea has undoubtedly clear intention to deploy THAAD: defense minister
'''

In [40]:
related_issue_index = [22641, 18742, 9502, 19477, 13014]

In [41]:
# Extract entities from sentences
def extract_entity(sents, index, df):
    print("[ Detailed Information (per event) ]\n")
    
    ent_per = []
    ent_org = []
    ent_loc = []
    
    event = df.iloc[index].sort_values(by=['time'])['title'].tolist()
    
    for i in reversed(range(len(sents))):
        doc = nlp(sents[i])
        for ent in doc.ents:
            word = ent.text.title()
            if word in ent_per or word in ent_org or word in ent_loc:
                continue
            if ent.label_ == 'PERSON':
                ent_per.append(word)
            elif ent.label_ == 'ORG':
                ent_org.append(word)
            elif ent.label_ in ['GPE', 'LOC']:
                ent_loc.append(word)
        
        print("Event: ", event[i])
        print("- Person: ", ", ".join([i for i in ent_per]))
        print("- Organization: ", ", ".join([i for i in ent_org]))
        print("- Place: ", ", ".join([i for i in ent_loc]))
        print()

In [46]:
def print_events(df, index):
    event = df.iloc[index].sort_values(by=['time'])['title'].tolist()
    print("[ Related-Issue Events ]\n")
    print(", ".join([i for i in event]))
    print()

In [47]:
sents_nk = [ ' '.join(spacy_tokenizer(s)) for s in df.iloc[related_issue_index].sort_values(by=['time'])['body']]
print_events(df, related_issue_index)
extract_entity(sents_nk, related_issue_index, df)

[ Related-Issue Events ]

N. Korea bans foreigners from marathon race over Ebola concerns, U.S. puts N. Korea among world's worst countries for human trafficking, N. Korea's all-female music band disappears from broadcasts, S. Korea has undoubtedly clear intention to deploy THAAD: defense minister, Suspected NK attackers hack into S. Korea's cyber command through main server

[ Detailed Information (per event) ]

Event:  Suspected NK attackers hack into S. Korea's cyber command through main server
- Person:  Han Min Koo, Kim Jin Pyo, Kim
- Organization:  National Defense Committee
- Place:  South Korea, China, North Korea, The United States

Event:  S. Korea has undoubtedly clear intention to deploy THAAD: defense minister
- Person:  Han Min Koo, Kim Jin Pyo, Kim
- Organization:  National Defense Committee
- Place:  South Korea, China, North Korea, The United States, U.S., Singapore, Korea

Event:  N. Korea's all-female music band disappears from broadcasts
- Person:  Han Min Koo, Kim 