In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
import random
import unicodedata
from string import punctuation
from string import digits
from nltk.stem import WordNetLemmatizer
import joblib

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score
from sklearn.decomposition import LatentDirichletAllocation

from tqdm import tqdm

import spacy
from spacy.matcher import Matcher

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
### Reading data
data_dir = 'data/'
filename_prefix = 'koreaherald_1517_'
df = []

for i in range(8):
    df.append(pd.read_json(os.path.join(data_dir, filename_prefix + str(i) + '.json')))
df = pd.concat(df)
df.reset_index(inplace=True)
df = df.rename(columns=dict(zip(df.columns,[df.columns[i].strip() for i in range(len(df.columns))])))
df.drop('index', inplace=True, axis=1)

df['agg_title_body'] = df['title'] + df['body']

df.shape

(23769, 7)

In [3]:
nlp = spacy.load('en_core_web_lg')

In [4]:
### Lemmatization tool
stemmer = WordNetLemmatizer()
### Change similar words to the same word
UN_WORD = "The United Nations"
US_WORD = "The United States"
NK_WORD = "North Korea"
SK_WORD = "South Korea"

similar_words = {
    # Change to "The United States"
    "U.S.": US_WORD,
    "US": US_WORD,
    "USA": US_WORD,
    "United States": US_WORD,
    "United States'": US_WORD,
    "The United States'": US_WORD,
    
    # Change to "North Korea"
    "NK": NK_WORD,
    "NK's": NK_WORD,
    "N. Korea": NK_WORD,
    "N. Korea's": NK_WORD,
    "North Korea's": NK_WORD,
    
    # Change to "South Korea"
    "SK": SK_WORD,
    "SK's": SK_WORD,
    "S. Korea": SK_WORD,
    "S. Korea's": SK_WORD,
    "South Korea's": SK_WORD,
    
    # Change to "The United Nations"
    "United Nations": UN_WORD,
    "United Nations'": UN_WORD,
    "The United Nations'": UN_WORD,
    "UN": UN_WORD,
}

### Transform function
def text_cleaning(s: str):
        
    def replace_strange_char(s: str):
        non_en_chars = {
            "’": "'",
            "‘": "'"
        }

        def remove_non_en_chars(txt):
            # remove non english characters
            txt = convert_latin_chars(txt)
            for char in non_en_chars.keys():
                txt = re.sub(char, non_en_chars[char], txt)
            txt = re.sub(r'[^\x00-\x7F]+', ' ', txt)
            return txt

        def convert_latin_chars(txt):
            # convert latin characters
            return ''.join(char for char in unicodedata.normalize('NFKD', txt) if unicodedata.category(char) != 'Mn')

        s = remove_non_en_chars(s)
        s = convert_latin_chars(s)
        return s
    s = replace_strange_char(s)
    for key,value in similar_words.items():
        s = re.sub(key, value, s)
    return s

def spacy_tokenizer(s: str):
    # Change similar terms to the same term
    new_str = text_cleaning(s)
    doc = nlp(s)
    # Group tokens
    matcher = Matcher(nlp.vocab)
    token_groupup_pattern = [
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "nations"}],
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "states"}],
        [{"LOWER": "north"}, {"LOWER": "korea"}],
        [{"LOWER": "south"}, {"LOWER": "korea"}],
    ]
    matcher.add("TermGroup",token_groupup_pattern)
    matches = matcher(doc)
    merge_doc = []
    for nid, start, end in matches:
        merge_doc.append((start,end))
    with doc.retokenize() as retokenizer:
        for i in range(len(merge_doc)-1,-1,-1):
            retokenizer.merge(doc[merge_doc[i][0]:merge_doc[i][1]])
        
    # Remove all stopword, punctuation, number
    tokens = [ token.lemma_.lower() for token in doc \
              if not token.is_stop and not token.is_punct and not token.like_num and token.lemma_.strip()!= '']
    return tokens


In [5]:
def print_top_words(model, vectorizer, n_top_words):
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        message = "\nTopic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [6]:
def most_related_docs(df, doc_topic_dist, topic_index, num_docs=5):
    sorted_doc = doc_topic_dist.sort_values(by=['{}'.format(topic_index)], ascending=False)
    return df.iloc[sorted_doc[:num_docs].index]

In [7]:
df_nk = df[df['section'] == 'North Korea']

In [8]:
# Find all doc contains "North Korea"
nk_vectorizer = CountVectorizer(tokenizer=spacy_tokenizer)
nk_data_vectorized = nk_vectorizer.fit_transform(df_nk['agg_title_body'])
joblib.dump(nk_vectorizer, '../data/nk_vectorizer.csv')
joblib.dump(nk_data_vectorized, '../data/nk_data_vectorized.csv')

['../data/nk_data_vectorized.csv']

In [9]:
nk_vectorizer = joblib.load('../data/nk_vectorizer.csv')
nk_data_vectorized = joblib.load('../data/nk_data_vectorized.csv')

In [10]:
nk_lda_components = 50
nk_lda = LatentDirichletAllocation(n_components=nk_lda_components, random_state=0)
nk_lda.fit(nk_data_vectorized)
joblib.dump(nk_lda, '../data/nk_{}lda.csv'.format(nk_lda_components))

['../data/nk_50lda.csv']

In [11]:
nk_lda = joblib.load('../data/nk_{}lda.csv'.format(nk_lda_components))

In [12]:
print_top_words(nk_lda, nk_vectorizer, n_top_words=25)


Topic #0: north korean say defector south koreans south korea north korea defect china ministry source defection restaurant worker seoul border official government number send yonhap country korea accord

Topic #1: north korean china say export trade north korea report import source coal oil year $ korea product chinese accord sanction country soldier yonhap pyongyang currency un

Topic #2: ebola tour tourist virus spread marathon disease traffic travel pyongyang outbreak child mers korea health middle respiratory month lift country deadly north report say park

Topic #3: sea ship korean border water island south vessel boat line near maritime north korea fishing rescue cross yellow patrol say northern guard ministry military east

Topic #4: south korea north korea military uganda large artillery risk say museveni propel antiaircraft fire sign missile time ugandan country chief tel year un north slow gap korea

Topic #5: lee north kim visit korean trip leader jung meet dae jong lady c



In [13]:
nk_doc_topic_dist = pd.DataFrame(nk_lda.transform(nk_data_vectorized))
nk_doc_topic_dist.to_csv('../data/nk_doc_topic_dist.csv', index=False)

In [14]:
nk_doc_topic_dist = pd.read_csv('../data/nk_doc_topic_dist.csv')

In [15]:
'''
LDA NK without missile 50 topics
Topic 1: Russia
Topic 7: NK
Topic 9: nuclear
Topic 10: People
Topic 23: Chinese
Topic 25: Olympic
Topic 28: reactor
Topic 32: nuclear
Topic 35: military
Topic 37: security
Topic 39: nuclear
Topic 40: bomb
Topic 41: Trump
Topic 44: economic
Topic 45: north and south relation
Topic 46: launching missile
Topic 47: launching rocket
Topic 48: Attack/ military
Topic 49: ?
'''

In [16]:
# NK soldier freedom
most_related_docs(df_nk, nk_doc_topic_dist, 1, num_docs=20)['title'].tolist()

['N. Korean smugglers making money through illegal pork exports to China: report',
 'N. Korean traders going all out to receive permits: report',
 'N. Korea stops rare earth metal exports to China',
 'China toughens checks on smuggling with N. Korea: sources',
 'China stopped refined oil exports to N. Korea in Oct: report',
 "Exports of mineral resources backs NK's economy: report",
 "Kim Jong-un's drive for domestic products likely to fall apart: report",
 'China did not import coal, other banned items from N. Korea: VOA',
 "Russia's oil shipments to NK on sharp increase",
 'Taiwan cuts off trade with NK',
 'No. of malaria-infected N. Koreans drops for 4th consecutive yr in 2016: report',
 'China notifies UN of imports of coal from NK',
 "N. Korea's coal exports to China hit record high in Aug.",
 'China imposes limit on oil supply to North Korea',
 "China's Shandong Prov. biggest importer of NK's coal",
 'Oil prices in Pyongyang stable despite sanctions: diplomat',
 'North Korean sol

In [17]:
# Security
most_related_docs(df_nk, nk_doc_topic_dist, 7, num_docs=20)['title'].tolist()

['NK renames youth association as it seeks to beef up internal solidarity',
 'N.K. set to complete 70-day loyalty drive ahead of party congress',
 'N.K. may hold party congress in May without foreign guests: source',
 'N. Korea to hold first party congress in over three decades on May 6',
 'Pyongyang kicks off preparations for May party congress',
 'N.K. to launch another work-hard campaign in June: Seoul',
 'N.K. opens first youth congress in 23 years',
 'NK opens new residential area in Pyongyang: report',
 'N.K. uses party congress to declare nukes its priority: report',
 "N. Korea's party adopts decision on nukes at key congress",
 'N. Korea likely to convene ruling party congress on May 5: source',
 'NK announces successful conclusion of 200-day work-hard drive',
 'N. Korea opens first party congress in 36 years',
 'N.K. to hold major parliamentary meeting in late June',
 'N. Korea set to open first party congress in 36 years',
 'With new town project, NK seeks internal solidarity

In [18]:
# Nuclear
most_related_docs(df_nk, nk_doc_topic_dist, 9, num_docs=20)['title'].tolist()

["Unification minister asks for Russia's cooperation in resolving NK nukes",
 'THAAD not helpful to inter-Korean relations: Russian envoy',
 'North Korea demand for peace treaty worth taking note: Russian envoy',
 'Kim Jong-un to visit Moscow: Russian envoy',
 "S. Korea in diplomatic consultations over N. Korea's nuke test",
 'North Korea releases Russian yacht and its crew',
 "South Korea, U.S. denounce N.K. test as 'grave provocation'",
 "N.K. nuke 'exit strategy' may be needed",
 'Hungry N. Korean soldiers committing various crimes: report',
 'Bulgaria reaffirms support for denuclearization, vows to fully implement N. Korea sanctions',
 'Chronology of North Korea’s nuclear, missile programs',
 "[Breaking] NK's missile flew about 500 kilometers: S. Korea's JCS",
 "[Breaking] S. Korean President Moon convenes NSC meeting on NK's missile launch",
 'Moon orders meeting of national security council over N. Korean missile launch',
 '[Graphic News] North Korea’s exported labor',
 "South Ko

In [19]:
# Olympics game
most_related_docs(df_nk, nk_doc_topic_dist,12, num_docs=20)['title'].tolist()

['Seoul welcomes adoption of new UNSC sanctions resolution on Pyongyang',
 'S. Korea mulls ‘diverse’ diplomatic efforts to intensify pressure on NK',
 'Mandate of U.N. panel of experts on N.K. sanctions extended until next year',
 'UN Council unanimously condemns North Korea missile test',
 "'Strongest' U.N. sanctions agreed, but implementation still challenging",
 'Japan announces fresh N. Korea sanctions after rocket launch',
 'South Korea welcomes new U.S. sanctions on North Korea',
 'S. Korea urges N. Korea to abandon nuclear ‘delusion’',
 "N. Korea condemns S. Korea's unilateral sanctions",
 'European nations welcome new U.N. sanctions on N. Korea',
 'Kuwait submits implementation report on Resolution 2371',
 'UN Security Council condemns NK nuclear test, vows to begin work immediately on sanctions resolution',
 'UNSC discloses additional items banned from entering N.K.',
 'EU says it is mulling tougher sanctions against NK',
 'U.N. sanctions should be beyond N. Korea’s imaginatio

In [20]:
# Sex violence
most_related_docs(df_nk, nk_doc_topic_dist, 7, num_docs=20)['title'].tolist()

['NK renames youth association as it seeks to beef up internal solidarity',
 'N.K. set to complete 70-day loyalty drive ahead of party congress',
 'N.K. may hold party congress in May without foreign guests: source',
 'N. Korea to hold first party congress in over three decades on May 6',
 'Pyongyang kicks off preparations for May party congress',
 'N.K. to launch another work-hard campaign in June: Seoul',
 'N.K. opens first youth congress in 23 years',
 'NK opens new residential area in Pyongyang: report',
 'N.K. uses party congress to declare nukes its priority: report',
 "N. Korea's party adopts decision on nukes at key congress",
 'N. Korea likely to convene ruling party congress on May 5: source',
 'NK announces successful conclusion of 200-day work-hard drive',
 'N. Korea opens first party congress in 36 years',
 'N.K. to hold major parliamentary meeting in late June',
 'N. Korea set to open first party congress in 36 years',
 'With new town project, NK seeks internal solidarity

In [21]:
# Weapon development
most_related_docs(df_nk, nk_doc_topic_dist, 41, num_docs=20)['title'].tolist()

['Korea reviewing support for loss-ridden investors in inter-Korean projects',
 'Korean companies seek to check equipment in Kaesong complex',
 'Two Koreas to hold talks to negotiate Gaeseong wage issues',
 'North Korea kicks out South Koreans, freezes assets in Gaeseong',
 'Businessmen call for probe into shutdown of factory park in N. Korea',
 'North Korea freezes Gaeseong assets, expels South Korean workers',
 'Korean firms call for full support after factory shutdown',
 'Korea rolls out support measures for companies forced out of Gaeseong',
 'Koreas set to hold talks on joint industrial park amid wage row',
 'Korea to approve further compensation for Kaesong firms soon',
 'Kaesong firms pressure Seoul to approve N.K. visit',
 'Koreas divided over land use fee at joint industrial park',
 'N. Korea hints operation of inter-Korean industrial complex',
 "NK's resumption of Kaesong complex violates property rights: Seoul",
 "Gaeseong firms pressure gov't to approve N.K. visit",
 'Korea

In [22]:
# Disease
most_related_docs(df_nk, nk_doc_topic_dist, 46, num_docs=20)['title'].tolist()

['[News Focus] Why did North Korea fail to detect US bombers?',
 'Lotte Group approves land swap deal for THAAD battery',
 'N. Korea releases dam water without prior notice',
 'Seoul scores success in test of countermissile system',
 'N. Korean drone spied on THAAD site: military',
 'Seoul buys more Taurus missiles amid Pyongyang nuke threats',
 'N. Korea takes no immediate action in response to US bomber flight: spy agency',
 '[Newsmaker] Loudspeakers mightier than the sword',
 'N. Korea forward deploys amphibious landing crafts carrying special forces',
 'Japan carries out anti-missile drill following NK missile launch',
 'South Korea allocates budget for ‘decapitation unit’',
 'South Korea divided over THAAD',
 'Drone that spied on THAAD was from N. Korea: military',
 'N. Korea increases drone sorties along western border',
 'NK drone presumed to have spied on THAAD angle: lawmaker',
 'Wounded NK soldier transferred to general ward from ICU',
 'N. Korea discharges dam water near bor

In [23]:
df_nk_missile = df[(df['section'] == 'North Korea') & (df['agg_title_body'].str.contains('issile', flags=re.IGNORECASE, regex=True))]

In [24]:
# Find all doc contains "North Korea" with Missile
nk_missile_vectorizer = CountVectorizer(tokenizer=spacy_tokenizer)
nk_missile_data_vectorized = nk_missile_vectorizer.fit_transform(df_nk_missile['agg_title_body'])
joblib.dump(nk_vectorizer, '../data/nk_missile_vectorizer.csv')
joblib.dump(nk_data_vectorized, '../data/nk_missile_data_vectorized.csv')

['../data/nk_missile_data_vectorized.csv']

In [25]:
nk_missile_vectorizer = joblib.load('../data/nk_missile_vectorizer.csv')
nk_missile_data_vectorized = joblib.load('../data/nk_missile_data_vectorized.csv')

In [26]:
nk_missile_lda_components = 25
nk_missile_lda = LatentDirichletAllocation(n_components=25, random_state=0)
nk_missile_lda.fit(nk_missile_data_vectorized)
joblib.dump(nk_missile_lda, '../data/nk_missile_{}lda.csv'.format(nk_missile_lda_components))

['../data/nk_missile_25lda.csv']

In [27]:
nk_missile_lda = joblib.load('../data/nk_missile_{}lda.csv'.format(nk_missile_lda_components))

In [28]:
print_top_words(nk_missile_lda, nk_missile_vectorizer, n_top_words=25)


Topic #0: korean north south say north korea ministry government complex koreans seoul worker south korea defector industrial border korea official park unification firm kaesong wage factory yonhap china

Topic #1: north north korea report year percent say china korean trade country $ korea export source sanction yonhap accord chinese import economic oil pyongyang coal show international

Topic #2: winter olympics flood pyeongchang game olympic north korea say damage sport people ebola virus participation spread internet red games event report hit ioc disease rain miss

Topic #3: sea island boat fishing border line northern maritime yellow facto de vessel limit patrol nll near yeonpyeong western conservative say plan draw korea guard water

Topic #4: military korean say drill north force exercise korea war joint south korea strike north korea air attack provocation fire south peninsula threat washington missile nuclear warn news

Topic #5: north lee kim korean visit number say summit 

In [30]:
nk_missile_doc_topic_dist = pd.DataFrame(nk_missile_lda.transform(nk_missile_data_vectorized))
nk_missile_doc_topic_dist.to_csv('../data/nk_missile_doc_topic_dist.csv', index=False)

In [31]:
nk_missile_doc_topic_dist = pd.read_csv('../data/nk_missile_doc_topic_dist.csv')

In [32]:
#### NK with missile 25 topics
'''
LDA NK with missile 50 topics
Topic 0: Olympics
Topic 1: 
Topic 2: launching missile/rocket
'''

'''
LDA NK with missile 25 topics
Topic 0: Olympics
Topic 1: 
Topic 2: launching missile/rocket
'''


'\nLDA NK with missile 25 topics\nTopic 0: Olympics\nTopic 1: \nTopic 2: launching missile/rocket\n'

In [33]:
# Olympics
most_related_docs(0, num_docs=20)['title'].tolist()

TypeError: most_related_docs() missing 2 required positional arguments: 'doc_topic_dist' and 'topic_index'