In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
import random
import unicodedata
from string import punctuation
from string import digits
from nltk.stem import WordNetLemmatizer
import joblib

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from sklearn.decomposition import LatentDirichletAllocation

from tqdm import tqdm
import matplotlib.pyplot as plt

import spacy
from spacy.matcher import Matcher

from rake_nltk import Rake

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
### Reading data
data_dir = 'data/'
filename_prefix = 'koreaherald_1517_'

# The corpus is split over eight JSON shards named <prefix>0.json .. <prefix>7.json.
frames = [
    pd.read_json(os.path.join(data_dir, filename_prefix + str(shard) + '.json'))
    for shard in range(8)
]

# Stack the shards, replace the per-shard row labels with a fresh 0..n-1 index,
# and strip stray whitespace from the column names.
df = (
    pd.concat(frames)
    .reset_index()
    .rename(columns=str.strip)
    .drop('index', axis=1)
)

df.shape

(23769, 6)

In [3]:
# spaCy pipeline used by spacy_tokenizer (parsing, lemmas, stop-word flags).
# The 'en_core_web_lg' model package must be installed for this load to succeed.
nlp = spacy.load('en_core_web_lg')

In [4]:
### Lemmatization tool
# NOTE: despite the name, this is a WordNet lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()

### Change similar words to the same word
# Canonical spellings that every alias below is rewritten to.
UN_WORD = "The United Nations"
US_WORD = "The United States"
NK_WORD = "North Korea"
SK_WORD = "South Korea"

# (canonical term, aliases) pairs. The order of groups and of aliases inside a
# group defines the insertion order of `similar_words`.
_ALIAS_GROUPS = (
    # Change to "The United States"
    (US_WORD, ("U.S.", "US", "USA", "United States", "United States'", "The United States'")),
    # Change to "North Korea"
    (NK_WORD, ("NK", "NK's", "N. Korea", "N. Korea's", "North Korea's")),
    # Change to "South Korea"
    (SK_WORD, ("SK", "SK's", "S. Korea", "S. Korea's", "South Korea's")),
    # Change to "The United Nations"
    (UN_WORD, ("United Nations", "United Nations'", "The United Nations'", "UN")),
)

# alias -> canonical term lookup consumed by text_cleaning.
similar_words = {
    alias: canonical
    for canonical, aliases in _ALIAS_GROUPS
    for alias in aliases
}

### Transform function
def text_cleaning(s: str, replacements=None) -> str:
    """Fold text to ASCII and rewrite term aliases to their canonical form.

    Steps, in order:
      1. NFKD-normalise and drop combining marks, so accented letters lose
         their diacritics.
      2. Turn curly single quotes into a plain apostrophe.
      3. Replace every remaining run of non-ASCII characters with one space.
      4. Replace each alias from ``replacements`` with its canonical term.

    Alias matching rules (step 4):
      * Aliases are matched literally (``re.escape``), so the dots in
        "U.S." or "N. Korea" are real dots, not regex wildcards.
      * All aliases are applied in a single pass, longest first, so replaced
        text is never re-scanned ("U.S." cannot become
        "The The United States").
      * An alias never starts in the middle of a word ("US" is not matched
        inside "RUSSIA").
      * Single-token aliases such as "US" or "UN" must also end at a word
        boundary ("USFK" and "UNC" are left alone). Multi-word aliases may
        carry a suffix, so "N. Korean" still becomes "North Korean".
      * Canonical terms match themselves, which keeps an alias such as
        "United States" from firing inside "The United States".

    Args:
        s: Raw text.
        replacements: Optional ``{alias: canonical}`` mapping. When omitted,
            the module-level ``similar_words`` table is used.

    Returns:
        The cleaned text.
    """
    if replacements is None:
        replacements = similar_words

    # NFKD splits an accented letter into base letter + combining mark
    # (category 'Mn'); dropping the marks leaves the plain letter.
    s = ''.join(
        ch for ch in unicodedata.normalize('NFKD', s)
        if unicodedata.category(ch) != 'Mn'
    )
    # U+2019 / U+2018 (curly single quotes) -> ASCII apostrophe.
    s = s.translate({0x2019: "'", 0x2018: "'"})
    s = re.sub(r'[^\x00-\x7F]+', ' ', s)

    if not replacements:
        return s

    # Canonical forms map to themselves; explicit aliases take precedence.
    lookup = {canonical: canonical for canonical in replacements.values()}
    lookup.update(replacements)

    def _alternative(alias):
        # Single-token aliases must not be followed by a word character.
        trailing = '' if ' ' in alias else r'(?!\w)'
        return re.escape(alias) + trailing

    ordered = sorted(lookup, key=len, reverse=True)  # longest alias wins
    pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(_alternative(alias) for alias in ordered) + ')'
    )
    return pattern.sub(lambda match: lookup[match.group(0)], s)

def spacy_tokenizer(s: str):
    """Tokenise a document into lower-cased lemmas.

    The text is first normalised with ``text_cleaning``. It is then parsed
    with the module-level spaCy pipeline ``nlp``. Occurrences of
    "the united nations", "the united states", "north korea" and
    "south korea" (case-insensitive) are merged into single tokens, and
    stop words, punctuation, number-like tokens and tokens with a blank
    lemma are dropped.

    Args:
        s: Raw document text.

    Returns:
        list[str]: The lower-cased lemma of every kept token.
    """
    # Change similar terms to the same term
    cleaned = text_cleaning(s)
    # Fix: parse the *cleaned* text. The original parsed the raw `s`, which
    # silently discarded the normalisation computed on the line above.
    doc = nlp(cleaned)

    # Group tokens: multi-word terms that should count as one token.
    matcher = Matcher(nlp.vocab)
    token_groupup_pattern = [
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "nations"}],
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "states"}],
        [{"LOWER": "north"}, {"LOWER": "korea"}],
        [{"LOWER": "south"}, {"LOWER": "korea"}],
    ]
    matcher.add("TermGroup", token_groupup_pattern)

    # Materialise the (start, end) spans before retokenising.
    spans = [(start, end) for _match_id, start, end in matcher(doc)]
    with doc.retokenize() as retokenizer:
        for start, end in spans:
            retokenizer.merge(doc[start:end])

    # Remove all stopword, punctuation, number
    tokens = [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and not token.like_num
        and token.lemma_.strip() != ''
    ]
    return tokens


In [5]:
# Articles used for the topic model; most_related_docs reads titles from this frame.
# NOTE(review): presumably the subset of `df` that mentions North Korea — confirm
# how df_nk.csv was produced. Also note this path is '../data/' while the JSON
# shards above are read from 'data/'.
nk_df = pd.read_csv('../data/df_nk.csv')

In [6]:
# # Find all doc contains "North Korea"
# nk_vectorizer = CountVectorizer(tokenizer=spacy_tokenizer)
# nk_data_vectorized = nk_vectorizer.fit_transform(nk_df['body'])
# joblib.dump(nk_vectorizer, '../data/nk_vectorizer.csv')
# joblib.dump(nk_data_vectorized, '../data/nk_data_vectorized.csv')

['../data/nk_data_vectorized.csv']

In [7]:
# Reload the artifacts written by the commented-out joblib.dump calls in the
# previous cell: the fitted CountVectorizer and the vectorized nk_df['body'].
# Despite the '.csv' extension these are joblib pickles, not CSV files.
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
# NOTE(review): the vectorizer was created with tokenizer=spacy_tokenizer, so that
# function presumably has to be defined before this load — confirm.
nk_vectorizer = joblib.load('../data/nk_vectorizer.csv')
nk_data_vectorized = joblib.load('../data/nk_data_vectorized.csv')

In [8]:
# lda = LatentDirichletAllocation(n_components=50, random_state=0)
# lda.fit(nk_data_vectorized)
# joblib.dump(lda, '../data/nk_lda.csv')

['../data/nk_lda.csv']

In [9]:
# Reload the model written by the commented-out cell above
# (LatentDirichletAllocation, n_components=50, random_state=0).
# The file is a joblib pickle despite its '.csv' extension; only load trusted files.
lda = joblib.load('../data/nk_lda.csv')

In [10]:
def print_top_words(model, vectorizer, n_top_words):
    """Print the highest-weighted terms of every topic of a fitted topic model.

    One line is printed per topic, preceded by a blank line, in the form
    ``Topic #<idx>: term1 term2 ...`` with terms ordered from highest to
    lowest weight. A final empty line is printed at the end.

    Args:
        model: Object exposing ``components_``, an iterable of 1-D numpy
            arrays holding one weight per vocabulary term.
        vectorizer: Fitted vectorizer providing the vocabulary through
            ``get_feature_names_out()`` or the legacy ``get_feature_names()``.
        n_top_words: Number of terms to show per topic.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back so older installations keep working.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed slice takes the n largest weights.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        message = "\nTopic #%d: " % topic_idx
        message += " ".join(feature_names[i] for i in top_indices)
        print(message)
    print()

In [11]:
# Show the 25 highest-weighted terms of every LDA topic, for manual labelling.
print_top_words(lda, nk_vectorizer, n_top_words=25)


Topic #0: minister asean russia foreign china forum regional ri russian meeting talk wang asian say southeast putin attend north philippines yi security north korea moscow korean country

Topic #1: korean north say north korea missile south border complex official city government source factory kaesong ship seoul system range south korea report zone nuclear industrial artillery gps

Topic #2: india modi railway sinbo membership osjd trans university narendra student organization hyong choson jik unanimous pro bolster join yeo course member siberian attend shipyard interoperable

Topic #3: north korean complex south industrial say wage government firm worker kaesong park factory ministry company north korea seoul decision gaeseong joint pay inter official south korea month

Topic #4: hwang ahn prime kyo minister act poland kuwait national polish cabinet security al provocation president posture ensure court effort stress session grow european carry thornberry

Topic #5: visit world hir



In [12]:
# Hand-written labels for selected topics of the 50-topic LDA model.
# NOTE(review): the saved output of this cell shows different labels for topics 1
# and 7 than the text below, so the cell was edited after it last ran. Several
# labels also disagree with the per-topic cells further down (e.g. topics 41 and
# 46). Re-run and reconcile before relying on them.
'''
LDA 50 topics
Topic 1: Russia
Topic 7: NK
Topic 9: nuclear
Topic 10: People
Topic 23: Chinese
Topic 25: Olympic
Topic 28: reactor
Topic 32: nuclear
Topic 35: military
Topic 37: security
Topic 39: nuclear
Topic 40: bomb
Topic 41: Trump
Topic 44: economic
Topic 45: north and south relation
Topic 46: launching missile
Topic 47: launching rocket
Topic 48: Attack/ military
Topic 49: ?
'''

'\nLDA 50 topics\nTopic 1: nulear/ missile\nTopic 7: Security\nTopic 9: nuclear\nTopic 10: People\nTopic 23: Chinese\nTopic 25: Olympic\nTopic 28: reactor\nTopic 32: nuclear\nTopic 35: military\nTopic 37: security\nTopic 39: nuclear\nTopic 40: bomb\nTopic 41: Trump\nTopic 44: economic\nTopic 45: north and south relation\nTopic 46: launching missile\nTopic 47: launching rocket\nTopic 48: Attack/ military\nTopic 49: ?\n'

In [13]:
# nk_doc_topic_dist = pd.DataFrame(lda.transform(data_vectorized)).iloc[nk_doc_index]
# nk_doc_topic_dist.to_csv('../data/nk_doc_topic_dist.csv', index=False)

In [14]:
# Document-topic table cached by the commented-out cell above (written with index=False).
# most_related_docs looks topics up by string column name ('0', '1', ...).
# NOTE(review): presumably one row per nk_df row, in the same order — confirm.
nk_doc_topic_dist = pd.read_csv('../data/nk_doc_topic_dist.csv')

In [15]:
def most_related_docs(topic_index, num_docs=5, doc_topic_dist=None, docs=None):
    """Return the documents with the highest weight for one topic.

    Rows of the topic table and of the document frame are paired by position
    (row i of one describes row i of the other). Ranking is done on
    positions, so the result does not depend on either frame's index labels.

    Args:
        topic_index: Topic to rank by. The column is looked up as
            ``str(topic_index)`` first (the form produced by a CSV
            round-trip), then as ``topic_index`` itself.
        num_docs: Maximum number of documents to return.
        doc_topic_dist: Optional document-topic DataFrame. Defaults to the
            module-level ``nk_doc_topic_dist``.
        docs: Optional document DataFrame. Defaults to the module-level
            ``nk_df``.

    Returns:
        pandas.DataFrame: Up to ``num_docs`` rows of ``docs``, ordered from
        highest to lowest topic weight. Ties keep their original row order.

    Raises:
        KeyError: If the topic column does not exist.
    """
    if doc_topic_dist is None:
        doc_topic_dist = nk_doc_topic_dist
    if docs is None:
        docs = nk_df

    column = str(topic_index)
    if column not in doc_topic_dist.columns:
        column = topic_index
    weights = doc_topic_dist[column].to_numpy()

    # Stable sort on the negated weights gives descending order with
    # deterministic tie-breaking; the result is row positions, which is what
    # .iloc expects (the original passed index labels to .iloc).
    positions = (-weights).argsort(kind='stable')[:num_docs]
    return docs.iloc[positions]

In [16]:
# Topic 1 — top titles cover the NK soldier's defection and border / DMZ incidents
most_related_docs(1, num_docs=20)['title'].tolist()

['Video shows NK soldier making 50-meter dash for freedom',
 'North Korea troops violated armistice agreement while chasing defector: UNC',
 'N. Korean troops violated armistice in chasing defector: UNC',
 "NK apparently replaces all border security guards after soldier's defection: source",
 '[Newsmaker] Why did South Korea not return fire to NK chasing defected soldier?',
 'N. Korean soldier drove car before defecting to S. Korea: UNC',
 'NK fired 40 shots at fleeing soldier: military',
 'N.K. behind DMZ landmine blast: JCS',
 'N. Korea patrol ship sent back after crossing inter-Korean',
 'N. Korea deploys buoys near inter-Korean sea border',
 'Military found nearly 260 washed up N.K. landmines in past 6 years',
 'N. Korean soldiers briefly violate border with S. Korea',
 'N.K. seen doubling landmines in DMZ this year: S. Korean military',
 "UNC short of means to handle N. Korea's armistice violation",
 'UNC revises rules to allow machine guns, recoilless rifles, mortars into DMZ',
 

In [17]:
# Topic 7 — labelled "Security".
# NOTE(review): the titles returned are mostly social / legal stories, and the
# same call is repeated in a later cell labelled "Sex violence" — reconcile the label.
most_related_docs(7, num_docs=20)['title'].tolist()

['S. Korea repeals anti-cheating law',
 'Expats decry military for gay sex conviction',
 'Korean military court convicts soldier over gay sex',
 '[Herald Interview] The price of faith for conscientious objectors',
 '[Election 2017] Gender biased language backfires on campaign trail',
 'Most N. Koreans criticize government in private: CSIS survey',
 'N. Korea strengthens penalty for consuming S. Korean pop culture',
 'Female NK soldiers exposed to rape, abuse',
 'Drug trafficking hits major cities in North Korea: report',
 'North Korea gets 1 day off for Chuseok',
 'N. Korea uses women on TV to push forward anti-smoking campaign',
 'Public divided on whether to scrap use of ‘Korean age’',
 '[Video] Web of intrigue in assassination',
 'FM nominee’s silver hair draws fans',
 "North Korea is a bad trip if you're looking to get high",
 '[Weekender] Closer to the people',
 '[Weekender] Political satire under threat',
 'Feminism debate roils progressive party',
 '[Newsmaker] Lee Jung-mi: From

In [18]:
# Topic 9 — labelled "Nuclear".
# NOTE(review): the titles returned are about the Park impeachment scandal,
# so this label looks stale — confirm and relabel.
most_related_docs(9, num_docs=20)['title'].tolist()

['Park quietly marks birthday amid probe, impeachment trial',
 '[News Analysis] Court centers on Park’s assistance of Choi',
 'Parliament seeks to question key figures in scandal at detention center',
 'Parliament fails to question key figures in scandal at detention center',
 'Lawmakers grill woman at heart of scandal at detention center',
 '[From the scene] Rallies sharpen their rhetoric as Park trial nears end',
 "Independent counsel creates special team to check Choi's wealth",
 '[Park ousted]  Park’s lawyers condemn ruling as ‘biased’',
 '[Newsmaker] Chief Justice steps down with ‘heavy heart’',
 "[Newsmaker] Is Constitutional Court stacked in Park's favor?",
 "NK says Park's ouster was 'judgment by history'",
 '[Focus] Constitutional Court stacked in favor of Park?',
 "Presidential office calls for 'calm' amid snowballing scandal",
 'Angry Park supporters call for repeal of impeachment',
 "S. Korea urges NK to stop meddling in Seoul's state affairs",
 '3rd protester dies as S. Ko

In [19]:
# Topic 7 again — labelled "Sex violence" here.
# NOTE(review): identical call to the earlier cell labelled "Security";
# keep one cell and one label.
most_related_docs(7, num_docs=20)['title'].tolist()

['S. Korea repeals anti-cheating law',
 'Expats decry military for gay sex conviction',
 'Korean military court convicts soldier over gay sex',
 '[Herald Interview] The price of faith for conscientious objectors',
 '[Election 2017] Gender biased language backfires on campaign trail',
 'Most N. Koreans criticize government in private: CSIS survey',
 'N. Korea strengthens penalty for consuming S. Korean pop culture',
 'Female NK soldiers exposed to rape, abuse',
 'Drug trafficking hits major cities in North Korea: report',
 'North Korea gets 1 day off for Chuseok',
 'N. Korea uses women on TV to push forward anti-smoking campaign',
 'Public divided on whether to scrap use of ‘Korean age’',
 '[Video] Web of intrigue in assassination',
 'FM nominee’s silver hair draws fans',
 "North Korea is a bad trip if you're looking to get high",
 '[Weekender] Closer to the people',
 '[Weekender] Political satire under threat',
 'Feminism debate roils progressive party',
 '[Newsmaker] Lee Jung-mi: From

In [20]:
# Topic 12 — Olympic Games (PyeongChang 2018, joint Korean team, IOC)
most_related_docs(12, num_docs=20)['title'].tolist()

["PyeongChang 'ready to welcome the world' at next Winter Olympics: IOC",
 'N. Korean IOC member keeps mum on Olympic co-hosting with S. Korea',
 'Moon says sports can create peace, invites N. Korea to PyeongChang Olympics',
 "'NK submitted document for PyeongChang Paralympics participation'",
 'North Korean IOC member says joint Korean team at PyeongChang 2018 may be difficult',
 'S. Korean lawmaker says time running out to form joint Korean squad for PyeongChang 2018',
 'Park inspects preparations for Summer Universiade',
 'Foreigners briefed on anti-terrorism measures for PyeongChang',
 '[PyeongChang 2018] IOC chief apparently seeks to visit NK over PyeongChang Olympics: sources',
 "S. Korea expects NK to accept Moon's offer for joint team for PyeongChang Olympics",
 "Unification ministry OKs N. Korean taekwondo demo team's visit to country",
 'N. Korean taekwondo athletes invited to Seoul',
 'PM Lee asks US Olympic chief for help in getting Trump to visit PyeongChang',
 'Nearly hal

In [21]:
# Topic 41 — labelled "Weapon development"; the top titles are mostly about the
# USFK anthrax shipment and biological-weapons concerns
most_related_docs(41, num_docs=20)['title'].tolist()

['Civic groups demand U.S. apology for anthrax delivery',
 'USFK vows transparency in bio defense training in S. Korea',
 "U.S. calls for enhancing defense against 'very real' biological weapons threats from N. Korea",
 '‘USFK conducted 16 covert anthrax tests since 2009’',
 'S. Korea, U.S. agree to on-site probe of Osan base next week over anthrax shipment',
 "U.S. calls shipment of live anthrax sample 'inexcusable mistake'",
 "North Korea's pesticide institute capable of producing anthrax: U.S. expert",
 '2 S. Koreans indicted for protesting near U.S. Embassy',
 'U.S. institute claims discovery of secret N. Korean uranium facility',
 'NK renames agency handling weapons development: source',
 "Int'l study finds vast magma storage beneath Mt. Paektu",
 'Korea, U.S. form panel to investigate anthrax delivery',
 'Park says innovation is at the heart of survival strategy',
 'S. Korea, U.S. seek to enhance cybersecurity cooperation',
 'Presidential office admits importing anthrax vaccines,

In [22]:
# Topic 46 — disease and public health (MERS, malaria, TB)
most_related_docs(46, num_docs=20)['title'].tolist()

['N. Korea claims to have developed panacea for MERS',
 'North Korea ends preventive steps against MERS virus',
 'NK soldier suffering from pneumonia and blood poisoning: report',
 'Number of malaria patients has dropped steadily in recent years: data',
 "Gov't to allow blood drives in malaria-prone areas to cope with low reserves",
 'South Korea steps up fight against TB',
 "N. Korea' 'free' medical service in deplorable state: sources",
 'N. Korea ramping up efforts to prevent MERS outbreak',
 'S. Korea to offer MERS detection devices to N. Korea',
 'S. Korea on highest alert for foot-and-mouth outbreak',
 'Ebola medics recount mission on edge of death',
 'Seoul to provide quarantine gear to North Korea',
 'No. of malaria-infected N. Koreans drops for 4th consecutive yr in 2016: report',
 'N. Korea demands action against possible MERS cases in Kaesong zone',
 'S. Korea to offer MERS detection devices to N. Korea next week',
 '[Herald Interview] Agency calls for health care support fo