In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
import random
import unicodedata
from string import punctuation
from string import digits
from nltk.stem import WordNetLemmatizer
import joblib

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from sklearn.decomposition import LatentDirichletAllocation

from tqdm import tqdm
import matplotlib.pyplot as plt

import spacy
from spacy.matcher import Matcher

from rake_nltk import Rake

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
### Reading data
data_dir = 'data/'
filename_prefix = 'koreaherald_1517_'

# Read the eight numbered JSON shards and stack them into a single frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index, so the
# per-shard row labels never show up as an extra column.
df = pd.concat(
    [pd.read_json(os.path.join(data_dir, f'{filename_prefix}{i}.json')) for i in range(8)],
    ignore_index=True,
)
# Strip stray leading/trailing whitespace from every column name.
df = df.rename(columns=lambda name: name.strip())

df.shape

(23769, 6)

In [3]:
# Shared spaCy pipeline; spacy_tokenizer below uses it both to parse text and
# to supply the vocabulary for its Matcher.
nlp = spacy.load('en_core_web_lg')

In [4]:
### Lemmatization tool
# NOTE(review): `stemmer` is not referenced by the tokenizer defined in this
# cell (which takes lemmas from spaCy) — confirm whether it is still needed.
stemmer = WordNetLemmatizer()
### Change similar words to the same word
UN_WORD = "The United Nations"
US_WORD = "The United States"
NK_WORD = "North Korea"
SK_WORD = "South Korea"

# Surface form -> canonical form. Keys are literal phrases, not regexes.
similar_words = {
    # Change to "The United States"
    "U.S.": US_WORD,
    "US": US_WORD,
    "USA": US_WORD,
    "United States": US_WORD,
    "United States'": US_WORD,
    "The United States'": US_WORD,
    
    # Change to "North Korea"
    "NK": NK_WORD,
    "NK's": NK_WORD,
    "N. Korea": NK_WORD,
    "N. Korea's": NK_WORD,
    "North Korea's": NK_WORD,
    
    # Change to "South Korea"
    "SK": SK_WORD,
    "SK's": SK_WORD,
    "S. Korea": SK_WORD,
    "S. Korea's": SK_WORD,
    "South Korea's": SK_WORD,
    
    # Change to "The United Nations"
    "United Nations": UN_WORD,
    "United Nations'": UN_WORD,
    "The United Nations'": UN_WORD,
    "UN": UN_WORD,
}

# Typographic single quotes are mapped to the ASCII apostrophe.
_QUOTE_TABLE = str.maketrans({"’": "'", "‘": "'"})
# One or more consecutive characters outside the 7-bit ASCII range.
_NON_ASCII_RUN = re.compile(r'[^\x00-\x7F]+')


def _compile_similar_words(mapping):
    """Build a single regex that matches any key of ``mapping`` as a whole term.

    * Keys are escaped, so the dots in "U.S." or "N. Korea" are literal.
    * Longer keys come first, so "NK's" wins over "NK" and "USA" over "US".
    * Letter/digit look-arounds stop "UN" matching inside "UNESCO" or "US"
      inside "BUSAN".
    * Group 1 optionally captures a preceding "The "/"the " (with its
      whitespace) so the caller can avoid emitting "The The United States".
    * Group 2 is the matched key.
    """
    alternatives = "|".join(
        re.escape(key) for key in sorted(mapping, key=len, reverse=True)
    )
    return re.compile(
        r"(?<![A-Za-z0-9])"
        r"([Tt]he\s+)?"
        r"(" + alternatives + r")"
        r"(?![A-Za-z0-9])"
    )


# Compiled once from `similar_words`; re-run this cell after editing the dict
# so the pattern and the mapping stay in sync.
_SIMILAR_WORDS_RE = _compile_similar_words(similar_words)


def _to_ascii(text):
    """Fold accents, straighten curly apostrophes, blank out other non-ASCII."""
    # NFKD splits accented letters into base letter + combining mark ('Mn');
    # dropping the marks keeps the plain base letter.
    folded = ''.join(
        ch for ch in unicodedata.normalize('NFKD', text)
        if unicodedata.category(ch) != 'Mn'
    )
    return _NON_ASCII_RUN.sub(' ', folded.translate(_QUOTE_TABLE))


def _canonical_term(match):
    """Replacement callback for ``_SIMILAR_WORDS_RE``."""
    article, surface = match.group(1), match.group(2)
    canonical = similar_words[surface]
    # When the canonical form already starts with "The", swallow the article
    # found in the text instead of duplicating it.
    if article is None or canonical.startswith("The "):
        return canonical
    return article + canonical


### Transform function
def text_cleaning(s: str) -> str:
    """Normalise raw article text before tokenisation.

    Steps:
      1. Strip diacritics, convert curly single quotes to ``'`` and replace
         every remaining run of non-ASCII characters with a single space.
      2. Rewrite every variant listed in ``similar_words`` to its canonical
         form in a single pass, so text produced by one replacement is never
         matched again by another key.

    Args:
        s: Raw text.

    Returns:
        The cleaned, ASCII-only text.
    """
    return _SIMILAR_WORDS_RE.sub(_canonical_term, _to_ascii(s))

def spacy_tokenizer(s: str):
    """Tokenise one document into lower-cased lemmas.

    The text is first normalised with ``text_cleaning``, then parsed with the
    shared ``nlp`` pipeline. The multi-word names "the united nations",
    "the united states", "north korea" and "south korea" (matched
    case-insensitively) are merged into single tokens. Stop words,
    punctuation, number-like tokens and tokens whose lemma is blank are
    dropped.

    Args:
        s: Raw document text.

    Returns:
        A list of lower-cased lemma strings.
    """
    # Change similar terms to the same term
    cleaned = text_cleaning(s)
    # Bug fix: parse the cleaned text. The raw `s` was parsed before, so the
    # result of text_cleaning was silently discarded.
    # NOTE(review): artefacts cached from the old behaviour (vectorizer,
    # document-term matrix, LDA model) need regenerating to reflect this.
    doc = nlp(cleaned)

    # Group tokens
    matcher = Matcher(nlp.vocab)
    token_groupup_pattern = [
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "nations"}],
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "states"}],
        [{"LOWER": "north"}, {"LOWER": "korea"}],
        [{"LOWER": "south"}, {"LOWER": "korea"}],
    ]
    matcher.add("TermGroup", token_groupup_pattern)

    # All merges are queued inside the context manager and applied together
    # when it exits, so iteration order does not matter.
    with doc.retokenize() as retokenizer:
        for _match_id, start, end in matcher(doc):
            retokenizer.merge(doc[start:end])

    # Remove all stopword, punctuation, number
    return [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and not token.like_num
        and token.lemma_.strip() != ''
    ]

In [5]:
# Load the df_park table; its 'title' column is displayed by the topic
# inspection cells further down.
# NOTE(review): this path starts with '../data/' while the JSON shards above
# are read from 'data/' — confirm the intended working directory.
df_park = pd.read_csv('../data/df_park.csv')

In [6]:
# pp_vectorizer = CountVectorizer(tokenizer=spacy_tokenizer)
# pp_data_vectorized = pp_vectorizer.fit_transform(df_park['body'])
# joblib.dump(pp_vectorizer, '../data/pp_vectorizer.csv')
# joblib.dump(pp_data_vectorized, '../data/pp_data_vectorized.csv')

In [7]:
# Load the cached vectorizer and vectorized data written by the (commented-out)
# joblib.dump calls in the previous cell.
# NOTE(review): despite the '.csv' suffix these files are joblib dumps, not CSV.
# NOTE(review): joblib.load unpickles its input — only load files you trust.
# Presumably the pickled vectorizer refers to spacy_tokenizer by name, so that
# function must already be defined in this kernel — confirm.
pp_vectorizer = joblib.load('../data/pp_vectorizer.csv')
pp_data_vectorized = joblib.load('../data/pp_data_vectorized.csv')

In [8]:
# lda = LatentDirichletAllocation(n_components=50, random_state=0)
# lda.fit(pp_data_vectorized)
# joblib.dump(lda, '../data/pp_lda.csv')

In [9]:
# Load the LDA model cached by the (commented-out) fitting cell above, which
# uses n_components=50 and random_state=0.
# NOTE(review): joblib dump despite the '.csv' suffix; joblib.load unpickles
# its input, so only load trusted files.
lda = joblib.load('../data/pp_lda.csv')

In [10]:
def print_top_words(model, vectorizer, n_top_words):
    """Print the highest-weighted vocabulary terms of every topic.

    Args:
        model: Fitted topic model exposing ``components_``, one row of term
            weights per topic.
        vectorizer: Fitted vectorizer that produced the model's input. Its
            vocabulary is read with ``get_feature_names_out()`` when that
            method exists, otherwise with the legacy ``get_feature_names()``
            (removed from scikit-learn in 1.2).
        n_top_words: Number of terms to print per topic.

    Returns:
        None. One line is printed per topic (preceded by a blank line),
        followed by a final blank line.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk backwards from the end to take the
        # n_top_words largest weights in descending order.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        message = "\nTopic #%d: " % topic_idx
        message += " ".join(feature_names[i] for i in top_indices)
        print(message)
    print()

In [11]:
# Print the 25 highest-weighted terms of every LDA topic.
print_top_words(lda, pp_vectorizer, n_top_words=25)


Topic #0: iran tehran iranian sanction rouhani hassan islamic business mous middle follow visit lifting $ large bilateral infrastructure lift historic trip nuclear international ministry tap mou

Topic #1: park korean south korea system deployment south missile say thaad defense government plan mexico terminal area hye altitude yonhap geun battery country president people korea high

Topic #2: choi park president say scandal foundation soon presidential chung sil allegation state suspect influence daughter company sport office affair university k local close official jeong

Topic #3: hwang minister prime president act kyo ahn say meeting government cabinet day national corruption yonhap public hye geun state park people take justice new affair

Topic #4: car park cable mountain civil government pension project city local service route say visitor group national call loom county year environment pay yangyang habitat animal

Topic #5: earthquake say pyeongchang winter safety quake olymp



In [12]:
'''
LDA 50 topics
Topic 0: Iran
Topic 1: development
Topic 2: ?
Topic 3: ?
Topic 4: city?
Topic 5: disaster
Topic 6: -
Topic 7: Between country
Topic 8: 
Topic 9: education
Topic 10: economic
Topic 11: residence
Topic 12: sewol
Topic 16: president
Topic 19: MERS
Topic 21: president park
Topic 26: Party
Topic 27: nuclear
Topic 30: Electronics
Topic 34: Unification
Topic 36: mongolia
Topic 48: North Korea
'''

'\nLDA 50 topics\nTopic 0: Tragedy\nTopic 1: Soldier\nTopic 2: nuclear / missile\nTopic 3: ?\nTopic 4: ?\nTopic 5: war \nTopic 6: Travel?\nTopic 7: Sex violence\nTopic 8: ?\nTopic 9: ?\nTopic 10: ?\nTopic 11: ?\nTopic 12: Olympics game\nTopic 13: ?\nTopic 14: Army\nTopic 15: Dokdo\nTopic 16: \nTopic 17: Moon and Park\nTopic 33: missile nuclear\nTopic 34: airplane\nTopic 35: President\nTopic 36: misile nuclear\nTopic 41: science technology\nTopic 46: disease\nTopic 47: blood donate\nTopic 48: Japanese and sex victim\nTopic 49: Worker, job market\n'

In [13]:
# pp_doc_topic_dist = pd.DataFrame(lda.transform(pp_data_vectorized))
# pp_doc_topic_dist.to_csv('../data/pp_doc_topic_dist.csv', index=False)

In [14]:
pp_doc_topic_dist = pd.read_csv('../data/pp_doc_topic_dist.csv')

In [16]:
def most_related_docs(topic_index, num_docs=5):
    """Return the ``df_park`` rows that rank highest for one topic.

    ``pp_doc_topic_dist`` is sorted in descending order of the column named
    after ``topic_index``; the index labels of the first ``num_docs`` rows are
    then used as positions into ``df_park``.

    NOTE(review): assumes row i of ``pp_doc_topic_dist`` describes row i of
    ``df_park`` — confirm both were produced from the same row order.

    Args:
        topic_index: Topic number; formatted to a string to select the column.
        num_docs: How many top-ranked documents to return.

    Returns:
        The selected rows of ``df_park``, best match first.
    """
    topic_column = '{}'.format(topic_index)
    ranked = pp_doc_topic_dist.sort_values(by=[topic_column], ascending=False)
    top_positions = ranked.index[:num_docs]
    return df_park.iloc[top_positions]

In [23]:
# Iran — titles of the 20 documents ranked highest on topic column 0
most_related_docs(0, num_docs=20)['title'].tolist()

['Park to make historic visit to Iran next month',
 'Korea, Mongolia agree to seek increase in flights, bolster development cooperation',
 'Park to meet with Rouhani, supreme leader',
 'Seoul, Iran officials to meet over economy, Pyongyang',
 'Park mulling visiting Iran',
 'Park leaves for Iran for talks with Rouhani',
 'Korea vows to support Iran in development of capital markets',
 'President Park to depart for Iran to seek opportunities',
 'Park arrives in Korea after groundbreaking trip to Iran',
 'Park vows efforts to narrow global education gap',
 'Korea-Iran summit paves way for $45.6b business deals',
 'In diplomatic flurry, Park considers Iran visit',
 'Ambassador says Korea revs up efforts to restore ties with Iran',
 'In diplomatic flurry, Park mulls Iran visit',
 'Korea, Iran to boost IT, science cooperation',
 'Park’s visit to Iran landmark in bilateral relations',
 "Park: S. Korea, Iran can work together toward N. Korea's denuclearization",
 'Korea, Iran to discuss launch

In [22]:
# Development — titles of the 20 documents ranked highest on topic column 1
most_related_docs(1, num_docs=20)['title'].tolist()

['Park phones chief of S. Korean research station in Antarctica',
 'Seoul pushing to convey DMZ park plan to North',
 'Seoul pushing to convey DMZ park plan to N. Korea',
 'Park in Brazil on last leg of S. America tour',
 'Park to meet ruling party lawmakers for talks over THAAD',
 'Residents of Gimcheon protest THAAD deployment decision',
 'Park pays tribute to Mexican heroes ahead of talks with Pena Nieto',
 'Park renews calls for peaceful unification with North Korea',
 'Park calls for cooperation with China-led AIIB',
 'Opposition leader demands parliamentary approval of THAAD battery',
 'Park renews call for national unity in face of N.K. security threats',
 "Mongolian version of Park's autobiography republished",
 'S. Korean election result unlikely to affect THAAD talks: U.S. official',
 'Park leaves for home after four-nation tour of S. America',
 'Seoul should suspend THAAD until Beijing halts retaliation: scholar',
 'Park steps up efforts to curb THAAD criticism',
 '[JEJU FOR

In [21]:
# MERS — titles of the 20 documents ranked highest on topic column 19
most_related_docs(19, num_docs=20)['title'].tolist()

['Overseas voters required to sign up for May 9 election in a week',
 'After MERS, Korea to beef up health expertise',
 'Ministry, Samsung Medical Center playing blame game on MERS outbreak',
 "Election watchdog asks Facebook to help prevent 'fake' news",
 'Final turnout to reach 80%: election watchdog',
 'MERS outbreak ‘practically over’',
 '[Newsmaker] Korea’s MERS concerns more about politics than health',
 '[Weekender] 4,000 kilometers of business and pleasure',
 'Park invites foreign medical experts after MERS outbreak',
 'Anti-bandwagon election rule leaves early voters exposed',
 'Parliament approves health minister nominee',
 'Park tours hospital with MERS patients',
 'Park meets WHO chief over MERS outbreak',
 'Park meets MERS task force officials',
 'Quarantine team sent to Samsung hospital',
 'Park says utmost efforts under way to contain MERS',
 'Korea aims to draw 400,000 foreign medical patients',
 'Record overseas voters register for presidential election',
 'Park to con

In [24]:
# Disaster — titles of the 20 documents ranked highest on topic column 5
most_related_docs(5, num_docs=20)['title'].tolist() 

['Govt designates quake-hit Gyeongju as special disaster zone',
 'Aftershocks expected to continue: KMA',
 'Switzerland marks national day with classical music',
 'Park calls for review to designate quake-hit Gyeongju as special disaster zone',
 'THAAD row not affecting PyeongChang-Beijing Winter Games',
 'Quake triggers safety concerns, halts facilities',
 'Park embraces Muslim culture to befriend Iran',
 "Samsung's leadership vaccum won't affect PyeongChang's Olympic preps: top organizer",
 'Colombia celebrates peace and prosperity at independence anniversary',
 'Park calls for thorough preparation for 2018 Winter Games',
 "Preparations for PyeongChang Olympics will go on despite Park's impeachment: organizers",
 'Magnitude-5.8 earthquake jolts southeastern S. Korea',
 'Acting President Hwang vows to put top priority on public safety',
 'Colombia, Korea boost cooperation for FTA',
 "Parties slam government's earthquake countermeasures",
 'Park sends condolences for Nepal quake deaths

In [25]:
# Sewol — titles of the 20 documents ranked highest on topic column 12
most_related_docs(12, num_docs=20)['title'].tolist()

['[From the Scene] Koreans pray for recovery of bodies',
 'Sewol recovery leaves unique trail for Chinese veteran diver',
 'SNUH accused of delaying analysis of farmer’s death',
 'Activist farmer died due to external cause: SNHU',
 '[From the Scene] 3 years on, still no closure on Sewol tragedy',
 'War hero at the heart of Korea-U.S. alliance',
 'Park visits hospital to offer condolences for death of former president',
 'Police chief makes belated apology over protestor’s death',
 'Park expresses deep condolences over Florida shooting victims',
 'Park vows to consider recovery of sunken ferry',
 'Foreign correspondents say Park should address national divide',
 'Park condemns truck attack in southern France',
 'How the ferry Sewol sank and what it means',
 'Cabinet passes ordinance to probe ferry sinking',
 '[From the scene] Ordinary Koreans stage sit-in to protect deceased farmer’s body',
 'Part-time teachers to have Sewol deaths recognized',
 'Possible remains of Sewol victims found'

In [27]:
# Relations between countries — titles of the 20 documents ranked highest on topic column 7
most_related_docs(7, num_docs=20)['title'].tolist()

['Bulgaria, Korea celebrate future-oriented partnership',
 'Bulgaria upgrades partnership with Korea',
 'Turkey marks republican anniversary',
 'Serbia hails Korean investors in silver jubilee celebration',
 'Peru celebrates strategic alliance with Korea',
 'Egypt fetes 20 years of ties with Korea',
 '‘Kazakhstan wants Korea at Astana Expo 2017’',
 'Bulgaria, hotbed of ICT collaboration',
 'Russia, Korea mark growing ties at silver jubilee',
 '[Weekender] Reaching for the Eurasian dream',
 'Uzbekistan, Korea upgrade economic partnership',
 'Italian ambassador hosts last reception',
 '‘Dolphin diplomacy’ features in Seoul Defense Dialogue',
 'Experts talk over Tumen initiative for regional prosperity',
 'Park to meet Bulgarian president next week',
 'Colombian independence reception teems with festive music, dance',
 '72 pct of S. Koreans say Russia helpful for national security: poll',
 'Turkey marks independence, alliance with Korea',
 'Poland awaits investment as Europe’s transport h

In [28]:
# Unification — titles of the 20 documents ranked highest on topic column 34
most_related_docs(34, num_docs=20)['title'].tolist()

['Red Cross to confirm surviving members of separated families',
 'Park calls for end of inter-Korean division',
 'S. Korean separated families gather ahead of reunions with N.K. relatives',
 'Koreas exchange rough lists of candidates for family reunions',
 'Koreas to exchange final lists of candidates for family reunions',
 'Koreas locked in talks on reuniting separated families',
 '‘Eurasia Express’ launched to boost regional ties',
 'Koreas agree to hold family reunions in late Oct.',
 'Remains of ex-commandos from 684 unit enshrined',
 "U.S. praises S. Korea's efforts to improve inter-Korean ties after family reunion agreement",
 'FM Yun to visit Poland, Germany on Eurasia Express program',
 'Hope grows for separated family reunion',
 'S. Korea pushing to relink inter-Korean railway: minister',
 'Young N.K. defectors keen on unification',
 'Eurasia Express trains set for 14,400-km journey',
 'Eurasia Express train ends journey in Berlin',
 'Two Koreas hold talks on family reunions'

In [29]:
# Relations with North Korea — titles of the 20 documents ranked highest on topic column 48
most_related_docs(48, num_docs=20)['title'].tolist()

['Korea OKs first civilian inter-Korean contact since Moon took office',
 "NK rejects S. Korean aid provider's inter-Korean exchanges, citing sanctions",
 "[Herald Interview] 'South’s humanitarian aid to North will move the world'",
 'S. Korea likely to resume supply of humanitarian aid to NK',
 'S. Korea approves $8m in aid to N. Korea via UN agencies',
 "S. Korea reviewing 19 civic groups' request for NK contact",
 'S. Korea to flexibly handle civilian inter-Korean exchanges',
 'New panel launched to review NK policies of preceding conservative govt.',
 'NK rejects S. Korean civilian offers to resume exchanges',
 'S. Korea mulling over $8m in aid to NK via UN agencies',
 'Kaesong firms undecided on whether to seek visit to N. Korea this month',
 'Seoul to support N.K. doctors’ training in Germany',
 'Closure of Kaesong complex unilaterally ordered by ex-leader Park: panel',
 'S. Korea offers largest amount of cash, goods to NK in 2003-2008: data',
 "Korea checking report on Park's le