In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
import random
import unicodedata
from string import punctuation
from string import digits
from nltk.stem import WordNetLemmatizer
import joblib

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from sklearn.decomposition import LatentDirichletAllocation

from tqdm import tqdm
import matplotlib.pyplot as plt

import spacy
from spacy.matcher import Matcher

from rake_nltk import Rake

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
### Reading data
data_dir = 'data/'
filename_prefix = 'koreaherald_1517_'
df = []

for i in range(8):
    df.append(pd.read_json(os.path.join(data_dir, filename_prefix + str(i) + '.json')))
df = pd.concat(df)
df.reset_index(inplace=True)
df = df.rename(columns=dict(zip(df.columns,[df.columns[i].strip() for i in range(len(df.columns))])))
df.drop('index', inplace=True, axis=1)
df['agg_title_body'] = df['title'] + df['body']

df.shape

(23769, 7)

In [3]:
nlp = spacy.load('en_core_web_lg')

In [4]:
### Lemmatization tool
stemmer = WordNetLemmatizer()
### Change similar words to the same word
UN_WORD = "The United Nations"
US_WORD = "The United States"
NK_WORD = "North Korea"
SK_WORD = "South Korea"

similar_words = {
    # Change to "The United States"
    "U.S.": US_WORD,
    "US": US_WORD,
    "USA": US_WORD,
    "United States": US_WORD,
    "United States'": US_WORD,
    "The United States'": US_WORD,
    
    # Change to "North Korea"
    "NK": NK_WORD,
    "NK's": NK_WORD,
    "N. Korea": NK_WORD,
    "N. Korea's": NK_WORD,
    "North Korea's": NK_WORD,
    
    # Change to "South Korea"
    "SK": SK_WORD,
    "SK's": SK_WORD,
    "S. Korea": SK_WORD,
    "S. Korea's": SK_WORD,
    "South Korea's": SK_WORD,
    
    # Change to "The United Nations"
    "United Nations": UN_WORD,
    "United Nations'": UN_WORD,
    "The United Nations'": UN_WORD,
    "UN": UN_WORD,
}

### Transform function
def text_cleaning(s: str):
        
    def replace_strange_char(s: str):
        non_en_chars = {
            "’": "'",
            "‘": "'"
        }

        def remove_non_en_chars(txt):
            # remove non english characters
            txt = convert_latin_chars(txt)
            for char in non_en_chars.keys():
                txt = re.sub(char, non_en_chars[char], txt)
            txt = re.sub(r'[^\x00-\x7F]+', ' ', txt)
            return txt

        def convert_latin_chars(txt):
            # convert latin characters
            return ''.join(char for char in unicodedata.normalize('NFKD', txt) if unicodedata.category(char) != 'Mn')

        s = remove_non_en_chars(s)
        s = convert_latin_chars(s)
        return s
    s = replace_strange_char(s)
    for key,value in similar_words.items():
        s = re.sub(key, value, s)
    return s

def spacy_tokenizer(s: str):
    # Change similar terms to the same term
    new_str = text_cleaning(s)
    doc = nlp(s)
    # Group tokens
    matcher = Matcher(nlp.vocab)
    token_groupup_pattern = [
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "nations"}],
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "states"}],
        [{"LOWER": "north"}, {"LOWER": "korea"}],
        [{"LOWER": "south"}, {"LOWER": "korea"}],
    ]
    matcher.add("TermGroup",token_groupup_pattern)
    matches = matcher(doc)
    merge_doc = []
    for nid, start, end in matches:
        merge_doc.append((start,end))
    with doc.retokenize() as retokenizer:
        for i in range(len(merge_doc)-1,-1,-1):
            retokenizer.merge(doc[merge_doc[i][0]:merge_doc[i][1]])
        
    # Remove all stopword, punctuation, number
    tokens = [ token.lemma_.lower() for token in doc \
              if not token.is_stop and not token.is_punct and not token.like_num and token.lemma_.strip()!= '']
    return tokens

In [5]:
def print_top_words(model, vectorizer, n_top_words):
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        message = "\nTopic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [6]:
def most_related_docs(df, doc_topic_dist, topic_index, num_docs=5):
    sorted_doc = doc_topic_dist.sort_values(by=['{}'.format(topic_index)], ascending=False)
    return df.iloc[sorted_doc[:num_docs].index]

In [7]:
park_str = "President Park Geun-hye's|President Park Geun-Hye's|President Park Geun Hye's|President Park Geun-hye|President Park Geun hye"
# df_park is extracted from df with president and park existing in doc's body
df_p = df[df['agg_title_body'].str.contains(park_str, flags=re.IGNORECASE, regex=True)]

In [8]:
p_vectorizer = CountVectorizer(tokenizer=spacy_tokenizer)
p_data_vectorized = p_vectorizer.fit_transform(df_p['agg_title_body'])
joblib.dump(p_vectorizer, '../data/p_vectorizer.csv')
joblib.dump(p_data_vectorized, '../data/p_data_vectorized.csv')

['../data/p_data_vectorized.csv']

In [9]:
p_vectorizer = joblib.load('../data/p_vectorizer.csv')
p_data_vectorized = joblib.load('../data/p_data_vectorized.csv')

In [10]:
p_lda = LatentDirichletAllocation(n_components=50, random_state=0)
p_lda.fit(p_data_vectorized)
joblib.dump(p_lda, '../data/p_lda.csv')

NameError: name 'lda' is not defined

In [12]:
p_lda = joblib.load('../data/p_lda.csv')

In [13]:
def print_top_words(model, vectorizer, n_top_words):
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        message = "\nTopic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [28]:
print_top_words(p_lda, p_vectorizer, n_top_words=25)


Topic #0: say defense missile thaad seoul security south korea system deployment nuclear china u.s north korea government president north issue korean korea washington trump country threat military decision

Topic #1: say ministry government city airport korea aid korean group program seoul project assistance provide park humanitarian busan international support plan north official new $ local

Topic #2: water apartment general name rent ministry ihara junichi internet low housing shortage facebook rental iot plan northeast live safety device sunday aim ratio request construction

Topic #3: park choi president presidential scandal say prosecutor lee samsung charge prosecution investigation office arrest allegation suspect soon team sil group state woo geun court question

Topic #4: bill party assembly park national say president opposition parliamentary pass lawmaker government rule saenuri speaker law session parliament revision chung new year veto geun right

Topic #5: project russi



In [29]:
'''
LDA 50 topics
Topic 5: project russian
Topic 6: -
Topic 7: Heritage dokdo japan history
Topic 10: Culture
Topic 17: Japanese sex
Topic 18: Investigation
Topic 21: Nuclear
Topic 22: Twitter
Topic 31: woman university
Topic 32: sewol ferry
Topic 37: navy
Topic 38: politics
Topic 39,40: election
Topic 41: MERS
Topic 44: Innovation
'''

'\nLDA 50 topics\nTopic 0: Iran\nTopic 1: development\nTopic 2: ?\nTopic 3: ?\nTopic 4: city?\nTopic 5: disaster\nTopic 6: -\nTopic 7: Between country\nTopic 8: \nTopic 9: education\nTopic 10: economic\nTopic 11: residence\nTopic 12: sewol\nTopic 16: president\nTopic 19: MERS\nTopic 21: president park\nTopic 26: Party\nTopic 27: nuclear\nTopic 30: Electronics\nTopic 34: Unification\nTopic 36: mongolia\nTopic 48: North Korea\n'

In [30]:
p_doc_topic_dist = pd.DataFrame(p_lda.transform(p_data_vectorized))
p_doc_topic_dist.to_csv('../data/p_doc_topic_dist.csv', index=False)

In [31]:
p_doc_topic_dist = pd.read_csv('../data/p_doc_topic_dist.csv')

In [50]:
# Russian project
most_related_docs(df_p, p_doc_topic_dist, 5, num_docs=20)['title'].tolist()

['N. Korea improves coal shipment capacity at Rajin port',
 'Koreas, Russia start third test run for logistics project',
 '2 Koreas, Russia to carry out third run for pilot logistics project',
 'Containers carrying bottled water arrive in S. Korea via North Korean port',
 "S. Korea awaits N. Korea's nod for 3-way coal shipment",
 'Three foreigners to stand trial for illegal entry',
 'Vietnamese man detained for illegal entry',
 "[Newsmaker] Assembly begins 'resource diplomacy' probe",
 'W200b budget set for drought response',
 'Seoul to freeze trans-Korea project with Russia',
 'Prosecutors raid state oil firm over shady energy projects',
 'Cheong Wa Dae renews official webpage',
 "Hwang instructs gov't to carry out state affairs 'normally' amid opposition moves to restrain him",
 'Senior officials report huge wealth gains',
 'S. Korea mulling over how to name Moon‘s North Korea policy',
 'Obama in Korea to attend conference',
 "Botswana's president to visit S. Korea next week",
 'Nort

In [51]:
# Japanese and Korea
most_related_docs(df_p, p_doc_topic_dist, 7, num_docs=20)['title'].tolist()

['[Herald Interview] ‘Artists should be guaranteed unlimited freedom of expression’',
 '[Herald Interview] Knowledge key in Dokdo, comfort women issues',
 '[Lee Joo-hee] Let’s not sugarcoat the truth',
 "Park voices against Japan's world heritage bid",
 'CSIS chief: Troop presence in Korea serves US national interest',
 'FM Yun set for fence-mending trip to Japan',
 'FM Yun set for fence-mending trip to Japan',
 'Japanese books denying responsibility for wartime sexual slavery distributed to American scholars',
 'Japanese scholars demand U.S. textbook publisher correct sexual slavery descriptions',
 '[Herald Interview] ‘Korea sees new paradigm in democracy via nonviolent vigils’',
 'Park delivers pep talk to S. Korean troops in UAE',
 'Korea-China relations unlikely to suffer despite improvement in Seoul-Tokyo ties: U.S. expert',
 'FM Yun leaves for Japan to attend fence-mending talks',
 'Rep. Suh demands Tokyo uphold 1993 Kono apology',
 'Court backs state order to alter history textb

In [52]:
# Culture
most_related_docs(df_p, p_doc_topic_dist, 10, num_docs=20)['title'].tolist()

['France finishes celebrating 130th year of ties with Korea',
 'Park vows to boost IT-based cultural content industry',
 "Gov't to enact law to ensure rights of artists",
 'Park moves cultural diplomacy forward in France',
 'Park to build ecosystem of cultural content industry',
 'Park attends French food festival in Korea',
 "ASEM culture ministers' meeting kicks off in Gwangju",
 '‘Submarine crash may have caused Sewol sinking’',
 "'Cultural enrichment' key to Park's second-half agenda",
 "Park: 'Descendants of the Sun' exemplary case of cultural enhancement",
 'Business tycoons to be excluded from special pardons',
 "China's Xi vows to deepen tourism, cultural exchanges with S. Korea",
 'Xi vows to deepen tourism with S. Korea',
 'Prime Minister calls for more international cultural cooperation',
 'Ex-ambassador to France to be granted French order',
 'S. Korea aims to attract 8 million Chinese tourists this year',
 "Park calls for success in 'Korea Visit Years'",
 'Park attends eve

In [53]:
# Nuclear
most_related_docs(df_p, p_doc_topic_dist, 21, num_docs=20)['title'].tolist()

["Park calls North Korea's rocket launch 'outright disaster' for peace",
 'N.K. ups tension with short-range missiles',
 'Park renews resolve to make Pyongyang opt for denuclearization',
 'North Korea fans military tension',
 'N. Korea fans military tension',
 'N.K. ups tension with short-range missiles',
 'Park warns N. Korea could self-destruct unless it embraces change',
 'Park urges military readiness against N. Korean provocations',
 "Park: N. Korea's missile launch should never be condoned",
 'Park urges N.K. to ditch nukes',
 'N. Korea says it successfully conducted solid-fuel rocket test',
 "South Korea president urges 'strong' UN response to North rocket launch",
 'N.K. leader warns Seoul‘s gov’t organizations will be destroyed',
 'Park: South Korea detects signs of preparations for nuclear test in North Korea',
 "Park: North Korea shows signs of 'serious cracks'",
 "Park warns that N. Korea's provocations lead to self-destruction",
 'Park orders military to brace for possible

In [54]:
# Sewol
most_related_docs(df_p, p_doc_topic_dist, 32, num_docs=20)['title'].tolist()

['S. Korea to announce bid for recovery of sunken ferry this week',
 "Gov't concludes physical recovery of sunken ferry possible",
 '29 million tweets posted about Sewol ferry tragedy',
 'Sewol salvage work to start in September',
 'Sewol remembered nationwide',
 'After Sewol tragedy, doubts remain on safety overhaul',
 'Ruling party floor leader calls for recovery of sunken ferry',
 "[Reporter’s column] Government's half-baked plans on 'character building education' turn up more questions than answers",
 'Child care centers to go on strike over budget debacle',
 "Gov't to decide whether to recover Sewol this week",
 'Schools reopen after flawed MERS closures',
 '[Weekender] Criticism mounts over government’s fine dust countermeasure plan',
 'Cabinet passes ordinance to probe ferry sinking',
 '[Newsmaker] Salvaging ferry to be uphill battle',
 '‘Hanja’ education in elementary schools stirs dispute',
 'Korea to open six dammed reservoirs to improve inland water quality',
 'Park vows to 

In [55]:
# DISEASE
most_related_docs(df_p, p_doc_topic_dist, 41, num_docs=20)['title'].tolist()

['Aftershocks expected to continue: KMA',
 'Govt designates quake-hit Gyeongju as special disaster zone',
 'MERS-hit top hospital partially closed',
 'Ministers unveil health security packages',
 'Doubts grow over containment plans',
 'Park names new disease prevention chief',
 'U.S. reaffirms strong alliance with S. Korea despite bloody attack on ambassador',
 '[Newsmaker] Korea’s MERS concerns more about politics than health',
 'Park names doctor as head of disease control agency',
 'Korea identifies 24 MERS-affected hospitals amid growing concern over virus',
 'Quarantine team sent to Samsung hospital',
 'Nation goes all-out to contain MERS',
 'Seoul announces post-MERS plan for infectious diseases',
 'Park tours hospital with MERS patients',
 'After MERS, Korea to beef up health expertise',
 'S. Korea reports 5th death from MERS, 14 new cases',
 'Park meets MERS task force officials',
 'Park strongly denounces attack on U.S. envoy',
 'South Korea confirms 11th MERS death',
 'Envoy’

In [40]:
df_pp = df[(df['section'] == 'Politics') & (df['agg_title_body'].str.contains(park_str, flags=re.IGNORECASE, regex=True))]

In [42]:
pp_vectorizer = CountVectorizer(tokenizer=spacy_tokenizer)
pp_data_vectorized = pp_vectorizer.fit_transform(df_pp['agg_title_body'])
joblib.dump(pp_vectorizer, '../data/pp_vectorizer.csv')
joblib.dump(pp_data_vectorized, '../data/pp_data_vectorized.csv')

['../data/pp_data_vectorized.csv']

In [43]:
pp_vectorizer = joblib.load('../data/pp_vectorizer.csv')
pp_data_vectorized = joblib.load('../data/pp_data_vectorized.csv')

In [44]:
pp_lda = LatentDirichletAllocation(n_components=50, random_state=0)
pp_lda.fit(pp_data_vectorized)
joblib.dump(pp_lda, '../data/pp_lda.csv')

['../data/pp_lda.csv']

In [45]:
pp_lda = joblib.load('../data/pp_lda.csv')

In [56]:
print_top_words(pp_lda, pp_vectorizer, n_top_words=25)


Topic #0: sejong ultimate essential "structural reiterate 2002.she deregulation.park workers.critic respectively 2015.she inflexibility seoul.park marketpresident market.one 40,000.park fundamental march.the said.president 10the party.in korea.before salaried overprotect compile reinvigorate

Topic #1: north park unification north korea country south korea corruption official pyongyang say military yonhap call border year warn korean yonhap)president anti accuse uncertainty national step international luncheon

Topic #2: nis national say agency intelligence park president presidential assembly official office spy law hearing government time unification parliamentary hye geun opposition service administration wa dae

Topic #3: election trump day public report president say korea nec ship park face presidential bulgarian hour plevneliev call kim come lunney restore remain hero sentence fairly

Topic #4: sim anti public work corruption presidential female care party say medical korea pre

In [47]:
'''
LDA Park Politics 50 topics
Topic 1: unification
Topic 3: Trump
Topic 4: Corruption
Topic 5: education
Topic 6: election
Topic 7: election
Topic 17: nuclear security
Topic 18: labor
Topic 21: Strategy
Topic 41: Innovation
Topic 44: sexual victim
Topic 49: energy from middle east
'''

'\nLDA Park Politics 50 topics\nTopic 1: unification\nTopic 3: Trump\nTopic 4: Corruption\nTopic 5: education\nTopic 6: election\nTopic 7: election\nTopic 17: nuclear security\nTopic 18: labor\nTopic 21: Strategy\nTopic 41: Innovation\nTopic 44: sexual victim\nTopic 49: energy from middle east\n'

In [48]:
pp_doc_topic_dist = pd.DataFrame(pp_lda.transform(pp_data_vectorized))
pp_doc_topic_dist.to_csv('../data/pp_doc_topic_dist.csv', index=False)

In [57]:
# Unification
most_related_docs(df_pp, pp_doc_topic_dist, 1, num_docs=20)['title'].tolist()

KeyError: '1'

In [None]:
# Trump
most_related_docs(df_pp, pp_doc_topic_dist, 3, num_docs=20)['title'].tolist()

In [None]:
# Education
most_related_docs(df_pp, pp_doc_topic_dist, 5, num_docs=20)['title'].tolist()

In [None]:
# Innovation
ppp_most_related_docs(41, num_docs=20)['title'].tolist()