In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
import random
import unicodedata
from string import punctuation
from string import digits
from nltk.stem import WordNetLemmatizer
import joblib

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from tqdm import tqdm
import spacy
from spacy.matcher import Matcher

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Modern\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Modern\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
pd.set_option('display.max_colwidth', 2000)
### Reading data
data_dir = 'data/'
filename_prefix = 'koreaherald_1517_'
df = []

for i in range(8):
    df.append(pd.read_json(os.path.join(data_dir, filename_prefix + str(i) + '.json')))
df = pd.concat(df)
df.reset_index(inplace=True)
df = df.rename(columns=dict(zip(df.columns,[df.columns[i].strip() for i in range(len(df.columns))])))
df.drop('index', inplace=True, axis=1)
df['agg_title_body'] = df['title'] + df['body']

df.shape

(23769, 7)

In [3]:
nlp = spacy.load('en_core_web_lg')

In [4]:
### Lemmatization tool
stemmer = WordNetLemmatizer()
### Change similar words to the same word
UN_WORD = "The United Nations"
US_WORD = "The United States"
NK_WORD = "North Korea"
SK_WORD = "South Korea"
PARK_WORD = "Park Geun-hye"

similar_words = {
    # Change to "The United States"
    "U.S.": US_WORD,
    "US": US_WORD,
    "USA": US_WORD,
    "United States": US_WORD,
    "United States'": US_WORD,
    "The United States'": US_WORD,
    
    # Change to "North Korea"
    "NK": NK_WORD,
    "NK's": NK_WORD,
    "N. Korea": NK_WORD,
    "N. Korea's": NK_WORD,
    "North Korea's": NK_WORD,
    
    # Change to "South Korea"
    "SK": SK_WORD,
    "SK's": SK_WORD,
    "S. Korea": SK_WORD,
    "S. Korea's": SK_WORD,
    "South Korea's": SK_WORD,
    
    # Change to "The United Nations"
    "United Nations": UN_WORD,
    "United Nations'": UN_WORD,
    "The United Nations'": UN_WORD,
    "UN": UN_WORD,
    
    # Change to "Park Geun-hye"
    "President Park Geun-hye's": PARK_WORD,
    "President Park Geun-Hye's": PARK_WORD,
    "President Park Geun Hye's": PARK_WORD,
    "President Park Geun-hye": PARK_WORD,
    "President Park Geun-Hye": PARK_WORD,
    "President Park Geun Hye": PARK_WORD,
    "President Park's": PARK_WORD,
    "President Park": PARK_WORD,
    "Park Geun-hye's": PARK_WORD,
    "Park Geun-Hye's": PARK_WORD,
    "Park Geun Hye's": PARK_WORD,
    "Park Geun-Hye": PARK_WORD,
    "Park Geun Hye": PARK_WORD,
}

### Transform function
def text_cleaning(s: str):
        
    def replace_strange_char(s: str):
        non_en_chars = {
            "’": "'",
            "‘": "'"
        }

        def remove_non_en_chars(txt):
            # remove non english characters
            txt = convert_latin_chars(txt)
            for char in non_en_chars.keys():
                txt = re.sub(char, non_en_chars[char], txt)
            txt = re.sub(r'[^\x00-\x7F]+', ' ', txt)
            return txt

        def convert_latin_chars(txt):
            # convert latin characters
            return ''.join(char for char in unicodedata.normalize('NFKD', txt) if unicodedata.category(char) != 'Mn')

        s = remove_non_en_chars(s)
        s = convert_latin_chars(s)
        return s
    s = replace_strange_char(s)
    for key,value in similar_words.items():
        s = re.sub(key, value, s)
    return s

def spacy_tokenizer(s: str):
    # Change similar terms to the same term
    new_str = text_cleaning(s)
    doc = nlp(s)
    # Group tokens
    matcher = Matcher(nlp.vocab)
    token_groupup_pattern = [
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "nations"}],
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "states"}],
        [{"LOWER": "north"}, {"LOWER": "korea"}],
        [{"LOWER": "south"}, {"LOWER": "korea"}],
    ]
    matcher.add("TermGroup",token_groupup_pattern)
    matches = matcher(doc)
    merge_doc = []
    for nid, start, end in matches:
        merge_doc.append((start,end))
    with doc.retokenize() as retokenizer:
        for i in range(len(merge_doc)-1,-1,-1):
            retokenizer.merge(doc[merge_doc[i][0]:merge_doc[i][1]])
        
    # Remove all stopword, punctuation, number
    tokens = [ token.lemma_.lower() for token in doc \
              if not token.is_stop and not token.is_punct and not token.like_num and token.lemma_.strip()!= '']
    return tokens

In [5]:
def print_top_words(model, vectorizer, n_top_words):
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        message = "\nTopic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [6]:
def most_related_docs(df, doc_topic_dist, topic_index, num_docs=5):
    sorted_doc = doc_topic_dist.sort_values(by=['{}'.format(topic_index)], ascending=False)
    return df.iloc[sorted_doc[:num_docs].index]

In [7]:
park_str = "President Park Geun-hye's|President Park Geun-Hye's|President Park Geun Hye's|President Park Geun-hye|President Park Geun hye"
# df_park is extracted from df with president and park existing in doc's body
df_p = df[df['agg_title_body'].str.contains(park_str, flags=re.IGNORECASE, regex=True)]

In [8]:
# p_vectorizer = CountVectorizer(tokenizer=spacy_tokenizer)
# p_data_vectorized = p_vectorizer.fit_transform(df_p['agg_title_body'])
# joblib.dump(p_vectorizer, '../data/p_vectorizer.csv')
# joblib.dump(p_data_vectorized, '../data/p_data_vectorized.csv')

In [9]:
p_vectorizer = joblib.load('../data/p_vectorizer.csv')
p_data_vectorized = joblib.load('../data/p_data_vectorized.csv')

In [10]:
# p_lda = LatentDirichletAllocation(n_components=50, random_state=0)
# p_lda.fit(p_data_vectorized)
# joblib.dump(p_lda, '../data/p_lda.csv')

In [11]:
p_lda = joblib.load('../data/p_lda.csv')

In [12]:
def print_top_words(model, vectorizer, n_top_words):
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        message = "\nTopic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [13]:
print_top_words(p_lda, p_vectorizer, n_top_words=25)


Topic #0: singapore korea country energy lee year green people emission ambassador germany carbon turkey national malaysia minister service global funeral trade east world percent singaporean wind

Topic #1: trump say president alliance south korea korea korean donald house ally security state washington the united states strong u.s country obama secretary white park yonhap continue visit south

Topic #2: family reunion separate korean koreas hold cross member south north korea talk north red war say training issue inter deal separated force year park humanitarian side

Topic #3: u.n resolution english human sanction right ban north council russia north korea country language abuse international adopt member foreign russian korean koreans official work export u.s

Topic #4: prize nobel truck man park report jtbc kim murder president door seoul nephew police ji brother joo find scene winner time break main article drive

Topic #5: north korean say north korea military south kim south k

In [14]:
'''
LDA 50 topics
Topic 9: Iran
Topic 36: park impeachment
'''

'\nLDA 50 topics\nTopic 9: Iran\nTopic 36: park impeachment\n'

In [15]:
# p_doc_topic_dist = pd.DataFrame(p_lda.transform(p_data_vectorized))
# p_doc_topic_dist.to_csv('../data/p_doc_topic_dist.csv', index=False)

In [16]:
p_doc_topic_dist = pd.read_csv('../data/p_doc_topic_dist.csv')

In [38]:
# Park education
most_related_docs(df_p, p_doc_topic_dist, 1, num_docs=3)['title']

5842    State Department: US looks forward to continuing close cooperation with Korea's next president
9814                                         Trump to hold 'private meeting' with Abe: transition team
8535                                            Trump reiterates 'ironclad commitment' to defend Korea
Name: title, dtype: object

In [22]:
df_pp = df[(df['section'] == 'Politics') & (df['agg_title_body'].str.contains(park_str, flags=re.IGNORECASE, regex=True))]

In [23]:
# pp_vectorizer = CountVectorizer(tokenizer=spacy_tokenizer)
# pp_data_vectorized = pp_vectorizer.fit_transform(df_pp['agg_title_body'])
# joblib.dump(pp_vectorizer, '../data/pp_vectorizer.csv')
# joblib.dump(pp_data_vectorized, '../data/pp_data_vectorized.csv')

In [24]:
pp_vectorizer = joblib.load('../data/pp_vectorizer.csv')
pp_data_vectorized = joblib.load('../data/pp_data_vectorized.csv')

In [25]:
# pp_lda = LatentDirichletAllocation(n_components=50, random_state=0)
# pp_lda.fit(pp_data_vectorized)
# joblib.dump(pp_lda, '../data/pp_lda.csv')

In [26]:
pp_lda = joblib.load('../data/pp_lda.csv')

In [27]:
print_top_words(pp_lda, pp_vectorizer, n_top_words=25)


Topic #0: party saenuri rep park leader say lawmaker rule kim yoo opposition election new member political president floor committee conservative parliamentary people national leadership minjoo reform

Topic #1: park mer say virus outbreak respiratory mers disease east syndrome south korea health seoul spread center government contain effort hospital patient middle country meeting response public

Topic #2: lee minister npad hearing prime nominee opposition confirmation party lawmaker parliamentary saenuri session vote assembly national rep new main say report allegation floor alliance koo

Topic #3: park court president say impeachment presidential choi rally geun office hearing hye constitutional trial scandal hold justice seoul chief soon ruling official police dae sil

Topic #4: park president pardon year special say geun day hye ministry grant chung national public korea official presidential child liberation kim people business seoul political take

Topic #5: bill party saenuri 

In [29]:
# pp_doc_topic_dist = pd.DataFrame(pp_lda.transform(pp_data_vectorized))
# pp_doc_topic_dist.to_csv('../data/pp_doc_topic_dist.csv', index=False)

In [30]:
pp_doc_topic_dist = pd.read_csv('../data/pp_doc_topic_dist.csv')

In [39]:
# Dokdo
most_related_docs(df_pp, pp_doc_topic_dist, 30, num_docs=20)['title']

21518                                         Assembly resolution condemns Japan
5829                              [Newsmaker] Daunting tasks await new president
22602                     President battles public skepticism as 3rd year begins
21226                                      Park makes slow recovery from illness
16199      Park names North Korea expert as head of advisory body on unification
7533                             Park's lawyer says to accept impeachment ruling
9427                                     [Graphic News] Scandal probe in numbers
9725          Prosecution requests president answer questions related to scandal
16874                                   Park calls for economic contingency plan
14151                       Park offers condolences to Ecuador over deadly quake
19537                                     Park to take summer vacation next week
9403                               [Graphic News] Race for next presidency is on
9654     Park accepts justic

In [40]:
'''
Example of related-issue
8471:                                                  Park request 15 more witnase
15914                                       Park names new disease prevention chief
17845                                         Park, Moon clash over history textbooks
20506                                        Park tours hospital with MERS patients
22807                           Park to build ecosystem of cultural content industry
'''

'\n8471: Park request 15 more witnase\n15914                                       Park names new disease prevention chief\n17845                                         Park, Moon clash over history textbooks\n20506                                        Park tours hospital with MERS patients\n22807                           Park to build ecosystem of cultural content industry\n'

In [46]:
related_issue_index = [8471,15914,17845,20506,22807]

['Park to build ecosystem of cultural content industry',
 'Park tours hospital with MERS patients',
 'Park, Moon clash over history textbooks',
 'Park names new disease prevention chief',
 'Park requests 15 more witnesses at impeachment trial']

In [47]:
# Extract entities from sentences
def extract_entity(sents, index, df):
    print("[ Detailed Information (per event) ]\n")
    
    ent_per = []
    ent_org = []
    ent_loc = []
    
    event = df.iloc[index].sort_values(by=['time'])['title'].tolist()
    
    for i in reversed(range(len(sents))):
        doc = nlp(sents[i])
        for ent in doc.ents:
            word = ent.text.title()
            if word in ent_per or word in ent_org or word in ent_loc:
                continue
            if ent.label_ == 'PERSON':
                ent_per.append(word)
            elif ent.label_ == 'ORG':
                ent_org.append(word)
            elif ent.label_ in ['GPE', 'LOC']:
                ent_loc.append(word)
        
        print("Event: ", event[i])
        print("- Person: ", ", ".join([i for i in ent_per]))
        print("- Organization: ", ", ".join([i for i in ent_org]))
        print("- Place: ", ", ".join([i for i in ent_loc]))
        print()

In [54]:
def print_events(df, index):
    event = df.iloc[index].sort_values(by=['time'])['title'].tolist()
    print("[ Related-Issue Events ]\n")
    print(", ".join([i for i in event]))
    print()

In [55]:
sents_p = [ ' '.join(spacy_tokenizer(s)) for s in df.iloc[related_issue_index].sort_values(by=['time'])['body']]
print_events(df, related_issue_index)
extract_entity(sents_p, related_issue_index, df)

[ Related-Issue Events ]

Park to build ecosystem of cultural content industry, Park tours hospital with MERS patients, Park, Moon clash over history textbooks, Park names new disease prevention chief, Park requests 15 more witnesses at impeachment trial

[ Detailed Information (per event) ]

Event:  Park requests 15 more witnesses at impeachment trial
- Person:  Park Geun, Lee Joong Hwan, Lee, Lee Jung Mi, Han Chul
- Organization:  
- Place:  Geun

Event:  Park names new disease prevention chief
- Person:  Park Geun, Lee Joong Hwan, Lee, Lee Jung Mi, Han Chul, Jung Ki, Bae Hyun Jung
- Organization:  
- Place:  Geun

Event:  Park, Moon clash over history textbooks
- Person:  Park Geun, Lee Joong Hwan, Lee, Lee Jung Mi, Han Chul, Jung Ki, Bae Hyun Jung, Cheong Wa Dae, Npad, Moon Jae, Hye Greet, Kim Moo, Moon Jae Cheong Wa Dae, Yoo Chul Npad, Lee Jong Kul, Kim Moon, Kim, Kim Sung, Kim Support Park, Hwang Kyo Ahn South Korea, Hwang
- Organization:  Saenuri Party, China Vietnam New Zealand