In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
import random
import unicodedata
from string import punctuation
from string import digits
from nltk.stem import WordNetLemmatizer
import joblib

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score
from sklearn.decomposition import LatentDirichletAllocation

from tqdm import tqdm

import spacy
from spacy.matcher import Matcher

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Modern\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Modern\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
pd.set_option('max_colwidth',200)
### Reading data
data_dir = 'data/'
filename_prefix = 'koreaherald_1517_'
df = []

for i in range(8):
    df.append(pd.read_json(os.path.join(data_dir, filename_prefix + str(i) + '.json')))
df = pd.concat(df)
df.reset_index(inplace=True)
df = df.rename(columns=dict(zip(df.columns,[df.columns[i].strip() for i in range(len(df.columns))])))
df.drop('index', inplace=True, axis=1)

df['agg_title_body'] = df['title'] + df['body']

df.shape

(23769, 7)

In [3]:
nlp = spacy.load('en_core_web_lg')

In [4]:
### Lemmatization tool
stemmer = WordNetLemmatizer()
### Change similar words to the same word
UN_WORD = "The United Nations"
US_WORD = "The United States"
NK_WORD = "North Korea"
SK_WORD = "South Korea"

similar_words = {
    # Change to "The United States"
    "U.S.": US_WORD,
    "US": US_WORD,
    "USA": US_WORD,
    "United States": US_WORD,
    "United States'": US_WORD,
    "The United States'": US_WORD,
    
    # Change to "North Korea"
    "NK": NK_WORD,
    "NK's": NK_WORD,
    "N. Korea": NK_WORD,
    "N. Korea's": NK_WORD,
    "North Korea's": NK_WORD,
    
    # Change to "South Korea"
    "SK": SK_WORD,
    "SK's": SK_WORD,
    "S. Korea": SK_WORD,
    "S. Korea's": SK_WORD,
    "South Korea's": SK_WORD,
    
    # Change to "The United Nations"
    "United Nations": UN_WORD,
    "United Nations'": UN_WORD,
    "The United Nations'": UN_WORD,
    "UN": UN_WORD,
}

### Transform function
def text_cleaning(s: str):
        
    def replace_strange_char(s: str):
        non_en_chars = {
            "’": "'",
            "‘": "'"
        }

        def remove_non_en_chars(txt):
            # remove non english characters
            txt = convert_latin_chars(txt)
            for char in non_en_chars.keys():
                txt = re.sub(char, non_en_chars[char], txt)
            txt = re.sub(r'[^\x00-\x7F]+', ' ', txt)
            return txt

        def convert_latin_chars(txt):
            # convert latin characters
            return ''.join(char for char in unicodedata.normalize('NFKD', txt) if unicodedata.category(char) != 'Mn')

        s = remove_non_en_chars(s)
        s = convert_latin_chars(s)
        return s
    s = replace_strange_char(s)
    for key,value in similar_words.items():
        s = re.sub(key, value, s)
    return s

def spacy_tokenizer(s: str):
    # Change similar terms to the same term
    new_str = text_cleaning(s)
    doc = nlp(new_str)
    # Group tokens
    matcher = Matcher(nlp.vocab)
    token_groupup_pattern = [
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "nations"}],
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "states"}],
        [{"LOWER": "north"}, {"LOWER": "korea"}],
        [{"LOWER": "south"}, {"LOWER": "korea"}],
    ]
    matcher.add("TermGroup",token_groupup_pattern)
    matches = matcher(doc)
    merge_doc = []
    for nid, start, end in matches:
        merge_doc.append((start,end))
    with doc.retokenize() as retokenizer:
        for i in range(len(merge_doc)-1,-1,-1):
            retokenizer.merge(doc[merge_doc[i][0]:merge_doc[i][1]])
        
    # Remove all stopword, punctuation, number
    tokens = [ token.lemma_.lower() for token in doc \
              if not token.is_stop and not token.is_punct and not token.like_num and token.lemma_.strip()!= '']
    return tokens


In [5]:
def print_top_words(model, vectorizer, n_top_words):
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        message = "\nTopic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [6]:
def most_related_docs(df, doc_topic_dist, topic_index, num_docs=5):
    sorted_doc = doc_topic_dist.sort_values(by=['{}'.format(topic_index)], ascending=False)
    return df.iloc[sorted_doc[:num_docs].index]

In [7]:
df_nk = df[df['section'] == 'North Korea']

In [8]:
# # Find all doc contains "North Korea"
# nk_vectorizer = CountVectorizer(tokenizer=spacy_tokenizer)
# nk_data_vectorized = nk_vectorizer.fit_transform(df_nk['agg_title_body'])
# joblib.dump(nk_vectorizer, 'data/nk_vectorizer.csv')
# joblib.dump(nk_data_vectorized, 'data/nk_data_vectorized.csv')

In [9]:
nk_vectorizer = joblib.load('data/nk_vectorizer.csv')
nk_data_vectorized = joblib.load('data/nk_data_vectorized.csv')

In [10]:
nk_lda_components = 50
# nk_lda = LatentDirichletAllocation(n_components=nk_lda_components, random_state=0)
# nk_lda.fit(nk_data_vectorized)
# joblib.dump(nk_lda, 'data/nk_{}lda.csv'.format(nk_lda_components))

In [11]:
nk_lda = joblib.load('data/nk_{}lda.csv'.format(nk_lda_components))

In [12]:
print_top_words(nk_lda, nk_vectorizer, n_top_words=25)


Topic #0: north korean defector south say ministry south korea koreans seoul year unification restaurant government country percent defect number official korea worker north korea accord defection minister come

Topic #1: military korean say north south korea drill korea south north korea exercise joint seoul defense official provocation war attack ministry pyongyang u.s nuclear plan tension strike carry

Topic #2: ` north korea talk nuclear say hwang envoy launch hold korean party north pyongyang japan beijing rocket china south korea u.s wu korea resume late kook counterpart

Topic #3: f-35 europe asylum request european b-1bs vigilant practice statement.the interoperability ace refuge statistic contribute yonhap)among grant herim@heraldcorp.com rim sweden rejected.by dropsthe strict luxembourg eurostat bulgaria

Topic #4: russia russian moscow kim visit north korea world attend putin say ceremony leader diplomatic tie president invite war invitation china youth foreign vladimir kor

In [13]:
# nk_doc_topic_dist = pd.DataFrame(nk_lda.transform(nk_data_vectorized))
# nk_doc_topic_dist.to_csv('data/nk_doc_topic_dist.csv', index=False)

In [14]:
nk_doc_topic_dist = pd.read_csv('data/nk_doc_topic_dist.csv')

In [15]:
#Select 5 topics that no word missile included in top 10 words
def related_issue_event(num_event, unwanted_word, model, vectorizer, n_top_words):
    feature_names = vectorizer.get_feature_names()
    candidate_topic_idx = []
    for topic_idx, topic in enumerate(model.components_):
        if unwanted_word not in [ feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]:
            candidate_topic_idx.append(topic_idx)
    candidate_event_idx = []
    for candidate in candidate_topic_idx:
        candidate_event_idx.append(most_related_docs(df_nk, nk_doc_topic_dist, candidate, num_docs=1).index.tolist()[0])
        if len(candidate_event_idx) > num_event:
            break
    return candidate_event_idx

In [16]:
related_issue_index = related_issue_event(5, 'missile', nk_lda, nk_vectorizer, n_top_words=10)

In [17]:
# Extract entities from sentences
def extract_entity(sents, index, df):
    print("[ Detailed Information (per event) ]\n")
    
    ent_per = []
    ent_org = []
    ent_loc = []
    
    event = df.iloc[index].sort_values(by=['time'])['title'].tolist()
    
    for i in reversed(range(len(sents))):
        doc = nlp(sents[i])
        for ent in doc.ents:
            word = ent.text.title()
            if word in ent_per or word in ent_org or word in ent_loc:
                continue
            if ent.label_ == 'PERSON':
                ent_per.append(word)
            elif ent.label_ == 'ORG':
                ent_org.append(word)
            elif ent.label_ in ['GPE', 'LOC']:
                ent_loc.append(word)
        
        print("Event: ", event[i])
        print("- Person: ", ", ".join([i for i in ent_per]))
        print("- Organization: ", ", ".join([i for i in ent_org]))
        print("- Place: ", ", ".join([i for i in ent_loc]))
        print()

In [18]:
def print_events(df, index):
    event = df.iloc[index].sort_values(by=['time'])['title'].tolist()
    print("[ Related-Issue Events ]\n")
    print(", ".join([i for i in event]))
    print()

In [19]:
sents_nk = [ ' '.join(spacy_tokenizer(s)) for s in df.iloc[related_issue_index].sort_values(by=['time'])['body']]
print_events(df, related_issue_index)
extract_entity(sents_nk, related_issue_index, df)

[ Related-Issue Events ]


[ Detailed Information (per event) ]

Event:  Number of N. Koreans seeking asylum in Europe drops
- Person:  
- Organization:  Eu Statistic Agency, Asylum European Union, European Free Trade Association
- Place:  Europe, Yonhap)Among, Germany, Britain, Sweden, Luxembourg, Bulgaria, East Africa

- Person:  Kim Dong Chul
- Organization:  Eu Statistic Agency, Asylum European Union, European Free Trade Association
- Place:  Europe, Yonhap)Among, Germany, Britain, Sweden, Luxembourg, Bulgaria, East Africa, The United States, North Korea, Republic Korea, Pyongyang, Washington

Event:  S. Korea unveils measures to support NK defectors
- Person:  Kim Dong Chul, Kim Jong, Thae Yong Ho, North Embassy
- Organization:  Eu Statistic Agency, Asylum European Union, European Free Trade Association, Hanawon Education Program, Seoul Unification Ministry
- Place:  Europe, Yonhap)Among, Germany, Britain, Sweden, Luxembourg, Bulgaria, East Africa, The United States, North Korea, 