In [1]:
import os
import random
import re
import string
import unicodedata
from collections import Counter
from string import digits
from string import punctuation

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
from spacy.matcher import Matcher

# Corpora needed by the NLTK stopword list and the WordNet lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
### Reading data
data_dir = 'data/'
filename_prefix = 'koreaherald_1517_'

# The corpus is stored as eight JSON shards with suffixes 0..7.
shard_frames = [
    pd.read_json(os.path.join(data_dir, filename_prefix + str(shard) + '.json'))
    for shard in range(8)
]

# Stack the shards under a fresh 0..n-1 index and trim stray whitespace
# from the column names.
df = pd.concat(shard_frames, ignore_index=True).rename(columns=str.strip)

df.shape

(23769, 6)

In [3]:
# Load large spacy model
nlp = spacy.load('en_core_web_lg')

# Aggregate title and content: the title (followed by '. ') is repeated
# `title_weight` times in front of the body.
title_weight = 1
weighted_title = title_weight * (df['title'] + '. ')
df['agg_title_body'] = weighted_title + df['body']

### Embed document and clustering
# Split the corpus by year using plain comparisons on the `time` column.
published = df['time']
df2017 = df[published > '2017']
df2016 = df[(published > '2016') & (published < '2017')]
df2015 = df[(published > '2015') & (published < '2016')]

In [4]:
### Lemmatization tool
# NOTE: despite its name, `stemmer` is a WordNet *lemmatizer*, not a stemmer.
# It is used by preprocess_manual() to lemmatize each remaining word.
stemmer = WordNetLemmatizer()
### Canonical spellings that every alias below is rewritten to
UN_WORD = "The United Nations"
US_WORD = "The United States"
NK_WORD = "North Korea"
SK_WORD = "South Korea"

# Aliases grouped by the canonical spelling they map to.
_US_ALIASES = ("U.S.", "US", "USA", "United States", "United States'", "The United States'")
_NK_ALIASES = ("NK", "NK's", "N. Korea", "N. Korea's", "North Korea's")
_SK_ALIASES = ("SK", "SK's", "S. Korea", "S. Korea's", "South Korea's")
_UN_ALIASES = ("United Nations", "United Nations'", "The United Nations'", "UN")

# alias -> canonical spelling. The insertion order (US, NK, SK, UN groups) is
# kept because text_cleaning() walks this mapping in order.
similar_words = {}
for _aliases, _canonical in (
    (_US_ALIASES, US_WORD),
    (_NK_ALIASES, NK_WORD),
    (_SK_ALIASES, SK_WORD),
    (_UN_ALIASES, UN_WORD),
):
    for _alias in _aliases:
        similar_words[_alias] = _canonical

### Transform function
def text_cleaning(s: str, replacements=None):
    """Fold text to plain ASCII and rewrite known aliases to one canonical spelling.

    Steps:
      1. Unicode-normalise (NFKD) and drop combining marks, so accented letters
         become their base letter.
      2. Turn curly single quotes into a straight apostrophe.
      3. Replace every remaining run of non-ASCII characters with one space.
      4. Replace each alias in `replacements` with its canonical spelling.

    Alias replacement is done in a single pass with these rules:
      * aliases are matched literally (they are regex-escaped, so the dots in
        "U.S." no longer match arbitrary characters);
      * an alias only matches as a whole word, so "US" is not rewritten inside
        "USA" or "BUSAN", nor "UN" inside "UNESCO";
      * longer aliases are tried first, and already-replaced text is never
        re-scanned, so "The United States" is not turned into
        "The The United States";
      * when the canonical spelling starts with "The " and the alias does not,
        a preceding "the"/"The" in the text is absorbed into the match
        ("the UN" -> "The United Nations").

    Parameters
    ----------
    s : str
        Raw text.
    replacements : dict, optional
        Mapping alias -> canonical spelling. Defaults to the module-level
        `similar_words` table. An empty mapping disables step 4.

    Returns
    -------
    str
        The cleaned text.
    """
    if replacements is None:
        replacements = similar_words

    # --- 1-3: ASCII folding ------------------------------------------------
    decomposed = unicodedata.normalize('NFKD', s)
    folded = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    folded = folded.replace('\u2019', "'").replace('\u2018', "'")
    folded = re.sub(r'[^\x00-\x7F]+', ' ', folded)

    # --- 4: alias replacement ----------------------------------------------
    # Longest alias first so "United States'" wins over "United States".
    ordered = sorted(
        ((alias, canonical) for alias, canonical in replacements.items() if alias),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )
    if not ordered:
        return folded

    alternatives = []
    for alias, canonical in ordered:
        literal = re.escape(alias)
        if canonical.startswith('The ') and not alias.startswith('The '):
            # Swallow an article already present in the text so the canonical
            # "The ..." does not end up doubled.
            literal = r'(?:[Tt]he\s+)?' + literal
        # One capturing group per alias: match.lastindex tells which one fired.
        alternatives.append('(' + literal + ')')

    # Look-arounds instead of \b because some aliases end in '.' or "'".
    pattern = re.compile(r'(?<!\w)(?:' + '|'.join(alternatives) + r')(?!\w)')
    targets = [canonical for _, canonical in ordered]
    return pattern.sub(lambda match: targets[match.lastindex - 1], folded)

# Using spacy to preprocess
def preprocess_spacy(s: str):
    """Clean one document, parse it with spaCy and drop uninformative tokens.

    The text is first normalised with text_cleaning() and the *cleaned* text is
    what gets parsed (previously the raw text was parsed, so the cleaning
    result was thrown away and the canonical multi-word names below were never
    produced). The multi-word names are then merged into single tokens.

    Parameters
    ----------
    s : str
        Raw document text.

    Returns
    -------
    tuple
        (new_str, tokens, doc) where `tokens` is the list of spaCy tokens that
        are not stop words, punctuation or number-like, `new_str` is their
        lower-cased lemmas joined by single spaces, and `doc` is the
        retokenized spaCy Doc.
    """
    # Change similar terms to the same term, then parse the cleaned text
    cleaned = text_cleaning(s)
    doc = nlp(cleaned)

    # Group tokens: each canonical multi-word name becomes one token
    matcher = Matcher(nlp.vocab)
    matcher.add("TermGroup", [
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "nations"}],
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "states"}],
        [{"LOWER": "north"}, {"LOWER": "korea"}],
        [{"LOWER": "south"}, {"LOWER": "korea"}],
    ])
    # Resolve the spans before opening the retokenizer context.
    spans = [doc[start:end] for _, start, end in matcher(doc)]
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)

    # Remove all stopword, punctuation, number
    tokens = [
        token for token in doc
        if not (token.is_stop or token.is_punct or token.like_num)
    ]
    new_str = ' '.join(token.lemma_.lower() for token in tokens)
    return new_str, tokens, doc

def spacy_tokenizer(s: str):
    """Tokenize one document into lower-cased lemmas using spaCy.

    The text is first normalised with text_cleaning() and the *cleaned* text is
    what gets parsed (previously the raw text was parsed, so the cleaning
    result was thrown away and the canonical multi-word names below were never
    produced). The multi-word names are then merged into single tokens.

    Parameters
    ----------
    s : str
        Raw document text.

    Returns
    -------
    list of str
        Lower-cased lemmas of every token that is not a stop word, punctuation
        or number-like and whose lemma is not blank.
    """
    # Change similar terms to the same term, then parse the cleaned text
    cleaned = text_cleaning(s)
    doc = nlp(cleaned)

    # Group tokens: each canonical multi-word name becomes one token
    matcher = Matcher(nlp.vocab)
    matcher.add("TermGroup", [
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "nations"}],
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "states"}],
        [{"LOWER": "north"}, {"LOWER": "korea"}],
        [{"LOWER": "south"}, {"LOWER": "korea"}],
    ])
    # Resolve the spans before opening the retokenizer context.
    spans = [doc[start:end] for _, start, end in matcher(doc)]
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)

    # Remove all stopword, punctuation, number and blank lemmas
    tokens = []
    for token in doc:
        if token.is_stop or token.is_punct or token.like_num:
            continue
        if token.lemma_.strip() == '':
            continue
        tokens.append(token.lemma_.lower())
    return tokens

### Preprocess function for grouping similar topic
def preprocess_manual(s: str):
    """Regex/NLTK preprocessing: clean, strip noise, drop stop words, lemmatize.

    Parameters
    ----------
    s : str
        Raw document text.

    Returns
    -------
    tuple
        (new_str, tokens) where `tokens` is the list of lemmatized, lower-cased
        words left after removing punctuation, single letters, digits and
        English stop words, and `new_str` is those tokens joined by spaces.
    """
    # Change similar words to the same word.
    # NOTE(review): this used to call `transform_to_similar_sentence`, which is
    # not defined in this notebook; text_cleaning() is the function that does
    # this job here -- confirm this is the intended helper.
    new_str = text_cleaning(s)
    # Remove punctuation and any other non-word character ('_' counts as a
    # word character for \W, so it is listed explicitly)
    new_str = re.sub(r'[\W_]', ' ', new_str)
    # Remove all single characters between whitespace ...
    new_str = re.sub(r'\s+[a-zA-Z]\s+', ' ', new_str)
    # ... and a single character at the very start. The old pattern escaped the
    # caret and looked for a literal '^', which cannot occur at this point.
    # This also covers the former "prefixed 'b'" removal.
    new_str = re.sub(r'^[a-zA-Z]\s+', ' ', new_str)
    # Removing all numbers
    new_str = new_str.translate(str.maketrans('', '', digits))
    # Converting to Lowercase
    new_str = new_str.lower()
    # Lemmatization and remove stopwords. A set gives O(1) membership tests;
    # split() already collapses runs of whitespace.
    stopword_set = set(nltk.corpus.stopwords.words('english'))
    tokens = [stemmer.lemmatize(word) for word in new_str.split() if word not in stopword_set]
    new_str = ' '.join(tokens)

    return new_str, tokens

In [5]:
### Clustering 
def _cluster_means(vectors, labels, n_groups):
    """Return an (n_groups, n_features) array whose row g is the mean of the rows labelled g."""
    means = np.zeros((n_groups, vectors.shape[1]))
    for group in range(n_groups):
        members = labels == group
        size = np.sum(members)
        if size != 0:
            means[group] = np.sum(vectors[members], axis=0) / size
    return means


def _dbscan_labels(vectors, eps, min_samples):
    """Run DBSCAN; return (n_clusters, labels) with noise (-1) relabelled as n_clusters."""
    raw_labels = DBSCAN(eps=eps, min_samples=min_samples).fit(vectors).labels_
    # Number of clusters in labels, ignoring noise if present.
    n_clusters = len(set(raw_labels)) - (1 if -1 in raw_labels else 0)
    return n_clusters, np.where(raw_labels == -1, n_clusters, raw_labels)


def _dbscan_two_stage(doc_vectors, doc_eps, doc_neighbors, event_eps, event_neighbors, verbose):
    """Cluster documents into events, then event centroids into issues, with DBSCAN."""
    n_events_, event_labels = _dbscan_labels(doc_vectors, doc_eps, doc_neighbors)
    if verbose:
        print("1st cluster:\n\tThe number of cluster is {}".format(n_events_))
    # One row per label actually in use. When noise exists it occupies label
    # n_events_, so one extra row is needed (the old code allocated n_events_
    # rows and indexed row n_events_, which raises IndexError).
    event_vectors = _cluster_means(doc_vectors, event_labels, int(event_labels.max()) + 1)

    n_issues_, issue_of_event = _dbscan_labels(event_vectors, event_eps, event_neighbors)
    if verbose:
        print("2nd cluster:\n\tThe number of cluster is {}".format(n_issues_))
    # A document inherits the issue of the event it belongs to.
    issue_labels = issue_of_event[event_labels]
    return n_issues_, n_events_, issue_labels, event_labels


def _plot_eps_curve(eps_values, values, title, ylabel):
    """Plot one metric against the candidate eps values."""
    fig = plt.figure()
    plt.plot(eps_values, values)
    fig.suptitle(title, fontsize=20)
    plt.xlabel('eps', fontsize=18)
    plt.ylabel(ylabel, fontsize=16)
    plt.show()


def _search_doc_eps(doc_vectors, doc_neighbors):
    """Grid-search the document-level DBSCAN eps by silhouette score and plot the curves.

    Returns the eps with the highest positive silhouette score, or 0.0001 when
    no candidate scores above 0.
    """
    doc_eps_list = [0.10 + 0.001 * i for i in range(1, 301)]
    n_docs = doc_vectors.shape[0]  # len() is ambiguous for SciPy sparse matrices
    doc_score = []
    doc_event = []
    best_score, best_eps, best_events = 0, 0.0001, None
    for candidate_eps in doc_eps_list:
        labels = DBSCAN(eps=candidate_eps, min_samples=doc_neighbors).fit(doc_vectors).labels_
        n_distinct = len(set(labels))
        n_found = n_distinct - (1 if -1 in labels else 0)
        # Silhouette is only defined for 2 <= n_labels <= n_samples - 1.
        if 2 <= n_distinct <= n_docs - 1:
            score = silhouette_score(doc_vectors, labels)
        else:
            score = -1
        doc_event.append(n_found)
        doc_score.append(score)
        if score > best_score:
            best_score, best_eps, best_events = score, candidate_eps, n_found
    # Report the best candidate (the old message printed the last eps tried).
    print("Best Silhouete score is {} at eps: {} and number of events: {}".format(best_score, best_eps, best_events))
    _plot_eps_curve(doc_eps_list, doc_score, 'Doc eps and Silhouette score', 'Silhouette score')
    _plot_eps_curve(doc_eps_list, doc_event, 'Doc eps and number of events', 'number of events')
    return best_eps


def document_clustering(doc_vectors, clustering_method='kmeans', evaluate=False,
                        doc_eps=0.190, event_eps=0.50, n_events=10000, n_issues=6000):
    """Two-level clustering: documents -> events -> issues.

    Documents are clustered into events; every event is represented by the mean
    vector of its documents; those event vectors are clustered into issues.

    Parameters
    ----------
    doc_vectors : 2-D array-like
        One row per document. Must support `.shape`, boolean row masking and
        `np.sum(..., axis=0)`.
    clustering_method : {'kmeans', 'DBSCAN', 'agglomerative', 'LDA'}
        Algorithm used at both levels.
    evaluate : bool
        DBSCAN only. When True, grid-search the document-level eps by
        silhouette score (with plots) and use the best value instead of
        `doc_eps`.
    doc_eps, event_eps : float
        DBSCAN eps for the document level and the event level.
    n_events, n_issues : int
        Number of clusters for 'kmeans' and 'agglomerative'.
        NOTE(review): KMeans needs n_events <= number of documents -- confirm
        the default suits the data before using that branch.

    Returns
    -------
    tuple
        (number of issues, number of events, issue label per document,
        event label per document). For DBSCAN the counts exclude noise, and
        noise points are labelled with the value equal to that count.

    Raises
    ------
    NotImplementedError
        For clustering_method='LDA', which has no implementation.
    ValueError
        For any other unsupported method (previously a no-op `assert` on a
        non-empty string let the function silently return None).

    Notes
    -----
    * 'kmeans' and 'agglomerative' now average the member vectors, as the
      DBSCAN branch always did (they used to sum them).
    * 'agglomerative' now returns the same 4-tuple shape as the other
      branches. It passes only `n_clusters`: scikit-learn rejects setting
      both `n_clusters` and `distance_threshold`. The dendrogram plot was
      dropped because `plot_dendrogram` is not defined in this notebook.
    """
    if clustering_method == 'kmeans':
        # Clustering event
        kmeans_event = KMeans(n_clusters=n_events, random_state=69).fit(doc_vectors)
        # Represent each event by the average of its related news
        event_vectors = _cluster_means(doc_vectors, kmeans_event.labels_, n_events)
        # Clustering issue
        kmeans_issue = KMeans(n_clusters=n_issues, random_state=69).fit(event_vectors)
        issue_labels = kmeans_issue.labels_[kmeans_event.labels_]
        return n_issues, n_events, issue_labels, kmeans_event.labels_

    elif clustering_method == 'DBSCAN':
        # min_samples=1 makes every point a core point, so no noise is produced
        doc_neighbors = 1
        event_neighbors = 1
        if evaluate:
            # Find best eps to group same document
            doc_eps = _search_doc_eps(doc_vectors, doc_neighbors)
        # The evaluation path never printed the per-stage cluster counts.
        return _dbscan_two_stage(doc_vectors, doc_eps, doc_neighbors,
                                 event_eps, event_neighbors, verbose=not evaluate)

    elif clustering_method == 'agglomerative':
        # Clustering event
        agg_event = AgglomerativeClustering(n_clusters=n_events).fit(doc_vectors)
        # Represent each event by the average of its related news
        event_vectors = _cluster_means(doc_vectors, agg_event.labels_, n_events)
        # Clustering issue
        agg_issue = AgglomerativeClustering(n_clusters=n_issues).fit(event_vectors)
        issue_labels = agg_issue.labels_[agg_event.labels_]
        return n_issues, n_events, issue_labels, agg_event.labels_

    elif clustering_method == 'LDA':
        raise NotImplementedError("LDA clustering is not implemented")

    else:
        raise ValueError("Doesn't support {}".format(clustering_method))

In [6]:
# Load data
# Precomputed document vectors, one file per year.
# NOTE: the files carry a .csv extension but are read with joblib.load, i.e.
# they are joblib/pickle dumps. Only load files from a trusted source --
# unpickling can execute arbitrary code.
(tfidf_doc2017_vectors,
 tfidf_doc2016_vectors,
 tfidf_doc2015_vectors) = map(joblib.load, (
    'tfidf_titlebody_2017.csv',
    'tfidf_titlebody_2016.csv',
    'tfidf_titlebody_2015.csv',
))

In [7]:
# Two-level DBSCAN clustering (documents -> events -> issues), one run per year.
(tfidf_doc2015_num_issue, tfidf_doc2015_num_event,
 tfidf_doc2015_issue_labels, tfidf_doc2015_event_labels) = document_clustering(
    tfidf_doc2015_vectors, clustering_method='DBSCAN', evaluate=False)
(tfidf_doc2016_num_issue, tfidf_doc2016_num_event,
 tfidf_doc2016_issue_labels, tfidf_doc2016_event_labels) = document_clustering(
    tfidf_doc2016_vectors, clustering_method='DBSCAN', evaluate=False)
(tfidf_doc2017_num_issue, tfidf_doc2017_num_event,
 tfidf_doc2017_issue_labels, tfidf_doc2017_event_labels) = document_clustering(
    tfidf_doc2017_vectors, clustering_method='DBSCAN', evaluate=False)

# Number of documents per event label (Counter is imported in the first cell).
counter_2017_event = Counter(tfidf_doc2017_event_labels)
counter_2016_event = Counter(tfidf_doc2016_event_labels)
counter_2015_event = Counter(tfidf_doc2015_event_labels)

# The ten largest events of each year as (event label, document count) pairs.
top10_event_2017 = counter_2017_event.most_common(10)
top10_event_2016 = counter_2016_event.most_common(10)
top10_event_2015 = counter_2015_event.most_common(10)

1st cluster:
	The number of cluster is 6914
2nd cluster:
	The number of cluster is 1159
1st cluster:
	The number of cluster is 7335
2nd cluster:
	The number of cluster is 1244
1st cluster:
	The number of cluster is 8997
2nd cluster:
	The number of cluster is 1517


In [8]:
# For each of the ten largest 2017 events, list the ten most frequent named
# entities per entity type found in the bodies of its articles.
skipped_labels = ['MONEY','TIME','CARDINAL','QUANTITY','DATE','ORDINAL','PERCENT']
for cluster_id, number_event in top10_event_2017:
    in_cluster = tfidf_doc2017_event_labels == cluster_id
    candidates = df2017[in_cluster]['body']
    # entity label -> every mention text seen for it (first-seen order of labels)
    ent_list = {}
    for parsed in nlp.pipe(candidates):
        for ent in parsed.ents:
            if ent.label_ in skipped_labels:
                continue
            ent_list.setdefault(ent.label_, []).append(ent.text)
    print("Cluster id {}".format(cluster_id))
    for label, mentions in ent_list.items():
        top10item = Counter(mentions).most_common(10)
        print("\t{}: {}".format(label, top10item))

Cluster id 67
	GPE: [('Seoul', 19), ('Gangwon Province', 13), ('Busan', 9), ('Gyeonggi Province', 8), ('Incheon', 5), ('Chungcheong', 5), ('Korea', 4), ('Daegu', 3), ('Gangneung', 2), ('Gwangju', 2)]
	ORG: [('Chuncheon', 9), ('Daejeon', 7), ('the Korea Meteorological Administration', 3), ('KMA', 3), ('Yonhap)The Korea Meteorological Administration', 3), ('Cheongju', 2), ('The Korea Meteorological Administration', 2), ('Yonhap)The KMA', 1), ('mercury', 1), ('C.The KMA', 1)]
	NORP: [('Yonhap)The', 1)]
	PERSON: [('Park Ju-young', 11), ('Gyeonggi', 7), ('North Chungcheong', 2), ('Gangwon', 1), ('Gyeongsang', 1)]
	FAC: [('Gangneung 2', 1)]
	LOC: [('west coast', 2), ('Jeju Island', 1), ('Northern Gyeonggi Province', 1)]
Cluster id 617
	NORP: [('Korean', 6), ('Japanese', 2), ('South Korean', 1), ('Gunbuk', 1), ('Russian', 1)]
	GPE: [('South Korea', 3), ('Gangwon Province', 3), ('Seoul', 2), ('Haman', 1), ('South Gyeongsang Province', 1), ('Yonhap)The', 1), ('Yanggu', 1), ('Korea', 1), ('Mount

In [9]:
# For each of the ten largest 2016 events, list the ten most frequent named
# entities per entity type found in the bodies of its articles.
skipped_labels = ['MONEY','TIME','CARDINAL','QUANTITY','DATE','ORDINAL','PERCENT']
for cluster_id, number_event in top10_event_2016:
    in_cluster = tfidf_doc2016_event_labels == cluster_id
    candidates = df2016[in_cluster]['body']
    # entity label -> every mention text seen for it (first-seen order of labels)
    ent_list = {}
    for parsed in nlp.pipe(candidates):
        for ent in parsed.ents:
            if ent.label_ in skipped_labels:
                continue
            ent_list.setdefault(ent.label_, []).append(ent.text)
    print("Cluster id {}".format(cluster_id))
    for label, mentions in ent_list.items():
        top10item = Counter(mentions).most_common(10)
        print("\t{}: {}".format(label, top10item))

Cluster id 16
	GPE: [('South Korea', 14), ('North Korea', 12), ('Seoul', 6), ("North Korea's", 5), ('Pyongyang', 5), ('Koreas', 5), ('North', 1), ('Korea', 1), ('Washington', 1)]
	ORG: [('Radio Pyongyang', 5), ("Workers' Party of Korea", 1)]
	LAW: [('Page 894', 1)]
	EVENT: [('the Cold War', 5)]
	PERSON: [('Yonhap', 4)]
	NORP: [('North Korean', 1)]
	LOC: [('North', 1)]
Cluster id 5095
	NORP: [('South Korean', 1), ('Koreans', 1)]
	ORG: [('the National Election Commission', 4), ('NEC', 3), ('National Assembly', 1), ('Saenuri Party', 1), ('Daejeon', 1)]
	GPE: [('Seoul', 4), ('South Korea’s', 3), ('Daegu', 2), ('South Jeolla Province', 2), ('South Jeolla', 1), ('Sejong', 1), ('Busan', 1), ('Gangwon Province', 1)]
	PERSON: [('Yonhap', 1)]
	EVENT: [('Daegu', 1)]
Cluster id 431
	PERSON: [('Park', 28), ('Park Geun-hye', 14), ('Choi Soon-sil', 6), ('Kim Seong-ju', 3), ('Choi', 3), ('Cheong Wa Dae', 3), ('Kim Hye-yeon', 2), ('Choi Jeong-seon', 2), ('Park Jae-young', 2), ('Ock Hyun-ju & Bak Se-hwa

In [10]:
# For each of the ten largest 2015 events, list the ten most frequent named
# entities per entity type found in the bodies of its articles.
skipped_labels = ['MONEY','TIME','CARDINAL','QUANTITY','DATE','ORDINAL','PERCENT']
for cluster_id, number_event in top10_event_2015:
    in_cluster = tfidf_doc2015_event_labels == cluster_id
    candidates = df2015[in_cluster]['body']
    # entity label -> every mention text seen for it (first-seen order of labels)
    ent_list = {}
    for parsed in nlp.pipe(candidates):
        for ent in parsed.ents:
            if ent.label_ in skipped_labels:
                continue
            ent_list.setdefault(ent.label_, []).append(ent.text)
    print("Cluster id {}".format(cluster_id))
    for label, mentions in ent_list.items():
        top10item = Counter(mentions).most_common(10)
        print("\t{}: {}".format(label, top10item))

Cluster id 2954
	GPE: [('South Korea', 12), ('Saudi Arabia', 6)]
	LOC: [('Middle East Respiratory Syndrome', 6)]
	ORG: [('MERS', 18), ('the Ministry of Health and Welfare', 6), ('the health ministry', 4)]
	PERSON: [('MERS', 6), ('Yonhap', 6)]
Cluster id 132
	GPE: [('South Korea', 12), ('Statistics Korea', 5)]
	ORG: [('the Organization for Economic Cooperation and Development', 3), ('the statistics office', 2)]
	LOC: [('Asia', 5)]
	PERSON: [('Yonhap', 5)]
Cluster id 5034
	GPE: [("South Korea's", 6)]
	ORG: [('FSC', 7), ('The Financial Services Commission', 3), ('central bank data', 2), ('The Korea Housing Finance Corp.', 2), ('Financial Services Commission', 1), ('the Bank of Korea', 1), ('Chung-ang University', 1), ('Chungnam National University', 1), ('MBS', 1)]
	PERSON: [('Yonhap', 4), ('Shin Je-yoon', 2), ('Yim Jong-yong', 1), ('Yim', 1), ('Yeom Myung-bae', 1)]
	NORP: [('South Koreans', 1), ('South Korean', 1)]
	WORK_OF_ART: [('Park Chang-kyun', 1)]
Cluster id 5452
	GPE: [('Japan', 3