In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
import random
import unicodedata
from string import punctuation
from string import digits
from nltk.stem import WordNetLemmatizer
import joblib

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from sklearn.decomposition import LatentDirichletAllocation

from tqdm import tqdm
import matplotlib.pyplot as plt

import spacy
from spacy.matcher import Matcher

from rake_nltk import Rake

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
### Reading data
data_dir = 'data/'
filename_prefix = 'koreaherald_1517_'

# The corpus is split over eight JSON shards (suffixes 0..7); read each one and
# stack them into a single frame.
shard_paths = [os.path.join(data_dir, filename_prefix + str(part) + '.json') for part in range(8)]
df = pd.concat([pd.read_json(path) for path in shard_paths])

# Replace the per-shard row labels with a fresh 0..n-1 index, strip stray
# whitespace from the column names, then discard the old labels that
# reset_index() preserved in the 'index' column.
df = df.reset_index()
df = df.rename(columns={column: column.strip() for column in df.columns})
df = df.drop('index', axis=1)

df.shape

(23769, 6)

In [6]:
# Load the spaCy pipeline 'en_core_web_lg'. preprocess_spacy and spacy_tokenizer
# below call this object to parse text and pass nlp.vocab to the Matcher.
nlp = spacy.load('en_core_web_lg')

In [7]:
### Lemmatization tool
# NOTE: despite the variable name this is a WordNet lemmatizer, not a stemmer;
# preprocess_manual calls stemmer.lemmatize() on every token it keeps.
stemmer = WordNetLemmatizer()
### Change similar words to the same word
# Canonical spellings that every alias below is rewritten to.
UN_WORD = "The United Nations"
US_WORD = "The United States"
NK_WORD = "North Korea"
SK_WORD = "South Korea"

# Aliases grouped by the canonical form they map to. The order of the groups
# and of the aliases inside each group fixes the iteration order of
# `similar_words`, which text_cleaning walks through.
_ALIASES_BY_CANONICAL = [
    # Change to "The United States"
    (US_WORD, ["U.S.", "US", "USA", "United States", "United States'", "The United States'"]),
    # Change to "North Korea"
    (NK_WORD, ["NK", "NK's", "N. Korea", "N. Korea's", "North Korea's"]),
    # Change to "South Korea"
    (SK_WORD, ["SK", "SK's", "S. Korea", "S. Korea's", "South Korea's"]),
    # Change to "The United Nations"
    (UN_WORD, ["United Nations", "United Nations'", "The United Nations'", "UN"]),
]

# Flat alias -> canonical lookup.
similar_words = {
    alias: canonical
    for canonical, aliases in _ALIASES_BY_CANONICAL
    for alias in aliases
}

### Transform function
def text_cleaning(s: str, replacements=None):
    """Normalise a text to plain ASCII and rewrite known aliases to one canonical term.

    Steps:
      1. Fold accented letters to their base letter (NFKD, combining marks dropped).
      2. Turn curly single quotes into ASCII apostrophes.
      3. Replace every remaining run of non-ASCII characters with a space.
      4. Rewrite each alias in ``replacements`` to its canonical form.

    The aliases are matched literally (regex metacharacters such as the dots
    in "U.S." are escaped), only as whole terms (so "US" does not fire inside
    "USFK" and "UN" does not fire inside "UNESCO"), longest alias first, and
    in a single pass so that text produced by one replacement is never
    rewritten again by another.

    Parameters
    ----------
    s : str
        Raw text.
    replacements : dict[str, str] or None
        Alias -> canonical mapping. Defaults to the notebook-level
        ``similar_words`` table.

    Returns
    -------
    str
        The cleaned text.
    """
    if replacements is None:
        replacements = similar_words

    # Fold accents: decompose, then drop the combining marks (category 'Mn').
    s = ''.join(
        ch for ch in unicodedata.normalize('NFKD', s)
        if unicodedata.category(ch) != 'Mn'
    )
    # Curly quotes -> ASCII apostrophe, so possessive aliases like "NK's" match.
    s = s.replace("’", "'").replace("‘", "'")
    # Anything still outside ASCII becomes a single space.
    s = re.sub(r'[^\x00-\x7F]+', ' ', s)

    if not replacements:
        return s

    # Canonical forms map to themselves so that text that is already canonical
    # ("The United States") is consumed whole rather than having its tail
    # ("United States") expanded a second time.
    lookup = {canonical: canonical for canonical in replacements.values()}
    lookup.update(replacements)

    # Longest-first alternation: "NK's" wins over "NK", "USA" over "US", etc.
    alternatives = sorted(lookup, key=len, reverse=True)
    alias_pattern = re.compile(
        r"(?<![A-Za-z0-9])(?:"
        + "|".join(re.escape(alias) for alias in alternatives)
        + r")(?![A-Za-z0-9])"
    )
    s = alias_pattern.sub(lambda match: lookup[match.group(0)], s)

    # An alias preceded by an article ("the U.S.") would otherwise end up as
    # "the The United States"; collapse the doubled article.
    for canonical in set(lookup.values()):
        if canonical.startswith("The "):
            s = re.sub(r"\b[Tt]he\s+" + re.escape(canonical), lambda _m, c=canonical: c, s)
    return s

# Using spacy to preprocess
def preprocess_spacy(s: str):
    """Clean a text, parse it with spaCy and drop stop words, punctuation and numbers.

    The text is first passed through ``text_cleaning`` and the *cleaned* text
    is what gets parsed (the previous version parsed the raw input, so the
    alias normalisation had no effect). Multi-word country/organisation names
    are then merged into single tokens so they survive as one term.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    tuple
        ``(new_str, tokens, doc)`` where ``new_str`` is the space-joined
        lower-cased lemmas of the kept tokens, ``tokens`` the kept spaCy
        tokens and ``doc`` the parsed document after merging.
    """
    # Change similar terms to the same term, then parse the cleaned text
    cleaned = text_cleaning(s)
    doc = nlp(cleaned)

    # Group tokens: multi-word names that should be treated as one token
    matcher = Matcher(nlp.vocab)
    token_groupup_pattern = [
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "nations"}],
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "states"}],
        [{"LOWER": "north"}, {"LOWER": "korea"}],
        [{"LOWER": "south"}, {"LOWER": "korea"}],
    ]
    matcher.add("TermGroup", token_groupup_pattern)
    # Collect the spans before opening the retokenizer; the merges themselves
    # are applied together when the `with` block exits.
    spans_to_merge = [doc[start:end] for _match_id, start, end in matcher(doc)]
    with doc.retokenize() as retokenizer:
        for span in spans_to_merge:
            retokenizer.merge(span)

    # Remove all stopword, punctuation, number
    tokens = [
        token for token in doc
        if not token.is_stop and not token.is_punct and not token.like_num
    ]
    new_str = ' '.join(token.lemma_.lower() for token in tokens)
    return new_str, tokens, doc

def spacy_tokenizer(s: str):
    """Tokenizer callable: clean a text and return its lower-cased lemmas.

    The text is first passed through ``text_cleaning`` and the *cleaned* text
    is what gets parsed (the previous version parsed the raw input, so the
    alias normalisation had no effect). Multi-word country/organisation names
    are merged into single tokens; stop words, punctuation, number-like
    tokens and tokens whose lemma is blank are dropped.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    list[str]
        Lower-cased lemmas of the kept tokens, in document order.
    """
    # Change similar terms to the same term, then parse the cleaned text
    cleaned = text_cleaning(s)
    doc = nlp(cleaned)

    # Group tokens: multi-word names that should be treated as one token
    matcher = Matcher(nlp.vocab)
    token_groupup_pattern = [
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "nations"}],
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "states"}],
        [{"LOWER": "north"}, {"LOWER": "korea"}],
        [{"LOWER": "south"}, {"LOWER": "korea"}],
    ]
    matcher.add("TermGroup", token_groupup_pattern)
    # Collect the spans before opening the retokenizer; the merges themselves
    # are applied together when the `with` block exits.
    spans_to_merge = [doc[start:end] for _match_id, start, end in matcher(doc)]
    with doc.retokenize() as retokenizer:
        for span in spans_to_merge:
            retokenizer.merge(span)

    # Remove all stopword, punctuation, number and blank lemmas
    tokens = [
        token.lemma_.lower() for token in doc
        if not token.is_stop
        and not token.is_punct
        and not token.like_num
        and token.lemma_.strip() != ''
    ]
    return tokens

### Preprocess function for grouping similar topic
def preprocess_manual(s: str):
    """Regex/NLTK preprocessing: normalise aliases, strip noise, lemmatize.

    Pipeline: alias normalisation via ``text_cleaning`` -> punctuation and
    other non-word characters replaced by spaces -> stand-alone single
    letters removed -> digits removed -> lower-cased -> English stop words
    dropped -> remaining words lemmatized with the WordNet lemmatizer.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    tuple
        ``(new_str, tokens)``: the space-joined lemmas and the lemma list.
    """
    # Change similar words to the same word.
    # NOTE(review): the original called `transform_to_similar_sentence`, which is
    # not defined anywhere in this notebook; `text_cleaning` is the helper that
    # performs this step here — confirm it is the intended replacement.
    new_str = text_cleaning(s)
    # Remove punctuation
    punctuation_chars = set(punctuation)
    new_str = ''.join(" " if ch in punctuation_chars else ch for ch in new_str)
    new_str = re.sub(r'\W', ' ', new_str)
    # Remove all single characters. A word-boundary match also covers a single
    # letter at the very start or end of the text and runs of adjacent single
    # letters, which the previous whitespace-anchored patterns missed (the
    # start-of-text pattern was written as `\^`, i.e. a literal caret).
    new_str = re.sub(r'\b[a-zA-Z]\b', ' ', new_str)
    # Removing all numbers
    new_str = new_str.translate(str.maketrans('', '', digits))
    # Converting to Lowercase
    new_str = new_str.lower()
    # Lemmatization and remove stopwords. split() already collapses any run of
    # whitespace, so no separate space-squeezing pass is needed.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    tokens = [stemmer.lemmatize(word) for word in new_str.split() if word not in stopwords]
    new_str = ' '.join(tokens)

    return new_str, tokens

In [8]:
### Clustering 
def _cluster_centroids(vectors, labels, n_slots):
    """Return an (n_slots, n_features) array with the mean vector of each label 0..n_slots-1."""
    centroids = np.zeros((n_slots, vectors.shape[1]))
    for cluster_id in range(n_slots):
        members = labels == cluster_id
        n_members = np.sum(members)
        if n_members != 0:
            # asarray/ravel keeps this working whether the row sum comes back
            # as a 1-D array or as a 1 x n_features matrix.
            centroids[cluster_id] = np.asarray(vectors[members].sum(axis=0)).ravel() / n_members
    return centroids


def _dbscan_labels(vectors, eps, min_samples):
    """Run DBSCAN; return (n_clusters, labels, n_slots) with noise (-1) relabelled as id n_clusters."""
    raw_labels = DBSCAN(eps=eps, min_samples=min_samples).fit(vectors).labels_
    has_noise = -1 in raw_labels
    # Number of clusters in labels, ignoring noise if present.
    n_clusters = len(set(raw_labels)) - (1 if has_noise else 0)
    labels = np.where(raw_labels == -1, n_clusters, raw_labels)
    # One extra slot is needed only when a noise bucket actually exists.
    n_slots = n_clusters + (1 if has_noise else 0)
    return n_clusters, labels, n_slots


def _plot_eps_curve(eps_values, values, title, ylabel):
    """Plot one metric of the eps sweep against eps."""
    fig = plt.figure()
    plt.plot(eps_values, values)
    fig.suptitle(title, fontsize=20)
    plt.xlabel('eps', fontsize=18)
    plt.ylabel(ylabel, fontsize=16)
    plt.show()


def _select_doc_eps(doc_vectors, min_samples, fallback_eps):
    """Sweep DBSCAN eps over 0.101..0.400, plot the results and return the eps with the best silhouette score."""
    eps_grid = [0.10 + 0.001 * i for i in range(1, 301)]
    n_docs = doc_vectors.shape[0]
    scores = []
    event_counts = []
    best_score = 0
    best_eps = None
    best_n_events = None
    for eps in eps_grid:
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit(doc_vectors).labels_
        n_labels = len(set(labels))
        n_events = n_labels - (1 if -1 in labels else 0)
        # The silhouette score is only computed for 2 .. n_docs-1 distinct labels.
        if 2 <= n_labels <= n_docs - 1:
            score = silhouette_score(doc_vectors, labels)
        else:
            score = -1
        event_counts.append(n_events)
        scores.append(score)
        if score > best_score:
            best_score, best_eps, best_n_events = score, eps, n_events

    if best_eps is None:
        # No eps produced a positive score: keep the configured default.
        print("No eps gave a positive Silhouette score; keeping eps: {}".format(fallback_eps))
        best_eps = fallback_eps
    else:
        # Report the eps / event count that belong to the best score (not the
        # values left over from the last sweep iteration).
        print("Best Silhouete score is {} at eps: {} and number of events: {}".format(
            best_score, best_eps, best_n_events))

    _plot_eps_curve(eps_grid, scores, 'Doc eps and Silhouette score', 'Silhouette score')
    _plot_eps_curve(eps_grid, event_counts, 'Doc eps and number of events', 'number of events')
    return best_eps


def _plot_dendrogram(model, **kwargs):
    """Draw the dendrogram of a fitted AgglomerativeClustering model (needs `distances_`)."""
    from scipy.cluster.hierarchy import dendrogram

    # Count the samples under each merge node to build a SciPy linkage matrix.
    n_samples = len(model.labels_)
    counts = np.zeros(model.children_.shape[0])
    for node, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[node] = current_count
    linkage_matrix = np.column_stack([model.children_, model.distances_, counts]).astype(float)
    dendrogram(linkage_matrix, **kwargs)


def document_clustering(doc_vectors, clustering_method='kmeans', evaluate=False):
    """Two-level clustering: documents -> events, then events -> issues.

    Documents are clustered into "events"; each event is represented by the
    mean vector of its documents; the event vectors are clustered again into
    "issues". Every document inherits the issue of its event.

    Parameters
    ----------
    doc_vectors : array-like of shape (n_docs, n_features)
        One vector per document.
    clustering_method : {'kmeans', 'DBSCAN', 'agglomerative'}
        Algorithm used at both levels.
    evaluate : bool
        Only used by 'DBSCAN': when true, the document-level eps is chosen by
        a silhouette-score sweep (with diagnostic plots) instead of the
        built-in default.

    Returns
    -------
    tuple
        For 'kmeans' and 'DBSCAN':
        ``(n_issues, n_events, issue_labels, event_labels)`` where the label
        arrays have one entry per document. For 'DBSCAN' the counts exclude
        noise; noise points, if any, carry the label ``n_events`` /
        ``n_issues``.
        For 'agglomerative' the first two items are the fitted issue-level and
        event-level models rather than counts.

    Raises
    ------
    NotImplementedError
        For ``clustering_method='LDA'``, which has no implementation.
    ValueError
        For any other unknown method.

    Notes
    -----
    Event and issue representatives are cluster *means* in every branch, as
    the original comments ("average ... of related news") describe; earlier
    the kmeans/agglomerative branches used an unnormalised sum.
    """
    if clustering_method == 'kmeans':
        # Hyperparameters
        k_event = 10000
        k_issue = 6000

        # Clustering event
        event_labels = KMeans(n_clusters=k_event, random_state=69).fit(doc_vectors).labels_
        # Represent each event by the average of its related news
        event_vectors = _cluster_centroids(doc_vectors, event_labels, k_event)

        # Clustering issue
        issue_of_event = KMeans(n_clusters=k_issue, random_state=69).fit(event_vectors).labels_
        # Each document takes the issue of the event it belongs to
        issue_labels = issue_of_event[event_labels]

        return k_issue, k_event, issue_labels, event_labels

    elif clustering_method == 'DBSCAN':
        # Hyperparameters
        doc_eps = 0.19
        doc_neighbors = 1
        event_eps = 0.50
        event_neighbors = 1

        if evaluate:
            # Find best eps to group same document. (A matching sweep for
            # event_eps is not run; event_eps stays at its default of 0.5.)
            doc_eps = _select_doc_eps(doc_vectors, doc_neighbors, doc_eps)

        # Clustering event
        n_events_, event_labels, n_event_slots = _dbscan_labels(doc_vectors, doc_eps, doc_neighbors)
        print("1st cluster:\n\tThe number of cluster is {}".format(n_events_))
        # Represent each event by the average of its related news. The array is
        # sized to include the noise bucket when one exists, so the noise label
        # never indexes past the end.
        event_vectors = _cluster_centroids(doc_vectors, event_labels, n_event_slots)

        # Clustering issue
        n_issues_, issue_of_event, _ = _dbscan_labels(event_vectors, event_eps, event_neighbors)
        print("2nd cluster:\n\tThe number of cluster is {}".format(n_issues_))
        # Each document takes the issue of the event it belongs to
        issue_labels = issue_of_event[event_labels]

        return n_issues_, n_events_, issue_labels, event_labels

    elif clustering_method == 'agglomerative':
        # Imported here because the notebook's import cell does not bring it in.
        from sklearn.cluster import AgglomerativeClustering

        # Hyperparameters
        n_events = 10000
        n_issues = 6000

        # Clustering event. scikit-learn accepts only one of n_clusters /
        # distance_threshold, so only n_clusters is given; the full tree and the
        # merge distances are requested because the dendrogram needs them.
        agg_event = AgglomerativeClustering(
            n_clusters=n_events, compute_full_tree=True, compute_distances=True
        ).fit(doc_vectors)
        # Represent each event by the average of its related news
        event_vectors = _cluster_centroids(doc_vectors, agg_event.labels_, n_events)

        plt.title("Hierarchical Clustering Dendrogram")
        # plot the top three levels of the dendrogram
        _plot_dendrogram(agg_event, truncate_mode="level", p=3)
        plt.xlabel("Number of points in node (or index of point if no parenthesis).")
        plt.show()

        # Clustering issue
        agg_issue = AgglomerativeClustering(n_clusters=n_issues).fit(event_vectors)
        # Each document takes the issue of the event it belongs to
        issue_labels = agg_issue.labels_[agg_event.labels_]

        return agg_issue, agg_event, issue_labels, agg_event.labels_

    elif clustering_method == 'LDA':
        raise NotImplementedError("clustering_method 'LDA' is not implemented")

    else:
        # `assert("...")` on a non-empty string never fails, so raise explicitly.
        raise ValueError("Doesn't support {}".format(clustering_method))

In [60]:
# # Build the bag-of-words document-term matrix (run once; the results are cached with joblib)
# vectorizer = CountVectorizer(tokenizer=spacy_tokenizer)
# data_vectorized = vectorizer.fit_transform(df['body'])
# joblib.dump(vectorizer, '../data/vectorizer.csv')
# joblib.dump(data_vectorized, '../data/data_vectorized.csv')

['../data/data_vectorized.csv']

In [61]:
# Load the cached vectorizer and document-term matrix written by the joblib.dump
# calls in the commented-out cell above.
# NOTE(review): despite the .csv extension these are joblib dumps, not CSV text.
# NOTE(review): joblib.load unpickles its input — only load files from a trusted source.
vectorizer = joblib.load('../data/vectorizer.csv')
data_vectorized = joblib.load('../data/data_vectorized.csv')

In [64]:
# Column of the merged 'north korea' term in the document-term matrix.
nk_index = np.argwhere(vectorizer.get_feature_names_out() == 'north korea')[0,0]
# Row numbers of the documents in which that term occurs at least once.
# (The original read from `X`, which is never defined in this notebook; the
# matrix loaded above is `data_vectorized`.)
nk_doc_index = [ row for row, _col in np.argwhere(data_vectorized[:, nk_index]>0)]
# Subset of the corpus mentioning North Korea; reset_index() keeps each row's
# position in `df` in an 'index' column.
nk_df = df.iloc[nk_doc_index].reset_index()

In [68]:
# Display the North Korea subset built in the previous cell.
nk_df

Unnamed: 0,index,title,author,time,description,body,section
0,1,[Weekender] Korea’s dynamic 2017,Choi He-suk,2018-01-01 13:22:00,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs
1,3,[Newsmaker] Panamanian vessel probed over susp...,Yonhap,2017-12-31 14:55:00,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea
2,4,Hong Kong ship crew questioned in S. Korea for...,AFP,2017-12-30 15:44:00,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea
3,7,Secret Sauce? Kim Jong-un applies science to k...,AP,2017-12-30 12:10:00,Kim Jong Un wants to turn the art of kimchi-ma...,Kim Jong Un wants to turn the art of kimchi-ma...,North Korea
4,8,N. Korea says there will be no change to its n...,Yonhap,2017-12-30 10:31:00,North Korea will continue to enhance its nucle...,North Korea will continue to enhance its nucle...,North Korea
...,...,...,...,...,...,...,...
9103,23760,‘Responsibility to protect does not apply to N...,Korea Herald,2015-01-01 21:21:00,This is the second installment in a special Ne...,This is the second installment in a special Ne...,Defense
9104,23764,N. Korean leader's speech arouses cautious opt...,KH디지털2,2015-01-01 13:36:00,North Korean leader Kim Jong-un's New Year's D...,North Korean leader Kim Jong-un's New Year's D...,North Korea
9105,23766,Ex-U.S. envoy calls for clearer communication ...,KH디지털2,2015-01-01 09:27:00,The United States should make its thoughts on ...,The United States should make its thoughts on ...,North Korea
9106,23767,U.S. imposes sanctions on N. Korean firm,KH디지털2,2015-01-01 09:25:00,The United States has imposed sanctions on a N...,The United States has imposed sanctions on a N...,North Korea


In [33]:
# lda = LatentDirichletAllocation(n_components=50, random_state=0)
# lda.fit(data_vectorized)
# joblib.dump(lda, '../data/lda.csv')

['../data/lda.csv']

In [37]:
# Load the fitted LDA model cached by the joblib.dump call in the commented-out cell above.
# NOTE(review): despite the .csv extension this is a joblib dump; joblib.load
# unpickles its input, so only load files from a trusted source.
lda = joblib.load('../data/lda.csv')

In [34]:
def print_top_words(model, vectorizer, n_top_words):
    """Print the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    model : fitted topic model
        Must expose ``components_`` with one row of term weights per topic.
    vectorizer : fitted vectorizer
        Supplies the vocabulary. ``get_feature_names_out()`` is used when
        available (``get_feature_names()`` was removed in scikit-learn 1.2),
        with a fallback to the old method for older versions.
    n_top_words : int
        Number of terms to print per topic.

    Prints one "Topic #i: ..." line per topic, each preceded by a blank line,
    followed by a final blank line. Returns None.
    """
    names_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    feature_names = names_getter()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed slice walks the last
        # n_top_words entries from largest weight downwards.
        top_term_ids = topic.argsort()[:-n_top_words - 1:-1]
        message = "\nTopic #%d: " % topic_idx
        message += " ".join(str(feature_names[i]) for i in top_term_ids)
        print(message)
    print()

In [35]:
# Print the 25 highest-weighted terms of every topic in the loaded LDA model.
print_top_words(lda, vectorizer, n_top_words=25)


Topic #0: police say old child man victim find suspect kim death year family case woman court mother take report father abuse lee kill arrest body die

Topic #1: korean soldier military say north south border army line fire official north korea south korea guard near area troop zone time take dmz command staff cross accord

Topic #2: north north korea sanction nuclear test resolution say council u.n pyongyang security international korean missile launch china un ban rocket country new long report range adopt

Topic #3: korean south say ministry country foreign chinese south korea koreans refugee official government yonhap people china group visa fishing boat year accord national international authority war

Topic #4: minister ministry foreign meeting seoul south korea official talk hold defense say discuss korean cooperation country vice issue security yonhap visit deputy south counterpart affair bilateral

Topic #5: trump nuclear say president south korea donald earthquake iran count



In [None]:
'''
LDA 50 topics
Topic 0: Tragedy
Topic 1: Soldier
Topic 2: nuclear / missile
Topic 3: ?
Topic 4: ?
Topic 5: war 
Topic 6: Travel?
Topic 7: Sex violence
Topic 8: ?
Topic 9: ?
Topic 10: ?
Topic 11: ?
Topic 12: Olympics game
Topic 13: ?
Topic 14: Army
Topic 15: Dokdo
Topic 16: 
Topic 17: Moon and Park
Topic 33: missile nuclear
Topic 34: airplane
Topic 35: President
Topic 36: missile nuclear
Topic 41: science technology
Topic 46: disease
Topic 47: blood donate
Topic 48: Japanese and sex victim
Topic 49: Worker, job market
'''

In [70]:
# nk_doc_topic_dist = pd.DataFrame(lda.transform(data_vectorized)).iloc[nk_doc_index]
# nk_doc_topic_dist.to_csv('../data/nk_doc_topic_dist.csv', index=False)

KeyboardInterrupt: 

In [75]:
# Load the cached per-document topic distribution of the North Korea subset
# (written by the to_csv call in the commented-out cell above).
# NOTE(review): presumably the column labels come back from CSV as strings
# ('0', '1', ...), which would be why most_related_docs formats topic_index
# as a string — verify.
nk_doc_topic_dist = pd.read_csv('../data/nk_doc_topic_dist.csv')

In [None]:
def get_k_nearest_docs(doc_dist, k=5, get_dist=False, topic_dist=None):
    '''
    Find the articles whose topic distribution is closest to a given one.

    doc_dist: topic distribution (sums to 1) of one article
    k: number of neighbours to return
    get_dist: when true, also return the distances of those neighbours
    topic_dist: DataFrame with one topic distribution per row; defaults to the
        notebook-level `nk_doc_topic_dist`

    Returns the index of the k nearest articles (as by Jensen–Shannon distance
    in topic space); with get_dist=True returns (index, distances) where
    distances is the matching Series. Rows at distance exactly 0 (the query
    article itself and exact duplicates) are excluded.
    '''
    # Imported locally: the notebook's import cell does not bring in
    # jensenshannon, so the original body failed with a NameError.
    from scipy.spatial.distance import jensenshannon

    if topic_dist is None:
        topic_dist = nk_doc_topic_dist

    distances = topic_dist.apply(lambda row: jensenshannon(row, doc_dist), axis=1)
    # Filter and rank once; both return values come from the same selection.
    nearest = distances[distances != 0].nsmallest(n=k)

    if get_dist:
        return nearest.index, nearest
    return nearest.index

In [86]:
def most_related_docs(topic_index, num_docs=5):
    """Return the rows of ``nk_df`` for the documents weighted most heavily on one topic.

    ``topic_index`` is turned into a string and used as a column label of
    ``nk_doc_topic_dist``; the rows are ranked by that column in descending
    order and the index labels of the first ``num_docs`` rows are used as
    positions into ``nk_df``.
    """
    topic_column = '{}'.format(topic_index)
    ranked = nk_doc_topic_dist.sort_values(by=[topic_column], ascending=False)
    top_positions = ranked[:num_docs].index
    return nk_df.iloc[top_positions]

In [88]:
# Nuclear, missile (topic 2): titles of the 5 nk_df articles weighted most heavily on this topic
most_related_docs(2)['title'].tolist()

['China vows to ensure full implementation of U.N. sanctions against N.K,',
 'China orders North Korean firms to close down',
 "N. Korea's nuclear test site shows 'high-level activity': 38 North",
 'EU tightens sanctions on N. Korea in line with UN resolution',
 "U.N. ban on mineral exports to hurt North Korea's revenue: Seoul"]

In [91]:
# Nuclear, missile (topic 33): titles of the 5 nk_df articles weighted most heavily on this topic
most_related_docs(33)['title'].tolist()

['NK’s latest missile a ‘new type’: S. Korean military',
 'Two NK missiles on mobile launchers could either be Rodong or part of ICBMs: US expert',
 "Seoul confirms N. Korea's push to develop solid-fuel rockets",
 'N. Korea may be trying to develop new medium-range solid-fuel missile',
 "N. Korea's new missile could be deployable by 2018: expert"]

In [92]:
# Nuclear, missile (topic 36): titles of the 5 nk_df articles weighted most heavily on this topic
most_related_docs(36)['title'].tolist()

["Seoul: N. Korea's provocation will only deepen its isolation",
 "NK vows to take 'toughest' military actions as US sends aircraft carrier",
 "U.S. assures 'ironclad' commitment to defending Korea",
 'Seoul, Washington defense chiefs agree to strengthen deployment of US strategic assets',
 'S. Korea condemns NK‘s missile provocation, steps up diplomatic drive']

In [89]:
# Sex violence (topic 7): titles of the 5 nk_df articles weighted most heavily on this topic
most_related_docs(7)['title'].tolist()

['S. Korea repeals anti-cheating law',
 'Expats decry military for gay sex conviction',
 'Korean military court convicts soldier over gay sex',
 '[Herald Interview] The price of faith for conscientious objectors',
 '[Election 2017] Gender biased language backfires on campaign trail']

In [93]:
# Olympics game (topic 12): titles of the 5 nk_df articles weighted most heavily on this topic
most_related_docs(12)['title'].tolist()

["PyeongChang 'ready to welcome the world' at next Winter Olympics: IOC",
 'N. Korean IOC member keeps mum on Olympic co-hosting with S. Korea',
 'Moon says sports can create peace, invites N. Korea to PyeongChang Olympics',
 "'NK submitted document for PyeongChang Paralympics participation'",
 'North Korean IOC member says joint Korean team at PyeongChang 2018 may be difficult']

In [94]:
# Science and Technology (topic 41): titles of the 5 nk_df articles weighted most heavily on this topic
most_related_docs(41)['title'].tolist()

['Civic groups demand U.S. apology for anthrax delivery',
 'USFK vows transparency in bio defense training in S. Korea',
 "U.S. calls for enhancing defense against 'very real' biological weapons threats from N. Korea",
 '‘USFK conducted 16 covert anthrax tests since 2009’',
 'S. Korea, U.S. agree to on-site probe of Osan base next week over anthrax shipment']

In [95]:
# Disease (topic 46): titles of the 5 nk_df articles weighted most heavily on this topic
most_related_docs(46)['title'].tolist()

['N. Korea claims to have developed panacea for MERS',
 'North Korea ends preventive steps against MERS virus',
 'NK soldier suffering from pneumonia and blood poisoning: report',
 'Number of malaria patients has dropped steadily in recent years: data',
 "Gov't to allow blood drives in malaria-prone areas to cope with low reserves"]

In [49]:
# NOTE(review): `df_topic2_5docs` is not defined anywhere in this notebook, so
# this cell only worked from leftover kernel state and will raise NameError on
# a fresh Restart & Run All — define it in an earlier cell or remove this cell.
df_topic2_5docs.index.tolist()

[12784, 2115, 12168, 1938, 15354]