In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
import random
import unicodedata
from string import punctuation
from string import digits
from nltk.stem import WordNetLemmatizer
import joblib

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score

import matplotlib.pyplot as plt

import spacy
from spacy.matcher import Matcher

from rake_nltk import Rake

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Modern\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Modern\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
### Reading data
data_dir = 'data/'
filename_prefix = 'koreaherald_1517_'
df = []

for i in range(8):
    df.append(pd.read_json(os.path.join(data_dir, filename_prefix + str(i) + '.json')))
df = pd.concat(df)
df.reset_index(inplace=True)
df = df.rename(columns=dict(zip(df.columns,[df.columns[i].strip() for i in range(len(df.columns))])))
df.drop('index', inplace=True, axis=1)

# Aggregate title and content
title_weight = 1
df['agg_title_body'] = title_weight*(df['title']+'. ') + df['body']

df.shape

(23769, 7)

In [3]:
# Load large spacy model 
nlp = spacy.load('en_core_web_lg')

### Embed document and clustering
df2017 = df['2017' < df['time']]
df2016 = df[('2016' < df['time']) & (df['time'] < '2017')]
df2015 = df[('2015' < df['time']) & (df['time'] < '2016')]

In [4]:
### Lemmatization tool
stemmer = WordNetLemmatizer()
### Change similar words to the same word
UN_WORD = "The United Nations"
US_WORD = "The United States"
NK_WORD = "North Korea"
SK_WORD = "South Korea"

similar_words = {
    # Change to "The United States"
    "U.S.": US_WORD,
    "US": US_WORD,
    "USA": US_WORD,
    "United States": US_WORD,
    "United States'": US_WORD,
    "The United States'": US_WORD,
    
    # Change to "North Korea"
    "NK": NK_WORD,
    "NK's": NK_WORD,
    "N. Korea": NK_WORD,
    "N. Korea's": NK_WORD,
    "North Korea's": NK_WORD,
    
    # Change to "South Korea"
    "SK": SK_WORD,
    "SK's": SK_WORD,
    "S. Korea": SK_WORD,
    "S. Korea's": SK_WORD,
    "South Korea's": SK_WORD,
    
    # Change to "The United Nations"
    "United Nations": UN_WORD,
    "United Nations'": UN_WORD,
    "The United Nations'": UN_WORD,
    "UN": UN_WORD,
}

### Transform function
def text_cleaning(s: str):
        
    def replace_strange_char(s: str):
        non_en_chars = {
            "’": "'",
            "‘": "'"
        }

        def remove_non_en_chars(txt):
            # remove non english characters
            txt = convert_latin_chars(txt)
            for char in non_en_chars.keys():
                txt = re.sub(char, non_en_chars[char], txt)
            txt = re.sub(r'[^\x00-\x7F]+', ' ', txt)
            return txt

        def convert_latin_chars(txt):
            # convert latin characters
            return ''.join(char for char in unicodedata.normalize('NFKD', txt) if unicodedata.category(char) != 'Mn')

        s = remove_non_en_chars(s)
        s = convert_latin_chars(s)
        return s
    s = replace_strange_char(s)
    for key,value in similar_words.items():
        s = re.sub(key, value, s)
    return s

def spacy_tokenizer(s: str):
    # Change similar terms to the same term
    new_str = text_cleaning(s)
    doc = nlp(s)
    # Group tokens
    matcher = Matcher(nlp.vocab)
    token_groupup_pattern = [
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "nations"}],
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "states"}],
        [{"LOWER": "north"}, {"LOWER": "korea"}],
        [{"LOWER": "south"}, {"LOWER": "korea"}],
    ]
    matcher.add("TermGroup",token_groupup_pattern)
    matches = matcher(doc)
    merge_doc = []
    for nid, start, end in matches:
        merge_doc.append((start,end))
    with doc.retokenize() as retokenizer:
        for i in range(len(merge_doc)-1,-1,-1):
            retokenizer.merge(doc[merge_doc[i][0]:merge_doc[i][1]])
        
    # Remove all stopword, punctuation, number
    tokens = [ token.lemma_.lower() for token in doc \
              if not token.is_stop and not token.is_punct and not token.like_num and token.lemma_.strip()!= '']
    return tokens


In [5]:
### Make TF-IDF matrix
def tfidf_embed(documents, dimension=None):
    # documents: list of str
    # dim: integer
    embeddings_dict = {}
    tfidf_vectorizer = TfidfVectorizer(input='content', tokenizer=spacy_tokenizer)
    tfidf_vector = tfidf_vectorizer.fit_transform(documents)
    
    # Dimensionality Reduction
    if dimension is not None:
        svd_doc = TruncatedSVD(n_components=dimension, n_iter=5, random_state=42)
        tfidf_vector = svd_doc.fit_transform(tfidf_vector)
    return tfidf_vector

### Make GloVe matrix
glove_file = "../glove.42B.300d.txt"
def glove_word_vector():
    embeddings_dict = {}
    with open(glove_file, 'r') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
    return embeddings_dict

# Average sum of word vectors
def sentence_embed(sentence, word_vectors, dimension):
    sum_vector = np.zeros(dimension)
    for w in sentence.split():
        if w in word_vectors:
            sum_vector += word_vectors[w]
    return sum_vector/len(sentence)

# Make document vector
def document_embed(documents, embedding_technique='tfidf', dimension=None):
    if embedding_technique=='tfidf':
        doc_vector = tfidf_embed(documents, dimension)
    elif embedding_technique=='glove':
        word_vector = glove_word_vector()
        if dimension is None:
            dimension = 300
        doc_vector = [ sentence_embed(s, word_vector, dimension).tolist() for s in documents ]
    elif embedding_technique=='spacy':
        doc_vector = [doc.vector for doc in documents]
    
    return doc_vector

In [6]:
### Clustering 
def document_clustering(doc_vectors, clustering_method='kmeans', evaluate=False):
    if clustering_method=='kmeans':
        # Hyperparameters
        k_event = 10000
        k_issue = 6000
        
        # Clustering event
        kmeans_event = KMeans(n_clusters=k_event, random_state=69).fit(doc_vectors)
        # Represent each event by average sum of related news
        event_vectors = np.zeros((k_event, doc_vectors.shape[1]))
        for i in range(k_event):
            event_vectors[i] = sum(doc_vectors[kmeans_event.labels_ == i])
        
        # Clustering issue
        kmeans_issue = KMeans(n_clusters=k_issue, random_state=69).fit(event_vectors)
        # Represent each issue by average sum of related news
        issue_vectors = np.zeros((k_issue, doc_vectors.shape[1]))
        for i in range(k_issue):
            issue_vectors[i] = sum(event_vectors[kmeans_issue.labels_ == i])

        issue_labels = np.array([ kmeans_issue.labels_[kmeans_event.labels_[i]] for i in range(doc_vectors.shape[0]) ])
        
        return k_issue, k_event, issue_labels, kmeans_event.labels_
    
    elif clustering_method=='DBSCAN':
        
        # Hyperparameters
        doc_eps = 0.19
        doc_neighbors = 1
        event_eps = 0.50
        event_neighbors = 1
        
        '''
            Find best doc_eps and event_eps
        '''
        if evaluate:
            # Find best eps to group same document
            doc_eps_list = [ 0.10 + 0.001*i for i in range(1,301) ]
            doc_score = []
            doc_event = []
            doc_best_score = 0
            doc_best_eps = 0.0001
            for doc_eps in doc_eps_list:
                # Clustering event
                db_event = DBSCAN(eps=doc_eps, min_samples=doc_neighbors).fit(doc_vectors)
                # Number of clusters in labels, ignoring noise if present.
                n_events_ = len(set(db_event.labels_)) - (1 if -1 in db_event.labels_ else 0)
                if len(set(db_event.labels_)) >= 2 and len(set(db_event.labels_)) <= len(doc_vectors)-1:
                    score_ = silhouette_score(doc_vectors, db_event.labels_)
                else:
                    score_ = -1
                doc_event.append(n_events_)
                doc_score.append(score_)
                if score_ > doc_best_score:
                    doc_best_score = score_
                    doc_best_eps = doc_eps
            print("Best Silhouete score is {} at eps: {} and number of events: {}".format(doc_best_score, doc_eps, n_events_))
            fig = plt.figure()
            plt.plot(doc_eps_list, doc_score)
            fig.suptitle('Doc eps and Silhouette score', fontsize=20)
            plt.xlabel('eps', fontsize=18)
            plt.ylabel('Silhouette score', fontsize=16)
            plt.show()
            
            fig = plt.figure()
            plt.plot(doc_eps_list, doc_event)
            fig.suptitle('Doc eps and number of events', fontsize=20)
            plt.xlabel('eps', fontsize=18)
            plt.ylabel('number of events', fontsize=16)
            plt.show()
            
            # Set doc_eps to the best value
            doc_eps = doc_best_eps
            # Find best eps to group same event
            # Clustering event
            db_event = DBSCAN(eps=doc_eps, min_samples=doc_neighbors).fit(doc_vectors)
            n_events_ = len(set(db_event.labels_)) - (1 if -1 in db_event.labels_ else 0)
            event_labels = np.array(list(map(lambda x: n_events_ if x==-1 else x, db_event.labels_)))
            event_vectors = np.zeros((n_events_, doc_vectors.shape[1]))
            for i in range(n_events_+1):
                if np.sum(event_labels == i) != 0:
                    event_vectors[i] = np.sum(doc_vectors[event_labels == i], axis=0)/np.sum(event_labels == i)
            
            
#             # Clustering issue
#             event_eps_list = [ 0.2 + 0.001*i for i in range(1,401) ]
#             event_score = []
#             event_issue = []
#             event_best_score = 0
#             event_best_eps = 0.001
#             for event_eps in event_eps_list:
#                 db_issue = DBSCAN(eps=event_eps, min_samples=event_neighbors).fit(event_vectors)
#                 # Number of clusters in labels, ignoring noise if present.
#                 n_issues_ = len(set(db_issue.labels_)) - (1 if -1 in db_issue.labels_ else 0)
#                 if len(set(db_issue.labels_)) >= 2 and len(set(db_issue.labels_)) <= len(event_vectors)-1:
#                     score_ = silhouette_score(event_vectors, db_issue.labels_)
#                 else:
#                     score_ = -1
#                 event_issue.append(n_issues_)
#                 event_score.append(score_)
#                 if score_ > event_best_score:
#                     event_best_score = score_
#                     event_best_eps = event_eps
#             print("Best Silhouete score is {} at eps: {} and number of issues: {}".format(event_best_score, event_eps, n_issues_))
#             fig = plt.figure()
#             plt.plot(event_eps_list, event_score)
#             fig.suptitle('Event eps and Silhouette score', fontsize=20)
#             plt.xlabel('eps', fontsize=18)
#             plt.ylabel('Silhouette score', fontsize=16)
#             plt.show()
            
#             fig = plt.figure()
#             plt.plot(event_eps_list, event_issue)
#             fig.suptitle('Event eps and number of issues', fontsize=20)
#             plt.xlabel('eps', fontsize=18)
#             plt.ylabel('number of issues', fontsize=16)
#             plt.show()
            
            # Set event_eps to best value
            event_eps = 0.5
            # Clustering issue
            db_issue = DBSCAN(eps=event_eps, min_samples=event_neighbors).fit(event_vectors)
            # Number of clusters in labels, ignoring noise if present.
            n_issues_ = len(set(db_issue.labels_)) - (1 if -1 in db_issue.labels_ else 0)
            n_noise_ = list(db_issue.labels_).count(-1)
            # Represent each issue by average sum of related news
            issue_labels = np.array(list(map(lambda x: n_issues_ if x==-1 else x, db_issue.labels_)))
            issue_vectors = np.zeros((n_issues_, doc_vectors.shape[1]))
            for i in range(n_issues_+1):
                if np.sum(issue_labels == i) != 0:
                    issue_vectors[i] = np.sum(event_vectors[issue_labels == i], axis=0)/np.sum(issue_labels == i)
       
            issue_labels = np.array([ issue_labels[event_labels[i]] for i in range(doc_vectors.shape[0]) ])
        
        else:
            '''
            Clustering using specific value
            '''
            # Clustering event
            db_event = DBSCAN(eps=doc_eps, min_samples=doc_neighbors).fit(doc_vectors)
            # Number of clusters in labels, ignoring noise if present.
            n_events_ = len(set(db_event.labels_)) - (1 if -1 in db_event.labels_ else 0)
            n_noise_ = list(db_event.labels_).count(-1)
            print("1st cluster:\n\tThe number of cluster is {}".format(n_events_))
            # Represent each event by average sum of related news
            event_labels = np.array(list(map(lambda x: n_events_ if x==-1 else x, db_event.labels_)))
            event_vectors = np.zeros((n_events_, doc_vectors.shape[1]))
            for i in range(n_events_+1):
                if np.sum(event_labels == i) != 0:
                    event_vectors[i] = np.sum(doc_vectors[event_labels == i], axis=0)/np.sum(event_labels == i)

            # Clustering issue
            db_issue = DBSCAN(eps=event_eps, min_samples=event_neighbors).fit(event_vectors)
            # Number of clusters in labels, ignoring noise if present.
            n_issues_ = len(set(db_issue.labels_)) - (1 if -1 in db_issue.labels_ else 0)
            n_noise_ = list(db_issue.labels_).count(-1)
            print("2nd cluster:\n\tThe number of cluster is {}".format(n_issues_))
            # Represent each issue by average sum of related news
            issue_labels = np.array(list(map(lambda x: n_issues_ if x==-1 else x, db_issue.labels_)))
            issue_vectors = np.zeros((n_issues_, doc_vectors.shape[1]))
            for i in range(n_issues_+1):
                if np.sum(issue_labels == i) != 0:
                    issue_vectors[i] = np.sum(event_vectors[issue_labels == i], axis=0)/np.sum(issue_labels == i)
        
            issue_labels = np.array([ issue_labels[event_labels[i]] for i in range(doc_vectors.shape[0]) ])
        
        return n_issues_, n_events_, issue_labels, event_labels
    
    elif clustering_method=='agglomerative':
        # Hyperparameters
        n_events = 10000
        n_issues = 6000
        
        # Clustering event
        agg_event = AgglomerativeClustering(distance_threshold=0, n_clusters=n_events).fit(doc_vectors)
        # Represent each event by average sum of related news
        event_vectors = np.zeros((n_events, doc_vectors.shape[1]))
        for i in range(n_events):
            event_vectors[i] = sum(doc_vectors[agg_event.labels_ == i])
        
        plt.title("Hierarchical Clustering Dendrogram")
        # plot the top three levels of the dendrogram
        plot_dendrogram(agg_event, truncate_mode="level", p=3)
        plt.xlabel("Number of points in node (or index of point if no parenthesis).")
        plt.show()
        
        # Clustering issue
        agg_issue = AgglomerativeClustering(distance_threshold=0, n_clusters=n_issues).fit(event_vectors)
        # Represent each issue by average sum of related news
        issue_vectors = np.zeros((n_issues, doc_vectors.shape[1]))
        for i in range(n_issues):
            issue_vectors[i] = sum(event_vectors[agg_issue.labels_ == i])

        issue_labels = np.array([ agg_issue.labels_[agg_event.labels_[i]] for i in range(doc_vectors.shape[0]) ])
        
        return agg_issue, agg_event, issue_labels, agg_event.labels_
    
    elif clustering_method=='LDA':
        
        pass
    
    else:
        assert("Doesn't support {}".format(clustering_method))   

In [13]:
# tfidf_doc2017_vectors = document_embed(df2017['agg_title_body'], embedding_technique='tfidf', dimension=300)
# tfidf_doc2016_vectors = document_embed(df2016['agg_title_body'], embedding_technique='tfidf', dimension=300)
# tfidf_doc2015_vectors = document_embed(df2015['agg_title_body'], embedding_technique='tfidf', dimension=300)
# joblib.dump(tfidf_doc2017_vectors,'../data/tfidf_titlebody_2017.csv')
# joblib.dump(tfidf_doc2016_vectors,'../data/tfidf_titlebody_2016.csv')
# joblib.dump(tfidf_doc2015_vectors,'../data/tfidf_titlebody_2015.csv')

['../data/tfidf_titlebody_2015.csv']

In [14]:
# Load data
tfidf_doc2017_vectors = joblib.load('../data/tfidf_titlebody_2017.csv')
tfidf_doc2016_vectors = joblib.load('../data/tfidf_titlebody_2016.csv')
tfidf_doc2015_vectors = joblib.load('../data/tfidf_titlebody_2015.csv')

In [15]:
tfidf_doc2015_num_issue, tfidf_doc2015_num_event, tfidf_doc2015_issue_labels, tfidf_doc2015_event_labels = document_clustering(tfidf_doc2015_vectors, clustering_method='DBSCAN', evaluate=False)

1st cluster:
	The number of cluster is 6919
2nd cluster:
	The number of cluster is 1190


In [16]:
tfidf_doc2016_num_issue, tfidf_doc2016_num_event, tfidf_doc2016_issue_labels, tfidf_doc2016_event_labels = document_clustering(tfidf_doc2016_vectors, clustering_method='DBSCAN', evaluate=False)

1st cluster:
	The number of cluster is 7335
2nd cluster:
	The number of cluster is 1264


In [17]:
tfidf_doc2017_num_issue, tfidf_doc2017_num_event, tfidf_doc2017_issue_labels, tfidf_doc2017_event_labels = document_clustering(tfidf_doc2017_vectors, clustering_method='DBSCAN', evaluate=False)

1st cluster:
	The number of cluster is 9000
2nd cluster:
	The number of cluster is 1547


In [18]:
print("Number of document in 2017: {} / {} clusters".format(len(tfidf_doc2017_vectors), tfidf_doc2017_num_event))
print("Number of document in 2016: {} / {} clusters".format(len(tfidf_doc2016_vectors), tfidf_doc2016_num_event))
print("Number of document in 2015: {} / {} clusters".format(len(tfidf_doc2015_vectors), tfidf_doc2015_num_event))

Number of document in 2017: 9128 / 9000 clusters
Number of document in 2016: 7485 / 7335 clusters
Number of document in 2015: 7156 / 6919 clusters


In [19]:
from collections import Counter
counter_2017_event = Counter(tfidf_doc2017_event_labels)
counter_2016_event = Counter(tfidf_doc2016_event_labels)
counter_2015_event = Counter(tfidf_doc2015_event_labels)

In [20]:
top10_event_2017 = counter_2017_event.most_common(10)
top10_event_2016 = counter_2016_event.most_common(10)
top10_event_2015 = counter_2015_event.most_common(10)

In [23]:
import yake
kw_extractor = yake.KeywordExtractor()
language = "en"
max_ngram_size = 4
deduplication_threshold = 0.9
numOfKeywords = 5

In [24]:
for index, (event_id, num_event) in enumerate(top10_event_2017):
    text_list = df2017[tfidf_doc2017_event_labels == event_id]['agg_title_body'].tolist()
    tokens = spacy_tokenizer(' '.join(text_list))
    text = ' '.join(tokens)
    print("2017: event {} with total {} events".format(event_id, num_event))
    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
    keywords = custom_kw_extractor.extract_keywords(text)
    for kw in keywords:
        print(kw)

2017: event 67 with total 11 events
('range c c return', -0.03593167801496952)
('range c c. yonhap', -0.023620655108837686)
('minus c c. freezing', -0.012916800744155098)
('minus c c low', -0.009976584174158016)
('minus c c cold', -0.006944431600824337)
2017: event 617 with total 7 events
('brown long eared bat', 1.3080837792958838e-06)
('skin impression sauropod dinosaur', 4.130366357793372e-06)
('impression sauropod dinosaur footprint', 4.731878946092821e-06)
('large skin impression sauropod', 5.119234751513881e-06)
('long eared bat find', 6.4363671707718436e-06)
2017: event 3708 with total 5 events
('percent year early report', 0.0004503729419082002)
('year early report show', 0.0005080550969691897)
('percent year early number', 0.0005625446324858537)
('percent tally year early', 0.0005781321142472548)
('year early accord datum', 0.000682364252266683)
2017: event 4735 with total 5 events
('group south korean scientist', 1.639719649119828e-06)
('south korean scientist develop', 2.395

In [25]:
for index, (event_id, num_event) in enumerate(top10_event_2016):
    text_list = df2016[tfidf_doc2016_event_labels == event_id]['agg_title_body'].tolist()
    tokens = spacy_tokenizer(' '.join(text_list))
    text = ' '.join(tokens)
    print("2016: event {} with total {} events".format(event_id, num_event))
    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
    keywords = custom_kw_extractor.extract_keywords(text)
    for kw in keywords:
        print(kw)

2016: event 16 with total 5 events
('north korea state radio', 0.0015266477311652229)
('resume broadcast mysterious number', 0.001565467269316015)
('resume encrypt number broadcast', 0.0016716454876568127)
('spy operate south korea', 0.0016796182808237845)
('korea state radio station', 0.001739900331525132)
2016: event 518 with total 4 events
('report possible ai outbreak', -5.387900363418817)
('highly pathogenic aviation influenza', 0.0015295464328062567)
('haenam south jeolla province', 0.001744719081315853)
('chicken farm haenam south', 0.001909884689482229)
('highly pathogenic bird flu', 0.0020454707622211575)
2016: event 5095 with total 4 events
('turnout usually mean young', -0.04279908574965713)
('election voter turnout record', 3.506989294504152e-06)
('general election voter turnout', 3.6177306509450238e-06)
('voter turnout general election', 3.6177306509450246e-06)
('turnout general election estimate', 4.561081686934742e-06)
2016: event 431 with total 3 events
('seong ju tell 

In [26]:
for index, (event_id, num_event) in enumerate(top10_event_2015):
    text_list = df2015[tfidf_doc2015_event_labels == event_id]['agg_title_body'].tolist()
    tokens = spacy_tokenizer(' '.join(text_list))
    text = ' '.join(tokens)
    print("2015: event {} with total {} events".format(event_id, num_event))
    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
    keywords = custom_kw_extractor.extract_keywords(text)
    for kw in keywords:
        print(kw)

2015: event 2957 with total 6 events
('country see new case', -0.010565646044537904)
('rate percent south korea', 1.6534229134971894e-06)
('rate stand percent people', 1.6968917291301697e-06)
('day disease diagnose mers', 1.90129440690986e-06)
('percent south korea fatality', 1.916076676240595e-06)
2015: event 132 with total 5 events
('percent year follow percent', 0.00013664755882343462)
('bear percent month year', 0.00017117455907262021)
('percent year drop follow', 0.00018955922174981044)
('year drop follow percent', 0.00018955922174981044)
('drop percent year follow', 0.00018955922174981044)
2015: event 5039 with total 4 events
('positive way say financial', -0.36843413479517373)
('fsc say new loan', -0.22535548844226266)
('commission say new loan', -0.22215715284915521)
('long term fix rate', 3.9005679739970884e-05)
('term fix rate loan', 4.631372751020724e-05)
2015: event 5457 with total 4 events
('apology sex slavery issue', 0.0003796920941752618)
('talk sex slavery major', 0.00