In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
from string import punctuation
from string import digits
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

import matplotlib.pyplot as plt

import spacy
from spacy.matcher import Matcher


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
### Reading data
# The corpus is stored as numbered shards: <prefix>_0.json ... <prefix>_7.json.
data_dir = './data/'
filename_prefix = 'koreaherald_1517'
num_shards = 8

shard_frames = [
    pd.read_json(os.path.join(data_dir, '{}_{}.json'.format(filename_prefix, shard)))
    for shard in range(num_shards)
]
# reset_index() moves each shard's own row labels into an 'index' column and
# gives the combined frame a fresh 0..n-1 index. Building `df` in a single
# expression (no inplace mutation) keeps this cell safe to re-run.
df = pd.concat(shard_frames).reset_index()
# Strip stray whitespace around the column names.
df = df.rename(columns=str.strip)

In [3]:
df

Unnamed: 0,index,title,author,time,description,body,section
0,0,A snapshot of multiculturalism in South Korea,Lee Sun-young,2018-01-01 17:07:00,With birthrates persistently low and the senio...,With birthrates persistently low and the senio...,Social affairs
1,1,[Weekender] Korea’s dynamic 2017,Choi He-suk,2018-01-01 13:22:00,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs
2,2,People's Party members support Ahn's push for ...,Yonhap,2017-12-31 16:18:00,The leader of the center-left People's Party g...,The leader of the center-left People's Party g...,Politics
3,3,[Newsmaker] Panamanian vessel probed over susp...,Yonhap,2017-12-31 14:55:00,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea
4,4,Hong Kong ship crew questioned in S. Korea for...,AFP,2017-12-30 15:44:00,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea
...,...,...,...,...,...,...,...
23764,2765,N. Korean leader's speech arouses cautious opt...,KH디지털2,2015-01-01 13:36:00,North Korean leader Kim Jong-un's New Year's D...,North Korean leader Kim Jong-un's New Year's D...,North Korea
23765,2766,N. Korean leader open to inter-Korean summit t...,KH디지털2,2015-01-01 10:05:00,North Korean leader Kim Jong-un said Thursday ...,North Korean leader Kim Jong-un said Thursday ...,North Korea
23766,2767,Ex-U.S. envoy calls for clearer communication ...,KH디지털2,2015-01-01 09:27:00,The United States should make its thoughts on ...,The United States should make its thoughts on ...,North Korea
23767,2768,U.S. imposes sanctions on N. Korean firm,KH디지털2,2015-01-01 09:25:00,The United States has imposed sanctions on a N...,The United States has imposed sanctions on a N...,North Korea


In [4]:
# Load the large English spaCy pipeline
nlp = spacy.load('en_core_web_lg')

# Aggregate title and content: the title (followed by '. ') is repeated
# `title_weight` times in front of the body, so title words occur more often
# in the combined text.
title_weight = 4
repeated_title = (df['title'] + '. ') * title_weight
df['agg_title_body'] = repeated_title + df['body']

In [5]:
### Lemmatization tool
# NOTE: despite its name, `stemmer` is an NLTK WordNet lemmatizer, not a stemmer.
# It is used by preprocess_manual() through its .lemmatize() method.
stemmer = WordNetLemmatizer()
### Change similar words to the same word
UN_WORD = "The United Nations"
US_WORD = "The United States"
NK_WORD = "North Korea"
SK_WORD = "South Korea"

# Alias -> canonical name. Aliases are matched literally and case-sensitively.
similar_words = {
    # Change to "The United States"
    "U.S.": US_WORD,
    "US": US_WORD,
    "USA": US_WORD,
    "United States": US_WORD,
    "United States'": US_WORD,
    "The United States'": US_WORD,

    # Change to "North Korea"
    "NK": NK_WORD,
    "NK's": NK_WORD,
    "N. Korea": NK_WORD,
    "N. Korea's": NK_WORD,
    "North Korea's": NK_WORD,

    # Change to "South Korea"
    "SK": SK_WORD,
    "SK's": SK_WORD,
    "S. Korea": SK_WORD,
    "S. Korea's": SK_WORD,
    "South Korea's": SK_WORD,

    # Change to "The United Nations"
    "United Nations": UN_WORD,
    "United Nations'": UN_WORD,
    "The United Nations'": UN_WORD,
    "UN": UN_WORD,
}


def _compile_similar_words(mapping):
    """Return ``(lookup, regex)`` used to rewrite every alias of ``mapping`` in one pass.

    * Aliases are escaped with ``re.escape`` so that e.g. the dots in "U.S."
      only match literal dots.
    * For canonical names that start with "The ", the variants "The <alias>"
      and "the <alias>" are added to the lookup so that an article already
      present in the text is consumed instead of being doubled.
    * Alternatives are ordered longest first so "USA" wins over "US" and
      "N. Korea's" over "N. Korea".
    * ``(?<!\\w)`` / ``(?!\\w)`` restrict matches to whole words, so "UN" does
      not fire inside "UNESCO". ``\\b`` is not used because some aliases end
      with "." or "'".
    """
    lookup = dict(mapping)
    for alias, canonical in mapping.items():
        if canonical.startswith("The ") and not alias.startswith("The "):
            for article in ("The ", "the "):
                lookup.setdefault(article + alias, canonical)
    alternatives = "|".join(
        re.escape(alias) for alias in sorted(lookup, key=len, reverse=True)
    )
    regex = re.compile(r"(?<!\w)(?:" + alternatives + r")(?!\w)")
    return lookup, regex


# Built once from `similar_words`; re-run this cell after editing the mapping.
_SIMILAR_LOOKUP, _SIMILAR_REGEX = _compile_similar_words(similar_words)


### Transform function
def transform_to_similar_sentence(s: str):
    """Replace every alias listed in ``similar_words`` by its canonical name.

    The text is scanned once, so text inserted by a replacement is never
    matched again (sequential substitution used to turn "The U.S." into
    "The The The United States").

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        ``s`` with whole-word aliases rewritten; all other text is unchanged.
    """
    return _SIMILAR_REGEX.sub(lambda match: _SIMILAR_LOOKUP[match.group(0)], s)

# Using spacy to preprocess
def preprocess_spacy(s: str):
    """Normalise, tokenise and lemmatise one text with the spaCy pipeline ``nlp``.

    Steps:
      1. rewrite aliases with transform_to_similar_sentence();
      2. parse the normalised text with ``nlp``;
      3. merge the multi-word names matched below into single tokens;
      4. drop stop words, punctuation and number-like tokens;
      5. join the lower-cased lemmas of the remaining tokens with spaces.

    Returns
    -------
    (new_str, tokens, doc)
        ``new_str`` is the cleaned text, ``tokens`` the list of kept spaCy
        tokens and ``doc`` the (retokenised) spaCy Doc.
    """
    # Change similar terms to the same term and parse the *normalised* text.
    # (Parsing the raw `s` here would throw the alias rewriting away.)
    normalized = transform_to_similar_sentence(s)
    doc = nlp(normalized)

    # Group tokens: each pattern describes a multi-word name to be merged.
    matcher = Matcher(nlp.vocab)
    token_groupup_pattern = [
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "nations"}],
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "states"}],
        [{"LOWER": "north"}, {"LOWER": "korea"}],
        [{"LOWER": "south"}, {"LOWER": "korea"}],
    ]
    matcher.add("TermGroup", token_groupup_pattern)
    # The merges are queued inside the context manager and applied together
    # when it exits, so the matched spans can be registered in any order.
    with doc.retokenize() as retokenizer:
        for _, start, end in matcher(doc):
            retokenizer.merge(doc[start:end])

    # Remove all stopword, punctuation, number
    tokens = [
        token for token in doc
        if not token.is_stop and not token.is_punct and not token.like_num
    ]
    new_str = ' '.join(token.lemma_.lower() for token in tokens)
    return new_str, tokens, doc

### Preprocess function for grouping similar topic
def preprocess_manual(s: str):
    """Clean one text with regexes and NLTK instead of spaCy.

    Steps: alias normalisation, punctuation removal, removal of isolated
    single letters, digit removal, lower-casing, English stop-word removal and
    WordNet lemmatisation.

    Returns
    -------
    (new_str, tokens)
        ``tokens`` is the list of lemmatised words and ``new_str`` the same
        words joined by single spaces.
    """
    # Change similar words to the same word
    new_str = transform_to_similar_sentence(s)
    # Remove punctuation: every non-word character, plus '_' (which \W does
    # not cover), becomes a space.
    new_str = re.sub(r'[\W_]', ' ', new_str)
    # Remove all single characters that stand between whitespace. Lookarounds
    # leave the surrounding whitespace in place, so runs such as "a b c" are
    # handled too (a consuming r'\s+X\s+' pattern skips every second letter).
    new_str = re.sub(r'(?<=\s)[a-zA-Z](?=\s)', '', new_str)
    # Remove a single character at the very start of the text. This also
    # covers a leading 'b' left over from a bytes repr. (The caret must not be
    # escaped: r'\^' would only match a literal '^'.)
    new_str = re.sub(r'^[a-zA-Z]\s+', '', new_str)
    # Substituting multiple spaces with single space
    new_str = re.sub(r'\s+', ' ', new_str)
    # Removing all numbers
    new_str = new_str.translate(str.maketrans('', '', digits))
    # Converting to Lowercase
    new_str = new_str.lower()
    # Lemmatization and remove stopwords (a set gives O(1) membership tests)
    stopwords = set(nltk.corpus.stopwords.words('english'))
    tokens = [stemmer.lemmatize(word) for word in new_str.split() if word not in stopwords]
    new_str = ' '.join(tokens)

    return new_str, tokens

In [6]:
# Run the spaCy preprocessing on the first 500 rows only.
first_articles = df['agg_title_body'][:500]
preprocessed_data = first_articles.apply(preprocess_spacy)
# df[['ppcm_title_body','ppcm_tokens']] = df['agg_title_body'].apply(lambda x: preprocess_manual(x))

In [7]:
# The bare string below is only an in-cell note (it has no runtime effect): it
# lists the three elements of each tuple returned by preprocess_spacy().
'''
 Preprocessed_data[0]: new text
 Preprocessed_data[1]: token
 Preprocessed_data[2]: doc
'''
# Show the tuple produced for the first article.
preprocessed_data[0]

('snapshot multiculturalism south korea snapshot multiculturalism south korea snapshot multiculturalism south korea snapshot multiculturalism south korea birthrate persistently low senior population grow south korea‘s work age population project shrink onward backdrop steady inflow immigrant foreign resident potent factor bring major change fabric south korean society long consider culturally ethnically homogeneous look multiculturalism grow hold visible mainstream migrant worker foreign national reside south korea visa scheme lion share total migrant worker bring china southeast asian country job shun educate south korean worker majority worker manufacturing noticeable rise agricultural fishery sector struggle chronic labor shortage tally statistics korea leave thousand undocumented foreign laborer live work valid visa foreign wife late 1990 international marriage rise rapidly major route immigration korea recent year pace growth moderate percent marriage involve non korean national r

In [8]:
pd.set_option('display.max_colwidth', 2000)
# Keep only the cleaned text (element 0) of every preprocessing result.
cleaned_texts = [entry[0] for entry in preprocessed_data]
# The new Series is indexed 0..n-1 and is aligned with df on assignment, so
# rows without a preprocessed text end up as NaN.
df['ppcs_title_body'] = pd.Series(cleaned_texts)
df['ppcs_title_body']

0                                                                                                                                                                                        snapshot multiculturalism south korea snapshot multiculturalism south korea snapshot multiculturalism south korea snapshot multiculturalism south korea birthrate persistently low senior population grow south korea‘s work age population project shrink onward backdrop steady inflow immigrant foreign resident potent factor bring major change fabric south korean society long consider culturally ethnically homogeneous look multiculturalism grow hold visible mainstream migrant worker foreign national reside south korea visa scheme lion share total migrant worker bring china southeast asian country job shun educate south korean worker majority worker manufacturing noticeable rise agricultural fishery sector struggle chronic labor shortage tally statistics korea leave thousand undocumented foreign laborer live w

In [9]:
### Make TF-IDF matrix
def tfidf_embed(documents, dimension):
    """Build TF-IDF word and document embeddings reduced with truncated SVD.

    Parameters
    ----------
    documents : sequence
        Sized, re-iterable collection of token sequences; every token must
        expose a ``.text`` attribute (e.g. spaCy ``Doc`` objects).
    dimension : int
        Number of SVD components kept for both word and document vectors.

    Returns
    -------
    embeddings_dict : dict
        Maps each distinct token text to its reduced word vector.
    tfidf_doc_vector : numpy.ndarray
        One reduced vector per document, in the order of ``documents``.
    """
    # Sorted vocabulary: iterating a set of strings directly gives an order
    # that can change between interpreter runs (string hashing is randomised),
    # which would shuffle the matrix columns and make the seeded SVD output
    # differ from run to run.
    vocabulary = sorted({tok.text for doc in documents for tok in doc})
    column_of = {word: col for col, word in enumerate(vocabulary)}

    # Raw term counts, one row per document. Only tokens that actually occur
    # are visited, instead of testing every vocabulary word per document.
    # NOTE(review): the matrix is dense (as before); a scipy.sparse matrix
    # would be needed to scale to a much larger corpus.
    word_count = np.zeros((len(documents), len(vocabulary)), dtype=np.int64)
    for row, doc in enumerate(documents):
        for tok in doc:
            word_count[row, column_of[tok.text]] += 1

    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_vector = tfidf_transformer.fit_transform(word_count)

    # Dimensionality Reduction: words are the rows of the transposed matrix,
    # documents the rows of the matrix itself.
    svd_word = TruncatedSVD(n_components=dimension, n_iter=3, random_state=42)
    svd_doc = TruncatedSVD(n_components=dimension, n_iter=3, random_state=42)
    tfidf_word_vector = svd_word.fit_transform(tfidf_vector.T)
    tfidf_doc_vector = svd_doc.fit_transform(tfidf_vector)

    embeddings_dict = {word: tfidf_word_vector[col, :] for word, col in column_of.items()}
    return embeddings_dict, tfidf_doc_vector

### Make GloVe matrix
glove_file = "../glove.42B.300d.txt"
def glove_embed(path=None):
    """Load word vectors from a GloVe-style text file.

    Each non-blank line is expected to be ``word v1 v2 ... vN`` separated by
    whitespace.

    Parameters
    ----------
    path : str, optional
        File to read; defaults to the module-level ``glove_file``.

    Returns
    -------
    dict
        Maps each word to a ``float32`` numpy vector.
    """
    if path is None:
        path = glove_file
    embeddings_dict = {}
    # Read as UTF-8 explicitly: without `encoding`, open() uses the platform's
    # locale encoding, which can fail or mis-decode non-ASCII words.
    # NOTE(review): assumes the GloVe file is UTF-8 encoded - confirm.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # skip blank lines instead of failing on values[0]
            embeddings_dict[values[0]] = np.asarray(values[1:], "float32")
    return embeddings_dict

# Average sum of word vectors
def sentence_embed(sentence, word_vectors, dimension):
    """Return the mean vector of the words of ``sentence`` found in ``word_vectors``.

    Parameters
    ----------
    sentence : str
        Whitespace-separated words.
    word_vectors : mapping
        Word -> vector of length ``dimension``.
    dimension : int
        Length of the vectors.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the in-vocabulary words; a zero vector when no
        word of the sentence is in ``word_vectors`` (including an empty
        sentence).
    """
    sum_vector = np.zeros(dimension)
    matched = 0
    for w in sentence.split():
        if w in word_vectors:
            sum_vector += word_vectors[w]
            matched += 1
    # Divide by the number of summed vectors. len(sentence) would be the
    # number of *characters*, and 0 for an empty string.
    if matched == 0:
        return sum_vector
    return sum_vector / matched

# Make document vector
def document_embed(documents, embedding_technique='tfidf', dimension=300):
    """Embed every document with the selected technique.

    Parameters
    ----------
    documents : sequence
        What each technique expects: token sequences for ``'tfidf'`` (passed
        to tfidf_embed), strings for ``'glove'`` (passed to sentence_embed),
        objects with a ``.vector`` attribute for ``'spacy'``.
    embedding_technique : {'tfidf', 'glove', 'spacy'}
    dimension : int
        Vector size for ``'tfidf'`` and ``'glove'``; not used by ``'spacy'``.

    Returns
    -------
    The document vectors. The container type depends on the technique: the
    array returned by tfidf_embed, a list of lists for ``'glove'``, a list of
    ``.vector`` values for ``'spacy'``.

    Raises
    ------
    ValueError
        If ``embedding_technique`` is not one of the supported names.
    """
    if embedding_technique == 'tfidf':
        _, doc_vector = tfidf_embed(documents, dimension)
    elif embedding_technique == 'glove':
        word_vector = glove_embed()
        doc_vector = [sentence_embed(s, word_vector, dimension).tolist() for s in documents]
    elif embedding_technique == 'spacy':
        doc_vector = [doc.vector for doc in documents]
    else:
        # Fail with a clear message rather than an UnboundLocalError below.
        raise ValueError(
            "Unknown embedding_technique {!r}; expected 'tfidf', 'glove' or 'spacy'".format(embedding_technique)
        )

    return doc_vector

In [10]:
from datetime import datetime

# Days difference between two datetime
def days_between(d1, d2):
    """Return the number of whole days between two points in time.

    Parameters
    ----------
    d1, d2 : str or datetime
        Strings must use the format ``"%Y-%m-%d %H:%M:%S"``; datetime objects
        are used as they are.

    Returns
    -------
    int
        Non-negative count of complete days; the argument order is irrelevant.
    """
    fmt = "%Y-%m-%d %H:%M:%S"
    if isinstance(d1, str):
        d1 = datetime.strptime(d1, fmt)
    if isinstance(d2, str):
        d2 = datetime.strptime(d2, fmt)
    # Take abs() of the timedelta *before* reading .days: a negative timedelta
    # is stored with its days rounded down (-1.5 days -> days=-2), so
    # abs(delta.days) would report one day too many.
    return abs(d2 - d1).days

# Function returns number of article, number of distinct authors, section of the issue, length
def issue_indicator(news_index):
    """Summarise the articles of one issue.

    Parameters
    ----------
    news_index : array-like
        Index labels (into the global ``df``) of the issue's articles; must
        not be empty.

    Returns
    -------
    (num_article, num_author, section, length)
        Article count, number of distinct ``author`` values, a section
        placeholder (always 0) and the number of days between the earliest and
        the latest ``time`` value.
    """
    num_article = len(news_index)
    num_author = len(df.loc[news_index, 'author'].unique())
    section = 0  # placeholder: the section is not computed
    times = df.loc[news_index, 'time']
    # Earliest timestamp first, so the difference taken inside days_between()
    # is non-negative. A negative timedelta has its .days rounded down, which
    # would add a day to the length.
    length = days_between(times.min(), times.max())
    return num_article, num_author, section, length

In [11]:
### Clustering
def document_clustering(doc_vectors, clustering_method='kmeans', k_event=10000, k_issue=6000):
    """Cluster documents into events, then events into issues.

    Parameters
    ----------
    doc_vectors : array-like, shape (n_documents, n_features)
    clustering_method : str
        Only ``'kmeans'`` is implemented.
    k_event : int
        Requested number of event clusters; capped at the number of documents.
    k_issue : int
        Requested number of issue clusters; capped at the number of events.

    Returns
    -------
    (k_issue, k_event, issue_labels, event_labels)
        The cluster counts actually used, the issue label of every document
        and the event label of every document.

    Raises
    ------
    NotImplementedError
        For ``'DBSCAN'``, which is not implemented.
    ValueError
        For any other unknown ``clustering_method``.
    """
    if clustering_method == 'kmeans':
        doc_vectors = np.asarray(doc_vectors)
        # KMeans cannot build more clusters than it has samples.
        k_event = min(k_event, doc_vectors.shape[0])
        k_issue = min(k_issue, k_event)

        # Clustering event
        kmeans_event = KMeans(n_clusters=k_event, random_state=69).fit(doc_vectors)
        event_labels = kmeans_event.labels_
        # Represent each event by the average of its news vectors. A plain sum
        # would scale with the number of articles in the event.
        event_vectors = np.zeros((k_event, doc_vectors.shape[1]))
        for event in range(k_event):
            members = doc_vectors[event_labels == event]
            if len(members):
                event_vectors[event] = members.mean(axis=0)

        # Clustering issue
        kmeans_issue = KMeans(n_clusters=k_issue, random_state=69).fit(event_vectors)
        # A document's issue is the issue of the event it belongs to.
        issue_labels = kmeans_issue.labels_[event_labels]

        return k_issue, k_event, issue_labels, event_labels

    if clustering_method == 'DBSCAN':
        # This branch used to fall through and return None.
        raise NotImplementedError("DBSCAN clustering is not implemented yet")

    # `assert("...")` on a non-empty string never fails, so raise explicitly.
    raise ValueError("Doesn't support {}".format(clustering_method))

In [None]:
### Embed document and clustering
### Doc class in spacy
# Element 2 of every preprocessing result is the spaCy Doc.
preprocessed_docs = [entry[2] for entry in preprocessed_data]
tfidf_doc_vectors = document_embed(
    preprocessed_docs,
    embedding_technique='tfidf',
    dimension=300,
)
# glove_doc_vectors = document_embed(df['ppcs_title_body'], embedding_technique='glove', dimension=300)

In [None]:
# Visualization
num_clusters = 200
pca_num_components = 2
tsne_num_components = 2
max_iterations = 300  # scikit-learn's default for KMeans

# The document vectors come out of TruncatedSVD as a dense ndarray, which has
# no .todense() method.
X = np.asarray(tfidf_doc_vectors)
# KMeans cannot build more clusters than it has samples.
num_clusters = min(num_clusters, X.shape[0])

# One random (seeded, hence reproducible) hex colour per cluster id.
color_rng = np.random.default_rng(42)
color_list = ["#{:06X}".format(value)
              for value in color_rng.integers(0, 0x1000000, size=num_clusters).tolist()]
labels_color_map = dict(enumerate(color_list))

clustering_model = KMeans(
    n_clusters=num_clusters,
    max_iter=max_iterations,
    random_state=42,
)
labels = clustering_model.fit_predict(X)

# Project onto the first two principal components and colour by cluster.
reduced_data = PCA(n_components=pca_num_components).fit_transform(X)
fig, ax = plt.subplots()
ax.scatter(
    reduced_data[:, 0],
    reduced_data[:, 1],
    c=[labels_color_map[label] for label in labels],
)
ax.set_title("K-Means clusters (PCA projection)")
ax.set_xlabel("PCA component 1")
ax.set_ylabel("PCA component 2")
plt.show()

In [None]:
# 2-D t-SNE projection of the document vectors, coloured by K-Means label.
# Relies on `X`, `labels` and `tsne_num_components` from the previous cell.
embeddings = TSNE(n_components=tsne_num_components, random_state=42)
Y = embeddings.fit_transform(np.asarray(X))
# `cmap` only takes effect when `c` is given, so pass the cluster labels.
plt.scatter(Y[:, 0], Y[:, 1], c=labels, cmap=plt.cm.Spectral)
plt.title("K-Means clusters (t-SNE projection)")
plt.show()

In [None]:
# Sorted nearest-neighbour distances of the document vectors `X`.
neighbors = NearestNeighbors(n_neighbors=20)
neighbors_fit = neighbors.fit(X)
# Query with the fitted data itself (the name `dataset` was never defined).
distances, indices = neighbors_fit.kneighbors(X)
distances = np.sort(distances, axis=0)
# Column 0 is normally each point's zero distance to itself, so column 1
# holds the distance to the closest other point.
distances = distances[:,1]
plt.plot(distances)
plt.xlabel("Documents (sorted by distance)")
plt.ylabel("Distance to nearest neighbour")
plt.show()

In [None]:
# Two-level K-Means (documents -> events -> issues) on the TF-IDF vectors.
(
    tfidf_num_issue,
    tfidf_num_event,
    tfidf_issue_labels,
    tfidf_event_labels,
) = document_clustering(tfidf_doc_vectors, clustering_method='kmeans')

In [None]:
# Visualization


In [None]:
# Function returns number of article, number of distinct authors, section of the issue, length
# NOTE(review): this redefines the issue_indicator() already defined in an
# earlier cell; keep a single definition once the notebook is cleaned up.
def issue_indicator(news_index):
    """Summarise the articles of one issue.

    Parameters
    ----------
    news_index : array-like
        Index labels (into the global ``df``) of the issue's articles; must
        not be empty.

    Returns
    -------
    (num_article, num_author, section, length)
        Article count, number of distinct ``author`` values, a section
        placeholder (always 0) and the number of days between the earliest and
        the latest ``time`` value.
    """
    num_article = len(news_index)
    num_author = len(df.loc[news_index, 'author'].unique())
    section = 0  # placeholder: the section is not computed
    times = df.loc[news_index, 'time']
    # Earliest timestamp first, so the difference taken inside days_between()
    # is non-negative. A negative timedelta has its .days rounded down, which
    # would add a day to the length.
    length = days_between(times.min(), times.max())
    return num_article, num_author, section, length

### Rank issues based on significance factors
issue_significance = []
for issue_index in range(tfidf_num_issue):
    # Positions of the articles assigned to this issue.
    news_index = np.flatnonzero(tfidf_issue_labels == issue_index)
    if news_index.size > 0:
        issue_significance.append((issue_indicator(news_index), issue_index))
# Descending by article count, then distinct authors, then length in days.
issue_significance.sort(key=lambda entry: (-entry[0][0], -entry[0][1], -entry[0][3]))
top_10_issue = [ranked_issue for _, ranked_issue in issue_significance[:10]]

In [None]:
top_10_issue