In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
from string import punctuation
from string import digits
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
### Reading data
data_dir = './data/'
filename_prefix = 'koreaherald_1517'
# The corpus is split across JSON shards named <prefix>_0.json ... <prefix>_7.json
num_shards = 8
shard_paths = [
    os.path.join(data_dir, '{}_{}.json'.format(filename_prefix, shard))
    for shard in range(num_shards)
]
# Read every shard once and stack them in shard order
df = pd.concat([pd.read_json(path) for path in shard_paths])
# reset_index() gives a fresh 0..n-1 index and keeps each row's per-shard
# position as an 'index' column
df = df.reset_index()
# Strip surrounding whitespace from every column name
df = df.rename(columns=str.strip)

In [3]:
# Preview the combined frame (a bare last expression is rendered by the notebook)
df

Unnamed: 0,index,title,author,time,description,body,section
0,0,A snapshot of multiculturalism in South Korea,Lee Sun-young,2018-01-01 17:07:00,With birthrates persistently low and the senio...,With birthrates persistently low and the senio...,Social affairs
1,1,[Weekender] Korea’s dynamic 2017,Choi He-suk,2018-01-01 13:22:00,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs
2,2,People's Party members support Ahn's push for ...,Yonhap,2017-12-31 16:18:00,The leader of the center-left People's Party g...,The leader of the center-left People's Party g...,Politics
3,3,[Newsmaker] Panamanian vessel probed over susp...,Yonhap,2017-12-31 14:55:00,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea
4,4,Hong Kong ship crew questioned in S. Korea for...,AFP,2017-12-30 15:44:00,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea
...,...,...,...,...,...,...,...
23764,2765,N. Korean leader's speech arouses cautious opt...,KH디지털2,2015-01-01 13:36:00,North Korean leader Kim Jong-un's New Year's D...,North Korean leader Kim Jong-un's New Year's D...,North Korea
23765,2766,N. Korean leader open to inter-Korean summit t...,KH디지털2,2015-01-01 10:05:00,North Korean leader Kim Jong-un said Thursday ...,North Korean leader Kim Jong-un said Thursday ...,North Korea
23766,2767,Ex-U.S. envoy calls for clearer communication ...,KH디지털2,2015-01-01 09:27:00,The United States should make its thoughts on ...,The United States should make its thoughts on ...,North Korea
23767,2768,U.S. imposes sanctions on N. Korean firm,KH디지털2,2015-01-01 09:25:00,The United States has imposed sanctions on a N...,The United States has imposed sanctions on a N...,North Korea


In [4]:
### Lemmatization tool
# NOTE: despite the variable name, this is a WordNet lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()
### Change similar words to the same word
US_WORD = "USA"
NK_WORD = "North Korea"
SK_WORD = "South Korea"

# Keys are literal spellings (NOT regular expressions); values are the
# canonical form every spelling is rewritten to.
similar_words = {
    "US": US_WORD,
    "USA": US_WORD,
    "U.S.": US_WORD,
    "U.S.A.": US_WORD,
    "United State": US_WORD,
    "United States": US_WORD,
    "United States of America": US_WORD,
    "N. Korea": NK_WORD,
    "S. Korea": SK_WORD,
}


def _similar_word_regex(key: str):
    """Return the regex fragment that matches one literal `similar_words` key."""
    # Keys ending in a lowercase letter are ordinary words whose inflections
    # should still match ("N. Korea" inside "N. Korean").  Acronyms and
    # dotted abbreviations must end at a word boundary so that "US" does not
    # match inside "USE" or "USAID".
    tail = '' if key[-1:].islower() else r'(?!\w)'
    return re.escape(key) + tail


### Transform function
def transform_to_similar_sentence(s: str):
    """Rewrite every spelling listed in `similar_words` to its canonical form.

    All keys are matched as escaped literals in a single left-to-right pass:
      * text produced by a replacement is never rescanned, so "USA" is no
        longer rewritten to "USAA" by the shorter "US" key;
      * a match must start at a word boundary, so "US" inside "RUSSIA" or
        "BUSAN" is left alone;
      * longer keys are tried first, so "U.S.A." wins over "U.S." and
        "United States" wins over "United State".

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        `s` with every matched spelling replaced.
    """
    if not similar_words:
        return s
    keys_longest_first = sorted(similar_words, key=len, reverse=True)
    alternation = '|'.join(_similar_word_regex(key) for key in keys_longest_first)
    pattern = r'(?<!\w)(?:' + alternation + r')'
    # The lookaheads are zero-width, so group(0) is always exactly one key.
    return re.sub(pattern, lambda match: similar_words[match.group(0)], s)

### Preprocess function for grouping similar topic
# Translation table mapping every ASCII punctuation character to a space
_PUNCT_TO_SPACE = str.maketrans({ch: ' ' for ch in punctuation})
# Translation table deleting every ASCII digit
_DROP_DIGITS = str.maketrans('', '', digits)
# Filled on first use so the stopword corpus is read once, not once per document
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading the corpus once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_1(s: str):
    """Normalise one document into a space-joined string of lemmatised tokens.

    Steps, in order:
      1. rewrite known spelling variants via `transform_to_similar_sentence`;
      2. replace punctuation and every other non-word character with a space;
      3. delete digits;
      4. lowercase;
      5. split on whitespace, drop single-character tokens and English
         stopwords, and lemmatise what is left with `stemmer`.

    Single-character tokens are dropped at the token level.  The earlier
    regex-based removal missed adjacent single letters, missed single letters
    created by digit removal (e.g. "g20" -> "g"), and its start-of-string
    rule used `\\^`, which matches a literal caret and therefore never fired.

    Parameters
    ----------
    s : str
        Raw document text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces ("" if nothing survives).
    """
    # Change similar words to the same word
    new_str = transform_to_similar_sentence(s)
    # Remove punctuation, then any remaining non-word character (e.g. curly quotes)
    new_str = new_str.translate(_PUNCT_TO_SPACE)
    new_str = re.sub(r'\W', ' ', new_str)
    # Removing all numbers
    new_str = new_str.translate(_DROP_DIGITS)
    # Converting to Lowercase
    new_str = new_str.lower()
    # Lemmatization; remove stopwords and single characters.
    # split() already collapses runs of whitespace.
    stopwords = _english_stopwords()
    tokens = [
        stemmer.lemmatize(word)
        for word in new_str.split()
        if len(word) > 1 and word not in stopwords
    ]
    return ' '.join(tokens)

In [5]:
# Clean every article body; the function is passed directly, no lambda wrapper needed
df['preprocessed_body'] = df['body'].apply(preprocess_1)

In [6]:
# Widen column display so the cleaned text is shown up to 2000 characters per cell
pd.options.display.max_colwidth = 2000
df['preprocessed_body']

0                                                                                                                                                                                     birthrate persistently low senior population growing south korea working age population projected shrink onward backdrop steady inflow immigrant foreign resident potent factor could bring major change fabric south korean society long considered culturally ethnically homogeneous look multiculturalism growing hold although may visible yet mainstream migrant worker million foreign national residing south korea various visa scheme lion share totaling migrant worker brought china southeast asian country take job shunned educated south korean worker majority worker manufacturing noticeable rise agricultural fishery sector struggling chronic labor shortage tally statistic korea however leaf thousand undocumented foreign laborer living working without valid visa foreign wife since late international marriage risen 

In [7]:
### Make TF-IDF matrix
def tfidf_embed(documents, dimension):
    """Build TF-IDF features and reduce them with truncated SVD.

    Parameters
    ----------
    documents : iterable of str
        Pre-processed documents (e.g. df['preprocessed_body']).
    dimension : int
        Number of SVD components kept for both word and document vectors.

    Returns
    -------
    embeddings_dict : dict
        Maps each vocabulary term to its `dimension`-length vector, taken
        from the SVD of the transposed (term x document) TF-IDF matrix.
    tfidf_doc_vector : numpy.ndarray
        One `dimension`-length row per document, from the SVD of the
        (document x term) TF-IDF matrix.
    """
    # TfidfVectorizer is documented as CountVectorizer followed by TfidfTransformer
    vectorizer = TfidfVectorizer(smooth_idf=True, use_idf=True)
    tfidf_vector = vectorizer.fit_transform(documents)
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on releases that predate get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out'):
        tfidf_feature_names = vectorizer.get_feature_names_out()
    else:
        tfidf_feature_names = vectorizer.get_feature_names()

    # Dimensionality Reduction
    svd_word = TruncatedSVD(n_components=dimension, n_iter=3, random_state=42)
    svd_doc = TruncatedSVD(n_components=dimension, n_iter=3, random_state=42)
    tfidf_word_vector = svd_word.fit_transform(tfidf_vector.T)
    tfidf_doc_vector = svd_doc.fit_transform(tfidf_vector)

    # Row i of the reduced term matrix belongs to feature name i
    embeddings_dict = dict(zip(tfidf_feature_names, tfidf_word_vector))

    return embeddings_dict, tfidf_doc_vector

### Make GloVe matrix
glove_file = "../glove.42B.300d.txt"
def glove_embed(path=None):
    """Load whitespace-separated word vectors from a text file.

    Each non-blank line is expected to hold a token followed by its vector
    components.

    Parameters
    ----------
    path : str, optional
        File to read.  Defaults to the module-level `glove_file`.

    Returns
    -------
    dict
        Maps each token to a float32 numpy array of its components.
    """
    if path is None:
        path = glove_file
    embeddings_dict = {}
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Skip blank lines and lines with no vector components
            if len(values) < 2:
                continue
            embeddings_dict[values[0]] = np.asarray(values[1:], "float32")
    return embeddings_dict

# Average sum of word vectors
def sentence_embed(sentence, word_vectors, dimension):
    """Return the mean vector of the words of `sentence` found in `word_vectors`.

    Parameters
    ----------
    sentence : str
        Whitespace-separated tokens.
    word_vectors : mapping
        Token -> vector of length `dimension`.
    dimension : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        Mean of the matched word vectors, or a zero vector when no token of
        the sentence is in `word_vectors` (including the empty sentence).
    """
    sum_vector = np.zeros(dimension)
    matched = 0
    for w in sentence.split():
        if w in word_vectors:
            sum_vector += word_vectors[w]
            matched += 1
    # Divide by the number of contributing words, not len(sentence), which is
    # the number of characters; guard against 0/0 for empty or fully
    # out-of-vocabulary sentences.
    if matched == 0:
        return sum_vector
    return sum_vector / matched

# Make document vector
def document_embed(documents, embedding_technique='tfidf', dimension=300):
    """Embed every document as a fixed-length vector.

    Parameters
    ----------
    documents : iterable of str
        Pre-processed documents.
    embedding_technique : {'tfidf', 'glove'}
        'tfidf' uses the document vectors returned by `tfidf_embed`;
        'glove' averages the vectors from `glove_embed` per document via
        `sentence_embed`.
    dimension : int
        Vector length.  For 'glove' it must equal the length of the vectors
        in the GloVe file.

    Returns
    -------
    numpy.ndarray
        One row per document.

    Raises
    ------
    ValueError
        If `embedding_technique` is not one of the supported names.
    """
    if embedding_technique == 'tfidf':
        _, doc_vector = tfidf_embed(documents, dimension)
    elif embedding_technique == 'glove':
        word_vector = glove_embed()
        # Iterate over `documents` (the parameter); return an array so both
        # techniques yield the same type
        doc_vector = np.asarray(
            [sentence_embed(s, word_vector, dimension) for s in documents]
        )
    else:
        raise ValueError(
            "Unsupported embedding_technique: {!r} (expected 'tfidf' or 'glove')".format(
                embedding_technique
            )
        )

    return doc_vector

In [8]:
from datetime import datetime

# Days difference between two datetime
def days_between(d1, d2):
    """Return the number of whole days separating two timestamps.

    Parameters
    ----------
    d1, d2 : str or datetime
        Either "%Y-%m-%d %H:%M:%S" strings or datetime instances.

    Returns
    -------
    int
        Whole days between the two instants, independent of argument order.
    """
    fmt = "%Y-%m-%d %H:%M:%S"
    if not isinstance(d1, datetime):
        d1 = datetime.strptime(d1, fmt)
    if not isinstance(d2, datetime):
        d2 = datetime.strptime(d2, fmt)
    # Take abs() of the timedelta itself: a negative timedelta stores its
    # days rounded down (-12h is days=-1), so abs(delta.days) would report
    # one day too many whenever d2 is earlier than d1.
    return abs(d2 - d1).days

# Function returns number of article, number of distinct authors, section of the issue, length
def issue_indicator(news_index):
    """Summarise the articles of `df` selected by `news_index`.

    Returns a 4-tuple: the article count, the number of distinct authors,
    a section value (currently always 0), and the number of days between
    the latest and earliest article time.
    """
    article_count = len(news_index)

    selected_authors = df['author'][news_index]
    distinct_author_count = len(selected_authors.unique())

    # Section is not computed yet; 0 is a fixed placeholder
    section = 0

    selected_times = df['time'][news_index]
    latest, earliest = selected_times.max(), selected_times.min()
    span_in_days = days_between(latest, earliest)

    return article_count, distinct_author_count, section, span_in_days

In [9]:
# Embed every cleaned article with the TF-IDF technique (dimension=300)
doc_vectors = document_embed(df['preprocessed_body'], embedding_technique='tfidf', dimension=300)

In [None]:
### Clustering 
def document_clustering(documents, clustering_method='kmeans', embedding_technique='tfidf', dimension=300,
                        k_event=10000, k_issue=6000):
    """Cluster documents into events, then cluster those events into issues.

    Parameters
    ----------
    documents : sequence of str
        Pre-processed documents.
    clustering_method : str
        Only 'kmeans' is implemented.
    embedding_technique : str
        Forwarded to `document_embed`.
    dimension : int
        Forwarded to `document_embed`.
    k_event : int
        Number of event clusters (capped at the number of documents).
    k_issue : int
        Number of issue clusters (capped at the number of event clusters).

    Returns
    -------
    issue_labels : list
        Issue cluster id of every document, in document order.
    event_labels : numpy.ndarray
        Event cluster id of every document, in document order.

    Raises
    ------
    NotImplementedError
        For clustering_method='DBSCAN', which is not implemented.
    ValueError
        For any other unsupported clustering_method.
    """
    # Validate first so an unsupported method fails before the costly embedding
    if clustering_method == 'DBSCAN':
        raise NotImplementedError("DBSCAN clustering is not implemented yet")
    if clustering_method != 'kmeans':
        # A bare assert on a non-empty string always passes; raise instead
        raise ValueError("Doesn't support {}".format(clustering_method))

    doc_vectors = np.asarray(document_embed(documents, embedding_technique, dimension))

    # KMeans cannot form more clusters than it has samples
    k_event = min(k_event, doc_vectors.shape[0])
    k_issue = min(k_issue, k_event)

    # Clustering event
    kmeans_event = KMeans(n_clusters=k_event, random_state=69).fit(doc_vectors)
    event_labels = kmeans_event.labels_

    # Represent each event by the average of its related news vectors:
    # accumulate per-event sums, then divide by per-event member counts.
    event_vectors = np.zeros((k_event, doc_vectors.shape[1]))
    np.add.at(event_vectors, event_labels, doc_vectors)
    member_counts = np.bincount(event_labels, minlength=k_event)
    event_vectors /= np.maximum(member_counts, 1)[:, np.newaxis]

    # Clustering issue (k_issue clusters over the event vectors)
    kmeans_issue = KMeans(n_clusters=k_issue, random_state=69).fit(event_vectors)

    # A document's issue is the issue assigned to its event
    issue_labels = list(kmeans_issue.labels_[event_labels])

    return issue_labels, event_labels

In [40]:
### Rank issues based on significance factors
#         issues = [ np.argwhere(kmeans_issue.labels == j).reshape(-1) for j in range(k_issue) ]
#         events = [ np.argwhere(kmeans_event.labels == j).reshape(-1) for j in range(k_event) ]
        
#         issue_indicator = []
#         for i in range(k_issue):
#             related_news_index = []
#             for e in issues[i]:
#                 related_news_index += events[e]
#             issue_indicator.append((indicator_value(related_news_index), i))
        
#         issue_rank = sorted(issue_indicator, key = lambda k: (-k[0], -k[1], -k[2], -k[3]))[:10]
        
#         for i in issue_rank:
#             issue_index = i[4]
#             for event_index in issues[issue_index]:

1096