In [1]:
import numpy as np 
import pandas as pd
import os
import re

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import spacy
from spacy.matcher import Matcher

from scipy.spatial.distance import jensenshannon

import joblib

from IPython.display import HTML, display

from ipywidgets import interact, Layout, HBox, VBox, Box
import ipywidgets as widgets
from IPython.display import clear_output

from tqdm import tqdm
from os.path import isfile

import unicodedata
import seaborn as sb
import matplotlib.pyplot as plt

In [2]:
data_dir = 'data/'
filename_prefix = 'koreaherald_1517_'
n_data_files = 8  # shards are named <prefix>0.json ... <prefix>7.json

# Read every shard into its own frame, then stack them under one fresh
# 0..n-1 index. ignore_index=True replaces the earlier reset_index/drop('index')
# pair and never creates a throw-away 'index' column.
raw_frames = [
    pd.read_json(os.path.join(data_dir, filename_prefix + str(i) + '.json'))
    for i in range(n_data_files)
]
df = pd.concat(raw_frames, ignore_index=True)

# Strip surrounding whitespace from every column label.
df = df.rename(columns=str.strip)

# Load preprocessed data
# NOTE(review): assigning a whole DataFrame to one column only works when the
# CSV has exactly one column and its rows line up with df's index -- confirm.
df['summarized_body'] = pd.read_csv('summarized_data.csv')

df.shape

(23769, 7)

In [3]:
# spaCy pipeline used by spacy_tokenizer; loaded once because loading is slow.
nlp = spacy.load("en_core_web_lg")

### Change similar words to the same word
UN_WORD = "The United Nations"
US_WORD = "The United States"
NK_WORD = "North Korea"
SK_WORD = "South Korea"

# Alias -> canonical name. Aliases are matched literally, case-sensitively and
# only as whole terms (see _compile_alias_pattern).
# NOTE(review): "SK" also appears in company names such as "SK Telecom";
# confirm that mapping it to "South Korea" is really wanted.
similar_words = {
    # Change to "The United States"
    "U.S.": US_WORD,
    "US": US_WORD,
    "USA": US_WORD,
    "United States": US_WORD,
    "United States'": US_WORD,
    "The United States'": US_WORD,

    # Change to "North Korea"
    "NK": NK_WORD,
    "NK's": NK_WORD,
    "N. Korea": NK_WORD,
    "N. Korea's": NK_WORD,
    "North Korea's": NK_WORD,

    # Change to "South Korea"
    "SK": SK_WORD,
    "SK's": SK_WORD,
    "S. Korea": SK_WORD,
    "S. Korea's": SK_WORD,
    "South Korea's": SK_WORD,

    # Change to "The United Nations"
    "United Nations": UN_WORD,
    "United Nations'": UN_WORD,
    "The United Nations'": UN_WORD,
    "UN": UN_WORD,
}

# Typographic quotes that have a plain-ASCII equivalent worth keeping.
_ASCII_QUOTES = {
    "\u2019": "'",  # right single quotation mark
    "\u2018": "'",  # left single quotation mark
}


def _compile_alias_pattern(aliases):
    """Build one regex matching any alias in `aliases` as a whole term.

    * Aliases are escaped, so "U.S." matches only the literal dots.
    * Longer aliases come first, so "USA" wins over "US" and "North Korea's"
      wins over a shorter alias at the same position.
    * The look-arounds reject matches glued to letters or digits, so "US"
      inside "RUSSIA" or "UN" inside "UNESCO" is left alone.
    * An optional leading article is captured so that "the US" / "The United
      States" do not end up as "the The United States".
    """
    longest_first = sorted(aliases, key=len, reverse=True)
    alternatives = "|".join(re.escape(alias) for alias in longest_first)
    return re.compile(
        r"(?<![A-Za-z0-9])(?P<article>[Tt]he\s+)?(?P<alias>"
        + alternatives
        + r")(?![A-Za-z0-9])"
    )


# Compiled once; re-run this cell after editing `similar_words`.
_SIMILAR_WORDS_PATTERN = _compile_alias_pattern(similar_words)


def _canonical_term(match):
    """Return the replacement text for one alias match."""
    canonical = similar_words[match.group("alias")]
    if canonical.startswith("The "):
        # The canonical form brings its own article; swallow the matched one.
        return canonical
    return (match.group("article") or "") + canonical


def _to_ascii(txt):
    """Strip accents, straighten curly quotes, blank out other non-ASCII runs."""
    # NFKD splits accented letters into base letter + combining mark ('Mn');
    # dropping the marks keeps the base letter.
    txt = ''.join(
        char for char in unicodedata.normalize('NFKD', txt)
        if unicodedata.category(char) != 'Mn'
    )
    for fancy, plain in _ASCII_QUOTES.items():
        txt = txt.replace(fancy, plain)
    # Each remaining run of non-ASCII characters becomes a single space.
    return re.sub(r'[^\x00-\x7F]+', ' ', txt)


### Transform function
def text_cleaning(s: str):
    """Normalise `s` to ASCII and rewrite country/organisation aliases.

    Characters are normalised first so that aliases written with a curly
    apostrophe (e.g. "North Korea\u2019s") are recognised. All aliases are then
    replaced in a single pass, so text inserted by one replacement is never
    re-matched by another alias.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        ASCII-only text with every alias from `similar_words` replaced by its
        canonical name.
    """
    s = _to_ascii(s)
    return _SIMILAR_WORDS_PATTERN.sub(_canonical_term, s)

def spacy_tokenizer(s: str):
    """Tokenise `s` into lower-cased lemmas for use as a vectorizer tokenizer.

    Steps:
      1. Clean the text with `text_cleaning`.
      2. Parse the cleaned text with the module-level spaCy pipeline `nlp`.
      3. Merge the multi-word terms "the united nations", "the united states",
         "north korea" and "south korea" (case-insensitive) into single tokens.
      4. Drop stop words, punctuation, number-like tokens and empty lemmas.

    Parameters
    ----------
    s : str
        Raw document text.

    Returns
    -------
    list of str
        Lower-cased lemmas of the tokens that were kept.
    """
    # Change similar terms to the same term
    cleaned = text_cleaning(s)
    # Parse the cleaned text. Previously the raw `s` was parsed, which silently
    # discarded the result of text_cleaning.
    doc = nlp(cleaned)

    # Group tokens
    matcher = Matcher(nlp.vocab)
    token_groupup_pattern = [
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "nations"}],
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "states"}],
        [{"LOWER": "north"}, {"LOWER": "korea"}],
        [{"LOWER": "south"}, {"LOWER": "korea"}],
    ]
    matcher.add("TermGroup", token_groupup_pattern)
    # Resolve all spans before retokenizing; the merges are queued inside the
    # context manager.
    spans_to_merge = [doc[start:end] for _, start, end in matcher(doc)]
    with doc.retokenize() as retokenizer:
        for span in spans_to_merge:
            retokenizer.merge(span)

    # Remove all stopword, punctuation, number
    tokens = [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and not token.like_num
        and token.lemma_.strip() != ''
    ]
    return tokens

In [4]:
# Bag-of-words over article titles; terms must occur in at least 2 titles.
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, min_df=2)
all_texts = df['title']
# tqdm only wraps the iterable to show progress.
data_vectorized = vectorizer.fit_transform(tqdm(all_texts))

# most frequent words
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

word_count = pd.DataFrame({
    'word': feature_names,
    # Column sums of the document-term matrix = total occurrences per term.
    'count': np.asarray(data_vectorized.sum(axis=0))[0],
})

# Top 20 terms, re-sorted ascending so the largest bar is drawn at the top.
(
    word_count
    .sort_values('count', ascending=False)
    .set_index('word')[:20]
    .sort_values('count', ascending=True)
    .plot(kind='barh')
)

  6%|████▋                                                                         | 1434/23769 [00:19<04:56, 75.38it/s]


KeyboardInterrupt: 

In [None]:
# Persist the fitted vectorizer and the document-term matrix.
# Despite the .csv suffix these files are written by joblib, not as CSV text.
joblib.dump(value=vectorizer, filename='vectorizer_title.csv')
joblib.dump(value=data_vectorized, filename='title_vectorized.csv')

In [5]:
# Restore a previously saved count vectorizer and its document-term matrix.
# NOTE(review): these paths are not the ones the title vectorizer is dumped to
# ('vectorizer_title.csv' / 'title_vectorized.csv') -- confirm which artefacts
# the following cells are meant to use.
# joblib.load unpickles its input, so only load files you trust.
vectorizer, data_vectorized = [
    joblib.load(path) for path in ('vectorizer.csv', 'data_vectorized.csv')
]

In [25]:
N_TOPICS = 100  # number of LDA topics fitted per year


def fit_yearly_lda(year, n_components=N_TOPICS, random_state=0):
    """Fit an LDA model on the documents of one year and save it with joblib.

    A document belongs to `year` when str(year) < df['time'] < str(year + 1).
    Rows of `data_vectorized` are selected with that mask, so they must be in
    the same order as the rows of `df`.

    Parameters
    ----------
    year : int
        Year to fit, e.g. 2015.
    n_components : int, default N_TOPICS
        Number of topics.
    random_state : int, default 0
        Seed passed to LatentDirichletAllocation.

    Returns
    -------
    LatentDirichletAllocation
        The fitted model, also dumped to 'lda_title_<year>.csv'.
    """
    in_year = ((df['time'] > str(year)) & (df['time'] < str(year + 1))).to_numpy()
    model = LatentDirichletAllocation(n_components=n_components, random_state=random_state)
    model.fit(data_vectorized[in_year])
    joblib.dump(model, 'lda_title_%d.csv' % year)
    return model


lda2015 = fit_yearly_lda(2015)
lda2016 = fit_yearly_lda(2016)
lda2017 = fit_yearly_lda(2017)

['lda_title_2017.csv']

In [26]:
# Restore the three per-year LDA models saved with joblib.
lda2015, lda2016, lda2017 = [
    joblib.load(path)
    for path in ('lda_title_2015.csv', 'lda_title_2016.csv', 'lda_title_2017.csv')
]

In [27]:
def print_top_words(model, vectorizer, n_top_words):
    """Print the highest-weighted terms of every topic in `model`.

    Parameters
    ----------
    model : object with a `components_` attribute
        One row of term weights per topic (e.g. a fitted
        LatentDirichletAllocation).
    vectorizer : object exposing `get_feature_names_out()` or, on older
        scikit-learn versions, `get_feature_names()`
        Supplies the term belonging to each column of `components_`.
    n_top_words : int
        Number of terms to print per topic.

    Prints one "Topic #<i>: w1 w2 ..." line per topic, each preceded by a
    blank line, followed by one final blank line. Returns None.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old name on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed tail slice yields the indices of
        # the n_top_words largest weights, largest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        message = "\nTopic #%d: " % topic_idx
        message += " ".join(feature_names[i] for i in top_indices)
        print(message)
    print()

In [28]:
# Show the 25 strongest terms of each topic in the 2015 model.
print_top_words(model=lda2015, vectorizer=vectorizer, n_top_words=25)


Topic #0: wage worker percent pay job $ park minimum korean north march approval employee company raise government won firm week poll month tax rating approve seoul

Topic #1: china state u.s secretary say dispute beijing kerry russel sea assistant john official washington call territorial south korea daniel south change run international tension security resolution

Topic #2: say korean north korea report shin royce foreign u.n special rep commission dong ed marzuki darusman inquiry south north commander right holiday hyuk introduce relevant detail

Topic #3: labor reform government public pension market system push say civil plan servant management union service business peak call deal measure sector committee ruling come thursday

Topic #4: kim north jong leader un korean say young il official late visit north korea sung yong power military execute take pyongyang agency party country hyon year

Topic #5: park president hye geun korean call south korea south say change leader countr

In [None]:
# Transform the document-term matrix with the LDA model, wrap the result in a
# DataFrame and write it to CSV.
# NOTE(review): `lda` is not assigned in any cell visible here (only lda2015,
# lda2016 and lda2017 are) -- on a fresh kernel this raises NameError. Confirm
# which model is intended.
doc_topic_dist = pd.DataFrame(lda.transform(data_vectorized))
doc_topic_dist.to_csv('doc_topic_dist.csv', index=False)

In [None]:
def get_k_nearest_docs(doc_dist, year, k=5, get_dist=False):
    '''
    Find the k articles whose topic distribution is closest to `doc_dist`.

    Closeness is the Jensen-Shannon distance returned by
    scipy.spatial.distance.jensenshannon. Articles at distance exactly 0
    (e.g. the query article itself) are excluded.

    doc_dist: topic distribution (sums to 1) of one article
    year:     integer upper bound of the time window; candidates are the rows
              of `doc_topic_dist` whose df['time'] lies strictly between
              str(year - 1) and str(year)
    k:        maximum number of neighbours to return
    get_dist: when True, also return the distances

    Returns the index of the k nearest articles, or the tuple
    (index, distances) when get_dist is True. Both are empty when no article
    falls inside the window.

    NOTE(review): the bounds are kept exactly as originally written
    (year-1 < time < year), which selects the articles *before* `year`. The
    per-year LDA cell uses year < time < year+1 -- confirm which is intended.
    '''
    upper = int(year)
    # The column is 'time' (as used when fitting the per-year LDA models).
    # Compare as strings so the bounds work like they do there.
    times = df['time'].astype(str)
    # Each comparison is parenthesised: `&` binds tighter than `<` and `>`.
    in_window = (times > str(upper - 1)) & (times < str(upper))
    candidates = doc_topic_dist[in_window]

    if candidates.empty:
        # apply(axis=1) on an empty frame does not return a Series; exit early.
        nothing = pd.Series(dtype=float)
        return (nothing.index, nothing) if get_dist else nothing.index

    distances = candidates.apply(lambda row: jensenshannon(row, doc_dist), axis=1)
    nearest = distances[distances != 0].nsmallest(n=k)

    if get_dist:
        return nearest.index, nearest
    return nearest.index

In [None]:
# NOTE(review): `lda` is not assigned in any cell visible here (only lda2015,
# lda2016 and lda2017 are) -- confirm which model these diagnostics are for.

# Log-likelihood: higher is better
print("Log Likelihood: ", lda.score(data_vectorized))

# Perplexity: lower is better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda.perplexity(data_vectorized))

# See model parameters
print(lda.get_params())