### Libs & Data

In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
import random
import unicodedata
from string import punctuation
from string import digits
from nltk.stem import WordNetLemmatizer
import joblib

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score

import matplotlib.pyplot as plt

import torch
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

import spacy
from spacy.matcher import Matcher

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kwsst\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kwsst\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the article dataset from a path relative to the notebook's working directory.
df = pd.read_csv("data/data_sum.csv")
# Display (n_rows, n_columns) as a quick sanity check of the load.
df.shape

(23769, 7)

In [3]:
# Preview the first rows to inspect the available columns.
df.head()

Unnamed: 0,title,author,time,description,body,section,summarized_body
0,A snapshot of multiculturalism in South Korea,Lee Sun-young,2018-01-01 17:07:00,With birthrates persistently low and the senio...,With birthrates persistently low and the senio...,Social affairs,"As of 2016, more than 2 million foreign nation..."
1,[Weekender] Korea’s dynamic 2017,Choi He-suk,2018-01-01 13:22:00,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs,The corruption scandal that broke out in 2016 ...
2,People's Party members support Ahn's push for ...,Yonhap,2017-12-31 16:18:00,The leader of the center-left People's Party g...,The leader of the center-left People's Party g...,Politics,"Ahn Cheol-soo, leader of the center-left Peopl..."
3,[Newsmaker] Panamanian vessel probed over susp...,Yonhap,2017-12-31 14:55:00,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea,"The 5,100-ton KOTI is being held in the wester..."
4,Hong Kong ship crew questioned in S. Korea for...,AFP,2017-12-30 15:44:00,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea,The crew of a Hong Kong-registered ship have b...


In [31]:
# Load large spacy model (used by the tokenizer and embedding helpers below)
nlp = spacy.load('en_core_web_lg')

# Aggregate title and content.
# `title_weight` repeats the "<title>. " prefix that many times (string * int),
# which gives title words proportionally more weight in bag-of-words features.
title_weight = 1
# Missing titles/bodies are treated as empty text: string concatenation with NaN
# would otherwise make the whole aggregated cell NaN, and a NaN document cannot
# be tokenized downstream.
df['agg_title_body'] = title_weight*(df['title'].fillna('')+'. ') + df['body'].fillna('')

In [30]:
# Ad-hoc inspection: rows whose body contains the possessive form "Park Geun-hye's".
# na=False makes rows with a missing body count as non-matching instead of NaN.
df[df['body'].str.contains("Park Geun-hye's", na=False)]

Unnamed: 0,title,author,time,description,body,section,summarized_body
84,Korea's ODA project tainted by corruption scan...,Yonhap,2017-12-26 16:02:00,Impeached President Park Geun-hye's close frie...,Impeached President Park Geun-hye's close frie...,Politics,The probe looked into allegations that the pri...
279,Professors pick 'fight for right' as words sym...,Yonhap,2017-12-17 11:24:00,A group of college professors has picked the f...,A group of college professors has picked the f...,Social affairs,"Chinese idiom ""Hasahyeonjeong"" means a fight f..."
327,Lotte in disarray over its head's possible jai...,Yonhap,2017-12-14 18:18:00,South Korean retail giant Lotte is in disarray...,South Korean retail giant Lotte is in disarray...,Social affairs,Shin Dong-bin is accused of giving 7 billion w...
562,Korean citizens win German human rights award ...,Yonhap,2017-12-06 11:13:00,"BERLIN -- The German political foundation, Fri...","BERLIN -- The German political foundation, Fri...",Social affairs,"Jang Ae-jin, a survivor of the 2014 Sewol ferr..."
737,Ex-Park aide to be grilled over fresh suspicio...,Yonhap,2017-11-29 09:48:00,Prosecutors questioned one of former President...,Prosecutors questioned one of former President...,Social affairs,"Woo Byung-woo, ex-senior presidential secretar..."
...,...,...,...,...,...,...,...
23355,Opposition party vows to review new tax scheme,KH디지털2,2015-01-20 13:53:00,The main opposition party vowed Tuesday to rev...,The main opposition party vowed Tuesday to rev...,Politics,The main opposition party vowed Tuesday to rev...
23440,Park's approval rating hits record low,KH디지털2,2015-01-16 15:23:00,President Park Geun-hye's job approval rating ...,President Park Geun-hye's job approval rating ...,Politics,35 percent of those polled approved of the way...
23505,"Ruling party, presidential office need closer ...",KH디지털2,2015-01-14 14:19:00,The chief of the ruling party acknowledged Wed...,The chief of the ruling party acknowledged Wed...,Politics,Saenuri Party Chairman Kim Moo-sung was photog...
23532,S. Korea eyes express train project linking Eu...,KH디지털2,2015-01-13 15:22:00,South Korea is seeking to operate a special ex...,South Korea is seeking to operate a special ex...,International,The envisioned express train is likely to pass...


In [32]:
### Lemmatization tool
# NOTE: despite the variable name, this is a WordNet *lemmatizer*, not a stemmer.
# The name is kept because preprocess_manual calls `stemmer.lemmatize(...)`.
stemmer = WordNetLemmatizer()
### Change similar words to the same word
UN_WORD = "The United Nations"
US_WORD = "The United States"
NK_WORD = "North Korea"
SK_WORD = "South Korea"
PARK_WORD = "Park Geun-hye"

# Surface form -> canonical term. Keys are matched as literal, case-sensitive
# text by text_cleaning (they are NOT regular expressions).
# NOTE(review): very short keys such as "SK", "NK", "US" and "UN" can also be
# company names or ordinary capitalised words in news text - confirm that
# mapping them is intended for this corpus.
similar_words = {
    # Change to "The United States"
    "U.S.": US_WORD,
    "US": US_WORD,
    "USA": US_WORD,
    "United States": US_WORD,
    "United States'": US_WORD,
    "The United States'": US_WORD,
    
    # Change to "North Korea"
    "NK": NK_WORD,
    "NK's": NK_WORD,
    "N. Korea": NK_WORD,
    "N. Korea's": NK_WORD,
    "North Korea's": NK_WORD,
    
    # Change to "South Korea"
    "SK": SK_WORD,
    "SK's": SK_WORD,
    "S. Korea": SK_WORD,
    "S. Korea's": SK_WORD,
    "South Korea's": SK_WORD,
    
    # Change to "The United Nations"
    "United Nations": UN_WORD,
    "United Nations'": UN_WORD,
    "The United Nations'": UN_WORD,
    "UN": UN_WORD,
    
    # Change to "Park Geun-hye"
    "President Park Geun-hye's": PARK_WORD,
    "President Park Geun-Hye's": PARK_WORD,
    "President Park Geun Hye's": PARK_WORD,
    "President Park Geun-hye": PARK_WORD,
    "President Park Geun-Hye": PARK_WORD,
    "President Park Geun Hye": PARK_WORD,
    "President Park's": PARK_WORD,
    "President Park": PARK_WORD,
    "Park Geun-hye's": PARK_WORD,
    "Park Geun-Hye's": PARK_WORD,
    "Park Geun Hye's": PARK_WORD,
    "Park Geun-Hye": PARK_WORD,
    "Park Geun Hye": PARK_WORD,
}

# Curly apostrophes are mapped to the ASCII apostrophe so possessive keys match.
_QUOTE_TRANSLATION = str.maketrans({"’": "'", "‘": "'"})


def _strip_diacritics(txt):
    """Decompose characters (NFKD) and drop combining marks, e.g. 'é' -> 'e'."""
    return ''.join(
        ch for ch in unicodedata.normalize('NFKD', txt)
        if unicodedata.category(ch) != 'Mn'
    )


def _to_plain_ascii(txt):
    """Strip diacritics, straighten curly apostrophes, blank out other non-ASCII runs."""
    txt = _strip_diacritics(txt)
    txt = txt.translate(_QUOTE_TRANSLATION)
    # Each run of remaining non-ASCII characters becomes a single space.
    return re.sub(r'[^\x00-\x7F]+', ' ', txt)


def _build_similar_words_matcher(mapping):
    """Return (compiled_pattern, lookup) for a single-pass term replacement.

    * Terms are escaped, so '.' in "U.S." only matches a literal dot.
    * Canonical values are added as identity entries so text that is already
      canonical is consumed whole instead of being partially re-matched.
    * Longer terms come first in the alternation so "USA" wins over "US" and
      possessive forms win over their base form.
    * Alphanumeric look-arounds stop matches inside larger words.
    * An optional leading "the " is captured so it can be absorbed when the
      canonical term already starts with "The ".
    """
    lookup = {canonical: canonical for canonical in mapping.values()}
    lookup.update(mapping)
    alternatives = "|".join(
        re.escape(term) for term in sorted(lookup, key=len, reverse=True)
    )
    pattern = re.compile(
        r"(?<![A-Za-z0-9])(?P<article>[Tt]he\s+)?(?P<term>"
        + alternatives
        + r")(?![A-Za-z0-9])"
    )
    return pattern, lookup


### Transform function
def text_cleaning(s: str):
    """Normalise text to plain ASCII and unify equivalent terms.

    Steps:
      1. Remove diacritics, convert curly apostrophes to "'", and replace any
         remaining non-ASCII characters with a space.
      2. Replace every surface form listed in ``similar_words`` with its
         canonical term, in one pass over the text.

    The replacement is done in a single pass on purpose: applying the entries
    one after another re-matches text produced by earlier replacements (for
    example "U.S." -> "The United States" -> "The The United States").

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        Cleaned text with canonical terms.
    """
    s = _to_plain_ascii(s)
    # Built per call so edits to `similar_words` take effect immediately.
    pattern, lookup = _build_similar_words_matcher(similar_words)

    def _canonical(match):
        canonical = lookup[match.group("term")]
        article = match.group("article")
        # Keep a preceding "the " unless the canonical term brings its own article.
        if article and not canonical.startswith("The "):
            return article + canonical
        return canonical

    return pattern.sub(_canonical, s)

def spacy_tokenizer(s: str):
    """Tokenize text with spaCy and return lower-cased lemmas.

    The text is first passed through ``text_cleaning`` and the *cleaned* text
    is what gets parsed (parsing the raw input would silently discard the
    cleaning step). Multi-word terms matching the patterns below are merged
    into a single token before filtering.

    Parameters
    ----------
    s : str
        Raw document text.

    Returns
    -------
    list of str
        Lower-cased lemmas of the tokens that are not stop words, punctuation
        or number-like, and whose lemma is not blank.
    """
    # Change similar terms to the same term
    cleaned = text_cleaning(s)
    doc = nlp(cleaned)

    # Group tokens: multi-word terms that should count as one token
    matcher = Matcher(nlp.vocab)
    token_groupup_pattern = [
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "nations"}],
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "states"}],
        [{"LOWER": "north"}, {"LOWER": "korea"}],
        [{"LOWER": "south"}, {"LOWER": "korea"}],
    ]
    matcher.add("TermGroup", token_groupup_pattern)

    # Collect the spans before retokenizing; the merges are registered inside
    # the context manager.
    spans = [doc[start:end] for _match_id, start, end in matcher(doc)]
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)

    # Remove all stopword, punctuation, number
    tokens = [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and not token.like_num
        and token.lemma_.strip() != ''
    ]
    return tokens

### Preprocess function for grouping similar topic
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loaded once and cached."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_manual(s: str):
    """Clean a document without spaCy and return it as text and as tokens.

    Pipeline: unify similar terms -> turn punctuation / non-word characters
    into spaces -> delete digits -> lower-case -> drop single-character tokens
    and English stopwords -> lemmatize with the module-level ``stemmer``.

    Parameters
    ----------
    s : str
        Raw document text.

    Returns
    -------
    tuple (str, list of str)
        The processed tokens joined by single spaces, and the token list.
    """
    # Change similar words to the same word.
    # NOTE(review): this previously called `transform_to_similar_sentence`, which
    # is not defined in the visible notebook; `text_cleaning` is the function
    # that performs this step - confirm no other definition was intended.
    new_str = text_cleaning(s)
    # Remove punctuation (replace with spaces so words do not get glued together)
    new_str = new_str.translate(str.maketrans(punctuation, ' ' * len(punctuation)))
    new_str = re.sub(r'\W', ' ', new_str)
    # Removing all numbers
    new_str = new_str.translate(str.maketrans('', '', digits))
    # Converting to Lowercase and splitting on any whitespace run
    words = new_str.lower().split()
    # Lemmatization and remove stopwords.
    # Single-character tokens are dropped here on the token level: this covers
    # leading, trailing and consecutive single letters, including a stray
    # leading 'b' left over from a bytes repr.
    stopwords = _english_stopwords()
    tokens = [
        stemmer.lemmatize(word)
        for word in words
        if len(word) > 1 and word not in stopwords
    ]
    new_str = ' '.join(tokens)

    return new_str, tokens

In [33]:
### Make TF-IDF matrix
def tfidf_embed(documents, dimension=None):
    """Embed documents as TF-IDF vectors, optionally reduced with truncated SVD.

    Parameters
    ----------
    documents : iterable of str
        Raw document texts; tokenization is delegated to ``spacy_tokenizer``.
    dimension : int or None
        If given, the TF-IDF matrix is projected to this many components with
        ``TruncatedSVD`` (fixed ``random_state`` for reproducibility).

    Returns
    -------
    scipy sparse matrix when ``dimension`` is None, otherwise a dense ndarray
    with ``dimension`` columns.
    """
    tfidf_vectorizer = TfidfVectorizer(
        input='content',
        tokenizer=spacy_tokenizer,
        # The vectorizer lower-cases text *before* calling the tokenizer by
        # default. The tokenizer relies on original casing (case-sensitive term
        # replacement, spaCy analysis) and lower-cases its lemmas itself.
        lowercase=False,
        # A custom tokenizer makes the default token_pattern unused; None
        # avoids the corresponding warning.
        token_pattern=None,
    )
    tfidf_vector = tfidf_vectorizer.fit_transform(documents)

    # Dimensionality Reduction
    if dimension is not None:
        svd_doc = TruncatedSVD(n_components=dimension, n_iter=5, random_state=42)
        tfidf_vector = svd_doc.fit_transform(tfidf_vector)
    return tfidf_vector

### Make GloVe matrix
glove_file = "../glove.42B.300d.txt"
def glove_word_vector(path=None):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each non-empty line is expected to be ``<word> <v1> <v2> ...``.

    Parameters
    ----------
    path : str or None
        File to read. Defaults to the module-level ``glove_file``.

    Returns
    -------
    dict
        Maps each word to a float32 numpy array of its vector components.
    """
    if path is None:
        path = glove_file
    embeddings_dict = {}
    # Read explicitly as UTF-8: relying on the platform default encoding
    # (e.g. cp1252 on Windows) fails or garbles non-ASCII tokens.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            embeddings_dict[values[0]] = np.asarray(values[1:], "float32")
    return embeddings_dict

# Average of word vectors
def sentence_embed(sentence, word_vectors, dimension):
    """Embed a sentence as the mean of its known word vectors.

    Parameters
    ----------
    sentence : str
        Text that is split on whitespace into words.
    word_vectors : mapping
        Word -> vector of length ``dimension``; only membership tests and
        item lookup are used.
    dimension : int
        Length of the word vectors.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the words found in ``word_vectors``. Words
        without a vector are ignored; if no word has a vector (including an
        empty sentence) a zero vector is returned.
    """
    sum_vector = np.zeros(dimension)
    matched = 0
    for w in sentence.split():
        if w in word_vectors:
            sum_vector += word_vectors[w]
            matched += 1
    if matched == 0:
        # Avoid dividing by zero; the zero vector stands for "no information".
        return sum_vector
    # Divide by the number of averaged words (not the character count of the sentence).
    return sum_vector / matched

# Make document vector
def document_embed(documents, embedding_technique='tfidf', dimension=None):
    """Embed documents with the selected technique.

    Parameters
    ----------
    documents : iterable
        Document texts. For ``'spacy'`` the items may also be objects that
        already expose a ``.vector`` attribute (e.g. parsed spaCy docs).
    embedding_technique : {'tfidf', 'glove', 'spacy'}
        * ``'tfidf'`` - delegates to ``tfidf_embed``.
        * ``'glove'`` - averages GloVe word vectors per document
          (``dimension`` defaults to 300).
        * ``'spacy'`` - uses each document's ``.vector``; plain strings are
          parsed with the module-level ``nlp`` first.
    dimension : int or None
        Target dimensionality, interpreted by the chosen technique.

    Returns
    -------
    The TF-IDF matrix for ``'tfidf'``; a list with one vector per document
    otherwise.

    Raises
    ------
    ValueError
        If ``embedding_technique`` is not one of the supported values.
    """
    if embedding_technique == 'tfidf':
        doc_vector = tfidf_embed(documents, dimension)
    elif embedding_technique == 'glove':
        word_vector = glove_word_vector()
        if dimension is None:
            dimension = 300
        doc_vector = [sentence_embed(s, word_vector, dimension).tolist() for s in documents]
    elif embedding_technique == 'spacy':
        # Strings have no `.vector`; parse them so both raw text and
        # already-parsed documents are accepted.
        doc_vector = [
            (nlp(doc) if isinstance(doc, str) else doc).vector
            for doc in documents
        ]
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(
            "Unknown embedding_technique %r; expected 'tfidf', 'glove' or 'spacy'"
            % (embedding_technique,)
        )

    return doc_vector

In [45]:
# Alternation of the spelling variants used to refer to Park Geun-hye.
park_str = "President Park Geun-hye's|President Park Geun-Hye's|President Park Geun Hye's|President Park Geun-hye|President Park Geun-Hye|President Park Geun Hye|President Park's|President Park|Park Geun-hye's|Park Geun-Hye's|Park Geun Hye's|Park Geun-Hye|Park Geun-hye|Park Geun Hye"

# Case-insensitive regex search over the article bodies; rows with a missing
# body are treated as non-matching (na=False).
mentions_park = df['body'].str.contains(park_str, regex=True, flags=re.IGNORECASE, na=False)
df_park = df[mentions_park]
df_park

Unnamed: 0,title,author,time,description,body,section,summarized_body,agg_title_body
1,[Weekender] Korea’s dynamic 2017,Choi He-suk,2018-01-01 13:22:00,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs,The corruption scandal that broke out in 2016 ...,[Weekender] Korea’s dynamic 2017. From North K...
13,Headline makers of 2017,Korea Herald,2017-12-29 16:42:00,Moon Jae-inPresident Moon Jae-in (Yonhap)Moon ...,Moon Jae-inPresident Moon Jae-in (Yonhap)Moon ...,Social affairs,South Korean President Moon Jae-in rose to bec...,Headline makers of 2017. Moon Jae-inPresident ...
16,"Moon pardons 6,444 people, excluding business ...",Ock Hyun-ju,2017-12-29 16:01:00,"President Moon Jae-in granted pardons to 6,444...","President Moon Jae-in granted pardons to 6,444...",Social affairs,"President Moon Jae-in granted pardons to 6,444...","Moon pardons 6,444 people, excluding business ..."
19,Businessmen call for probe into shutdown of fa...,Yonhap,2017-12-29 15:04:00,A private task force on Friday pressed the gov...,A private task force on Friday pressed the gov...,North Korea,South Korea pulled the plug on the factory par...,Businessmen call for probe into shutdown of fa...
21,Special pardons aimed at helping ordinary peop...,Yonhap,2017-12-29 11:39:00,"The latest pardon extended to more than 6,000 ...","The latest pardon extended to more than 6,000 ...",Politics,"The latest pardon extended to more than 6,000 ...",Special pardons aimed at helping ordinary peop...
...,...,...,...,...,...,...,...,...
23760,‘Responsibility to protect does not apply to N...,Korea Herald,2015-01-01 21:21:00,This is the second installment in a special Ne...,This is the second installment in a special Ne...,Defense,Harvard University professor Stephen M. Walt s...,‘Responsibility to protect does not apply to N...
23762,Industry resists policy to bar foreign teacher...,Yoon Min-sik,2015-01-01 20:47:00,The Education Ministry said last month it was ...,The Education Ministry said last month it was ...,Education,The Education Ministry said last month it was ...,Industry resists policy to bar foreign teacher...
23764,N. Korean leader's speech arouses cautious opt...,KH디지털2,2015-01-01 13:36:00,North Korean leader Kim Jong-un's New Year's D...,North Korean leader Kim Jong-un's New Year's D...,North Korea,North Korean leader Kim Jong-un's New Year's D...,N. Korean leader's speech arouses cautious opt...
23765,N. Korean leader open to inter-Korean summit t...,KH디지털2,2015-01-01 10:05:00,North Korean leader Kim Jong-un said Thursday ...,North Korean leader Kim Jong-un said Thursday ...,North Korea,North Korean leader Kim Jong-un said he is wil...,N. Korean leader open to inter-Korean summit t...


In [46]:
# Persist the filtered articles. index=True also writes the row index as the
# first column, so the original row labels of `df` are kept in the file.
df_park.to_csv("data/df_park.csv", index=True)