### Libs & Data

In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
import random
import unicodedata
from string import punctuation
from string import digits
from nltk.stem import WordNetLemmatizer
import joblib

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score

import matplotlib.pyplot as plt

import torch
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

import spacy
from spacy.matcher import Matcher

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kwsst\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kwsst\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [42]:
# Read the article dataset from disk, then show its (rows, columns) shape.
data_path = "data/data_sum.csv"
df = pd.read_csv(data_path)
df.shape

(23769, 7)

In [43]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,title,author,time,description,body,section,summarized_body
0,A snapshot of multiculturalism in South Korea,Lee Sun-young,2018-01-01 17:07:00,With birthrates persistently low and the senio...,With birthrates persistently low and the senio...,Social affairs,"As of 2016, more than 2 million foreign nation..."
1,[Weekender] Korea’s dynamic 2017,Choi He-suk,2018-01-01 13:22:00,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs,The corruption scandal that broke out in 2016 ...
2,People's Party members support Ahn's push for ...,Yonhap,2017-12-31 16:18:00,The leader of the center-left People's Party g...,The leader of the center-left People's Party g...,Politics,"Ahn Cheol-soo, leader of the center-left Peopl..."
3,[Newsmaker] Panamanian vessel probed over susp...,Yonhap,2017-12-31 14:55:00,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea,"The 5,100-ton KOTI is being held in the wester..."
4,Hong Kong ship crew questioned in S. Korea for...,AFP,2017-12-30 15:44:00,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea,The crew of a Hong Kong-registered ship have b...


In [44]:
# spaCy pipeline used by the tokenizer further down (large English model).
nlp = spacy.load('en_core_web_lg')

# Combined text column: the title followed by '. ', repeated `title_weight`
# times, then the article body.
title_weight = 1
weighted_title = title_weight * (df['title'] + '. ')
df['agg_title_body'] = weighted_title + df['body']

In [45]:
# Flag rows whose combined title/body contains any of the listed
# "President Park" / "Park Geun-hye" spellings (case-sensitive: no flags given).
park_name_variants = "President Park Geun-hye's|President Park Geun-Hye's|President Park Geun Hye's|President Park Geun-hye|President Park Geun-Hye|President Park Geun Hye|President Park's|President Park|Park Geun-hye's|Park Geun-Hye's|Park Geun Hye's|Park Geun-Hye|Park Geun-hye|Park Geun Hye"
df['agg_title_body'].str.contains(park_name_variants)

0        False
1         True
2        False
3        False
4        False
         ...  
23764     True
23765     True
23766    False
23767    False
23768     True
Name: agg_title_body, Length: 23769, dtype: object

In [50]:
park_str = "President Park Geun-hye's|President Park Geun-Hye's|President Park Geun Hye's|President Park Geun-hye|President Park Geun-Hye|President Park Geun Hye|President Park's|President Park|Park Geun-hye's|Park Geun-Hye's|Park Geun Hye's|Park Geun-Hye|Park Geun-hye|Park Geun Hye"

# `na=False` makes rows with missing text count as "no match", so each mask is a
# clean boolean Series instead of an object Series carrying NaN.
# The North Korea filter searches for the whole word stem 'missile'; the shorter
# 'issile' fragment would, under IGNORECASE, also select "fissile".
mentions_missile = df['agg_title_body'].str.contains('missile', flags=re.IGNORECASE, regex=True, na=False)
mentions_park = df['agg_title_body'].str.contains(park_str, flags=re.IGNORECASE, regex=True, na=False)

df_nk = df[(df['section']=='North Korea') & mentions_missile]
df_politics = df[(df['section']=='Politics') & mentions_park]

# Persist both subsets, keeping the original row index as the first CSV column.
df_nk.to_csv("data/df_nk.csv", index=True)
df_politics.to_csv("data/df_politics.csv", index=True)

df_nk

Unnamed: 0,title,author,time,description,body,section,summarized_body,agg_title_body
3,[Newsmaker] Panamanian vessel probed over susp...,Yonhap,2017-12-31 14:55:00,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea,"The 5,100-ton KOTI is being held in the wester...",[Newsmaker] Panamanian vessel probed over susp...
7,Secret Sauce? Kim Jong-un applies science to k...,AP,2017-12-30 12:10:00,Kim Jong Un wants to turn the art of kimchi-ma...,Kim Jong Un wants to turn the art of kimchi-ma...,North Korea,"Ryugyong Kimchi Factory produces 4,200 tons of...",Secret Sauce? Kim Jong-un applies science to k...
8,N. Korea says there will be no change to its n...,Yonhap,2017-12-30 10:31:00,North Korea will continue to enhance its nucle...,North Korea will continue to enhance its nucle...,North Korea,North Korea will continue to enhance its nucle...,N. Korea says there will be no change to its n...
14,"Top diplomats of S. Korea, US reassure peacefu...",Yonhap,2017-12-29 16:31:00,The top diplomats of South Korea and the Unite...,The top diplomats of South Korea and the Unite...,North Korea,Top diplomats of South Korea and the U.S. held...,"Top diplomats of S. Korea, US reassure peacefu..."
19,Businessmen call for probe into shutdown of fa...,Yonhap,2017-12-29 15:04:00,A private task force on Friday pressed the gov...,A private task force on Friday pressed the gov...,North Korea,South Korea pulled the plug on the factory par...,Businessmen call for probe into shutdown of fa...
...,...,...,...,...,...,...,...,...
23694,"N. Korea likely to conduct series of nuclear, ...",KH디지털2,2015-01-06 10:22:00,The United States and South Korea should brace...,The United States and South Korea should brace...,North Korea,"Victor Cha, chief Korea analyst at the Center ...","N. Korea likely to conduct series of nuclear, ..."
23727,Tension between U.S. and N. Korea escalates,Korea Herald,2015-01-04 22:03:00,Tension between the United States and North Ko...,Tension between the United States and North Ko...,North Korea,U.S. President Barack Obama signed an executiv...,Tension between U.S. and N. Korea escalates. T...
23739,U.S. slaps new sanctions on N. Korea in respon...,김영원,2015-01-03 10:14:00,The United States announced retaliatory sancti...,The United States announced retaliatory sancti...,North Korea,Three North Korean entities and 10 officials a...,U.S. slaps new sanctions on N. Korea in respon...
23757,U.S. places sanctions on N. Korean firm,Kim Yon-se,2015-01-01 21:22:00,The United States has imposed sanctions on a N...,The United States has imposed sanctions on a N...,North Korea,Ryongaksan General Trading Corp. was on the li...,U.S. places sanctions on N. Korean firm. The U...


In [54]:
# North Korea section rows whose title/body matches 'Missile' (case-insensitive),
# built from two named masks and then displayed.
is_nk_section = df['section'] == 'North Korea'
has_missile = df['agg_title_body'].str.contains('Missile', flags=re.IGNORECASE, regex=True)
dfn = df[is_nk_section & has_missile]
dfn

Unnamed: 0,title,author,time,description,body,section,summarized_body,agg_title_body
3,[Newsmaker] Panamanian vessel probed over susp...,Yonhap,2017-12-31 14:55:00,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea,"The 5,100-ton KOTI is being held in the wester...",[Newsmaker] Panamanian vessel probed over susp...
7,Secret Sauce? Kim Jong-un applies science to k...,AP,2017-12-30 12:10:00,Kim Jong Un wants to turn the art of kimchi-ma...,Kim Jong Un wants to turn the art of kimchi-ma...,North Korea,"Ryugyong Kimchi Factory produces 4,200 tons of...",Secret Sauce? Kim Jong-un applies science to k...
8,N. Korea says there will be no change to its n...,Yonhap,2017-12-30 10:31:00,North Korea will continue to enhance its nucle...,North Korea will continue to enhance its nucle...,North Korea,North Korea will continue to enhance its nucle...,N. Korea says there will be no change to its n...
14,"Top diplomats of S. Korea, US reassure peacefu...",Yonhap,2017-12-29 16:31:00,The top diplomats of South Korea and the Unite...,The top diplomats of South Korea and the Unite...,North Korea,Top diplomats of South Korea and the U.S. held...,"Top diplomats of S. Korea, US reassure peacefu..."
19,Businessmen call for probe into shutdown of fa...,Yonhap,2017-12-29 15:04:00,A private task force on Friday pressed the gov...,A private task force on Friday pressed the gov...,North Korea,South Korea pulled the plug on the factory par...,Businessmen call for probe into shutdown of fa...
...,...,...,...,...,...,...,...,...
23694,"N. Korea likely to conduct series of nuclear, ...",KH디지털2,2015-01-06 10:22:00,The United States and South Korea should brace...,The United States and South Korea should brace...,North Korea,"Victor Cha, chief Korea analyst at the Center ...","N. Korea likely to conduct series of nuclear, ..."
23727,Tension between U.S. and N. Korea escalates,Korea Herald,2015-01-04 22:03:00,Tension between the United States and North Ko...,Tension between the United States and North Ko...,North Korea,U.S. President Barack Obama signed an executiv...,Tension between U.S. and N. Korea escalates. T...
23739,U.S. slaps new sanctions on N. Korea in respon...,김영원,2015-01-03 10:14:00,The United States announced retaliatory sancti...,The United States announced retaliatory sancti...,North Korea,Three North Korean entities and 10 officials a...,U.S. slaps new sanctions on N. Korea in respon...
23757,U.S. places sanctions on N. Korean firm,Kim Yon-se,2015-01-01 21:22:00,The United States has imposed sanctions on a N...,The United States has imposed sanctions on a N...,North Korea,Ryongaksan General Trading Corp. was on the li...,U.S. places sanctions on N. Korean firm. The U...


In [51]:
# Display the Politics subset selected by the Park-name filter.
df_politics

Unnamed: 0,title,author,time,description,body,section,summarized_body,agg_title_body
21,Special pardons aimed at helping ordinary peop...,Yonhap,2017-12-29 11:39:00,"The latest pardon extended to more than 6,000 ...","The latest pardon extended to more than 6,000 ...",Politics,"The latest pardon extended to more than 6,000 ...",Special pardons aimed at helping ordinary peop...
34,[News Focus] Is multiparty system viable in Ko...,Jo He-rim,2017-12-28 16:29:00,A four-day vote of confidence in the People’s ...,A four-day vote of confidence in the People’s ...,Politics,A four-day vote of confidence in the People’s ...,[News Focus] Is multiparty system viable in Ko...
37,Political parties call on government to resolv...,Jo He-rim,2017-12-28 16:13:00,Political parties on Thursday expressed anger ...,Political parties on Thursday expressed anger ...,Politics,Political parties on Thursday expressed anger ...,Political parties call on government to resolv...
84,Korea's ODA project tainted by corruption scan...,Yonhap,2017-12-26 16:02:00,Impeached President Park Geun-hye's close frie...,Impeached President Park Geun-hye's close frie...,Politics,The probe looked into allegations that the pri...,Korea's ODA project tainted by corruption scan...
174,Ex-President Park summoned for questioning Friday,Ock Hyun-ju,2017-12-20 17:51:00,The prosecution summoned former President Park...,The prosecution summoned former President Park...,Politics,The prosecution summoned former President Park...,Ex-President Park summoned for questioning Fri...
...,...,...,...,...,...,...,...,...
23741,Park trumpets unification drive,Korea Herald,2015-01-02 21:36:00,President Park Geun-hye requested political le...,President Park Geun-hye requested political le...,Politics,President Park Geun-hye requested political le...,Park trumpets unification drive. President Par...
23745,Park vows substantial preparations for unifica...,박한나,2015-01-02 20:09:00,South Korean President Park Geun-hye pledged F...,South Korean President Park Geun-hye pledged F...,Politics,South Korean President Park Geun-hye pledged F...,Park vows substantial preparations for unifica...
23751,Opposition leader calls on Park to focus on in...,KH디지털2,2015-01-02 11:17:00,The leader of the main opposition party called...,The leader of the main opposition party called...,Politics,The leader of the main opposition party called...,Opposition leader calls on Park to focus on in...
23758,Park vows efforts to end Korean division,Korea Herald,2015-01-01 21:22:00,President Park Geun-hye on Thursday called for...,President Park Geun-hye on Thursday called for...,Politics,President Park Geun-hye called for an end to t...,Park vows efforts to end Korean division. Pres...


In [24]:
### Lemmatization tool
# NOTE: despite the variable name, this is a WordNet lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()
### Canonical spellings that every alias below is collapsed to
UN_WORD = "The United Nations"
US_WORD = "The United States"
NK_WORD = "North Korea"
SK_WORD = "South Korea"
PARK_WORD = "Park Geun-hye"

# (canonical form, aliases) pairs. The order of the groups and of the aliases
# inside each group determines the insertion order of `similar_words`.
_alias_groups = (
    (US_WORD, (
        "U.S.",
        "US",
        "USA",
        "United States",
        "United States'",
        "The United States'",
    )),
    (NK_WORD, (
        "NK",
        "NK's",
        "N. Korea",
        "N. Korea's",
        "North Korea's",
    )),
    (SK_WORD, (
        "SK",
        "SK's",
        "S. Korea",
        "S. Korea's",
        "South Korea's",
    )),
    (UN_WORD, (
        "United Nations",
        "United Nations'",
        "The United Nations'",
        "UN",
    )),
    (PARK_WORD, (
        "President Park Geun-hye's",
        "President Park Geun-Hye's",
        "President Park Geun Hye's",
        "President Park Geun-hye",
        "President Park Geun-Hye",
        "President Park Geun Hye",
        "President Park's",
        "President Park",
        "Park Geun-hye's",
        "Park Geun-Hye's",
        "Park Geun Hye's",
        "Park Geun-Hye",
        "Park Geun Hye",
    )),
)

# Flat alias -> canonical lookup table.
similar_words = {
    alias: canonical
    for canonical, aliases in _alias_groups
    for alias in aliases
}

### Transform function
def text_cleaning(s: str, replacements=None):
    """Normalize characters and collapse entity aliases to one canonical form.

    Steps:
      1. NFKD-normalize and drop combining marks (e.g. "é" -> "e"), map curly
         apostrophes to "'", and replace every remaining run of non-ASCII
         characters with a single space.
      2. Replace each alias found in the text with its canonical form.

    The alias step runs as ONE regex pass instead of one `re.sub` per alias:
      * aliases are escaped, so "." in "U.S." is a literal dot;
      * longer aliases are tried first, so "USA" wins over "US";
      * a match must not touch a word character on either side, so "UN" does
        not fire inside "UNESCO";
      * text produced by a replacement is never rescanned, so a canonical form
        is not expanded a second time;
      * when the canonical form starts with "The " and the alias does not, a
        preceding "the"/"The" is absorbed, so "the U.S." becomes
        "The United States" rather than "the The United States".

    Args:
        s: Raw text.
        replacements: Optional ``{alias: canonical}`` mapping. Defaults to the
            module-level ``similar_words`` table.

    Returns:
        The cleaned string.
    """
    if replacements is None:
        replacements = similar_words

    # --- 1) character normalization -------------------------------------
    s = ''.join(
        ch for ch in unicodedata.normalize('NFKD', s)
        if unicodedata.category(ch) != 'Mn'
    )
    s = s.replace("\u2019", "'").replace("\u2018", "'")
    s = re.sub(r'[^\x00-\x7F]+', ' ', s)

    if not replacements:
        return s

    # --- 2) alias canonicalization --------------------------------------
    # Stable sort: aliases of equal length keep their table order.
    aliases = sorted(replacements, key=len, reverse=True)
    alternatives = []
    for alias in aliases:
        pattern = re.escape(alias)
        if replacements[alias].startswith("The ") and not alias.startswith("The "):
            pattern = r"(?:[Tt]he\s+)?" + pattern
        # Exactly one capturing group per alias, so `lastindex` identifies it.
        alternatives.append("(" + pattern + ")")
    alias_re = re.compile(r"(?<!\w)(?:" + "|".join(alternatives) + r")(?!\w)")

    return alias_re.sub(lambda m: replacements[aliases[m.lastindex - 1]], s)

def spacy_tokenizer(s: str):
    """Tokenize a document with spaCy and return its filtered, lower-cased lemmas.

    The text is first passed through `text_cleaning`, and the CLEANED string is
    what spaCy parses (previously the raw string was parsed, so the cleaning had
    no effect). Multi-word terms listed below are then merged into single tokens
    before filtering.

    Args:
        s: Raw document text.

    Returns:
        A list of lower-cased lemmas for every token that is not a stop word,
        not punctuation, not number-like, and whose lemma is not blank.
    """
    # Change similar terms to the same term, then parse the cleaned text.
    cleaned = text_cleaning(s)
    doc = nlp(cleaned)

    # Token sequences that should be treated as one token.
    matcher = Matcher(nlp.vocab)
    term_patterns = [
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "nations"}],
        [{"LOWER": "the"}, {"LOWER": "united"}, {"LOWER": "states"}],
        [{"LOWER": "north"}, {"LOWER": "korea"}],
        [{"LOWER": "south"}, {"LOWER": "korea"}],
    ]
    matcher.add("TermGroup", term_patterns)

    # Collect the matched (start, end) token ranges and merge each one.
    term_ranges = [(start, end) for _match_id, start, end in matcher(doc)]
    with doc.retokenize() as retokenizer:
        for start, end in reversed(term_ranges):
            retokenizer.merge(doc[start:end])

    # Remove all stopwords, punctuation, numbers and blank lemmas.
    return [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and not token.like_num
        and token.lemma_.strip() != ''
    ]

### Preprocess function for grouping similar topic
def preprocess_manual(s: str):
    """Regex/NLTK preprocessing that does not depend on spaCy.

    Pipeline: alias canonicalization via `text_cleaning` -> punctuation and
    other non-word characters to spaces -> drop standalone single letters ->
    collapse whitespace -> strip digits -> lower-case -> drop NLTK English
    stopwords -> WordNet-lemmatize.

    Args:
        s: Raw document text.

    Returns:
        A ``(text, tokens)`` tuple: ``tokens`` is the list of lemmatized words
        and ``text`` is those tokens joined by single spaces.
    """
    # Change similar words to the same word. `text_cleaning` is the helper in
    # this notebook that performs that step.
    new_str = text_cleaning(s)
    # Remove punctuation (the set is built once, not once per character).
    punct = set(punctuation)
    new_str = ''.join(" " if ch in punct else ch for ch in new_str)
    new_str = re.sub(r'\W', ' ', new_str)
    # Remove every standalone single letter, including one at the very start or
    # end of the text and runs such as "a b c".
    new_str = re.sub(r'(?<!\S)[a-zA-Z](?!\S)', ' ', new_str)
    # Substituting multiple spaces with single space
    new_str = re.sub(r'\s+', ' ', new_str)
    # Removing all numbers
    new_str = new_str.translate(str.maketrans('', '', digits))
    # Converting to Lowercase
    new_str = new_str.lower()
    # Lemmatization and remove stopwords (set membership instead of a list scan)
    stop_set = set(nltk.corpus.stopwords.words('english'))
    tokens = [stemmer.lemmatize(word) for word in new_str.split() if word not in stop_set]
    new_str = ' '.join(tokens)

    return new_str, tokens

In [25]:
### Make TF-IDF matrix
def tfidf_embed(documents, dimension=None):
    """Build a TF-IDF document matrix, optionally reduced with truncated SVD.

    Args:
        documents: Iterable of raw document strings.
        dimension: If given, number of SVD components to keep
            (NOTE(review): scikit-learn requires this not to exceed the
            vocabulary size). If ``None``, no reduction is applied.

    Returns:
        The matrix returned by ``TfidfVectorizer.fit_transform`` when
        ``dimension`` is ``None``; otherwise the array returned by
        ``TruncatedSVD.fit_transform`` with ``dimension`` columns.
    """
    tfidf_vectorizer = TfidfVectorizer(
        input='content',
        tokenizer=spacy_tokenizer,
        # The vectorizer lower-cases text BEFORE calling the tokenizer by
        # default, which would hide case-sensitive aliases ("US", "U.S.", ...)
        # from the cleaning step. The tokenizer lower-cases its own output.
        lowercase=False,
        # A custom tokenizer is supplied, so the default token_pattern is
        # unused; None silences the corresponding scikit-learn warning.
        token_pattern=None,
    )
    tfidf_vector = tfidf_vectorizer.fit_transform(documents)

    # Dimensionality Reduction
    if dimension is not None:
        svd_doc = TruncatedSVD(n_components=dimension, n_iter=5, random_state=42)
        tfidf_vector = svd_doc.fit_transform(tfidf_vector)
    return tfidf_vector

### Make GloVe matrix
glove_file = "../glove.42B.300d.txt"
def glove_word_vector(path=None):
    """Load whitespace-separated word vectors into a ``{word: vector}`` dict.

    Each non-blank line is expected to be ``word v1 v2 ... vN``.

    Args:
        path: File to read. Defaults to the module-level ``glove_file``.

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array.
    """
    if path is None:
        path = glove_file
    embeddings_dict = {}
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # can fail on non-ASCII tokens in the vocabulary.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # Blank or malformed line: nothing to store.
                continue
            embeddings_dict[values[0]] = np.asarray(values[1:], "float32")
    return embeddings_dict

# Average of the word vectors in a sentence
def sentence_embed(sentence, word_vectors, dimension):
    """Return the mean vector of the in-vocabulary words of ``sentence``.

    Args:
        sentence: Whitespace-separated text.
        word_vectors: Mapping from word to a vector of length ``dimension``.
        dimension: Length of each word vector.

    Returns:
        A numpy array of length ``dimension``: the sum of the vectors of the
        words found in ``word_vectors`` divided by how many were found. If no
        word is found (including an empty sentence) the zero vector is
        returned, which also avoids a division by zero.
    """
    sum_vector = np.zeros(dimension)
    n_found = 0
    for w in sentence.split():
        if w in word_vectors:
            sum_vector += word_vectors[w]
            n_found += 1
    if n_found == 0:
        return sum_vector
    # Divide by the number of averaged words, not by the character count.
    return sum_vector / n_found

# Make document vector
def document_embed(documents, embedding_technique='tfidf', dimension=None):
    """Embed a collection of documents with the chosen technique.

    Args:
        documents: Iterable of documents. Strings for every technique; for
            ``'spacy'`` an item may also be an object that already exposes a
            ``.vector`` attribute, which is then used as-is.
        embedding_technique: ``'tfidf'``, ``'glove'`` or ``'spacy'``.
        dimension: Passed to `tfidf_embed` for ``'tfidf'``; word-vector length
            for ``'glove'`` (300 when ``None``); unused for ``'spacy'``.

    Returns:
        ``'tfidf'``: whatever `tfidf_embed` returns.
        ``'glove'``: a list with one list of floats per document.
        ``'spacy'``: a list with one vector per document.

    Raises:
        ValueError: If ``embedding_technique`` is not one of the names above.
    """
    if embedding_technique == 'tfidf':
        return tfidf_embed(documents, dimension)

    if embedding_technique == 'glove':
        word_vector = glove_word_vector()
        if dimension is None:
            dimension = 300
        return [sentence_embed(s, word_vector, dimension).tolist() for s in documents]

    if embedding_technique == 'spacy':
        # Plain strings have no `.vector`; run them through the spaCy pipeline.
        return [
            doc.vector if hasattr(doc, 'vector') else nlp(doc).vector
            for doc in documents
        ]

    raise ValueError(
        "Unknown embedding_technique %r; expected 'tfidf', 'glove' or 'spacy'"
        % (embedding_technique,)
    )

In [26]:
# park_str = "President Park Geun-hye's|President Park Geun-Hye's|President Park Geun Hye's|President Park Geun-hye|President Park Geun-Hye|President Park Geun Hye|President Park's|President Park|Park Geun-hye's|Park Geun-Hye's|Park Geun Hye's|Park Geun-Hye|Park Geun-hye|Park Geun Hye"
# nk_str = "North Korea|North Korea's|NK|NK's|N. Korea"

# df_park = df[df['body'].str.contains(park_str, na=False, flags=re.IGNORECASE, regex=True)]
# df_park

In [52]:
# Embed the North Korea subset (TF-IDF reduced to 300 dimensions) and persist it.
nk_texts = df_nk['agg_title_body']
tfidf_nk = document_embed(nk_texts, embedding_technique='tfidf', dimension=300)
# NOTE(review): the file is written by joblib.dump, so despite the ".csv"
# suffix it has to be read back with joblib.load.
joblib.dump(tfidf_nk, 'tfidf_nk.csv')
print("TDIDF_NK is saved")

TDIDF_NK is saved


In [53]:
# Embed the Politics subset (TF-IDF reduced to 300 dimensions) and persist it.
politics_texts = df_politics['agg_title_body']
tfidf_politics = document_embed(politics_texts, embedding_technique='tfidf', dimension=300)
# NOTE(review): the file is written by joblib.dump, so despite the ".csv"
# suffix it has to be read back with joblib.load.
joblib.dump(tfidf_politics, 'tfidf_politics.csv')
print("TDIDF_POLITICS is saved")

TDIDF_POLITICS is saved


In [None]:
# tfidf_nk_politics = document_embed(df_nk_politics, embedding_technique='tfidf', dimension=300)
# joblib.dump(tfidf_nk_politics, 'tfidf_nk.csv')
# print("TDIDF_NK_POLITICS is saved")