### Libs & Data

In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
import random
import unicodedata
from string import punctuation
from string import digits
from nltk.stem import WordNetLemmatizer
import joblib

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score

import matplotlib.pyplot as plt

import torch
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

import spacy
from spacy.matcher import Matcher

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kwsst\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kwsst\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv("data/data_sum.csv")
df.shape

(23769, 7)

In [3]:
df.head()

Unnamed: 0,title,author,time,description,body,section,summarized_body
0,A snapshot of multiculturalism in South Korea,Lee Sun-young,2018-01-01 17:07:00,With birthrates persistently low and the senio...,With birthrates persistently low and the senio...,Social affairs,"As of 2016, more than 2 million foreign nation..."
1,[Weekender] Korea’s dynamic 2017,Choi He-suk,2018-01-01 13:22:00,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs,The corruption scandal that broke out in 2016 ...
2,People's Party members support Ahn's push for ...,Yonhap,2017-12-31 16:18:00,The leader of the center-left People's Party g...,The leader of the center-left People's Party g...,Politics,"Ahn Cheol-soo, leader of the center-left Peopl..."
3,[Newsmaker] Panamanian vessel probed over susp...,Yonhap,2017-12-31 14:55:00,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea,"The 5,100-ton KOTI is being held in the wester..."
4,Hong Kong ship crew questioned in S. Korea for...,AFP,2017-12-30 15:44:00,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea,The crew of a Hong Kong-registered ship have b...


In [4]:
df2015 = df[('2015' < df['time']) & (df['time'] < '2016')]
df2016 = df[('2016' < df['time']) & (df['time'] < '2017')]
df2017 = df['2017' < df['time']]

In [5]:
df2015.shape, df2016.shape, df2017.shape

((7156, 7), (7485, 7), (9128, 7))

In [6]:
### Clustering 
def document_clustering(doc_vectors, clustering_method='kmeans', evaluate=False):
    if clustering_method=='kmeans':
        # Hyperparameters
        k_event = 10000
        k_issue = 6000
        
        # Clustering event
        kmeans_event = KMeans(n_clusters=k_event, random_state=69).fit(doc_vectors)
        # Represent each event by average sum of related news
        event_vectors = np.zeros((k_event, doc_vectors.shape[1]))
        for i in range(k_event):
            event_vectors[i] = sum(doc_vectors[kmeans_event.labels_ == i])
        
        # Clustering issue
        kmeans_issue = KMeans(n_clusters=k_issue, random_state=69).fit(event_vectors)
        # Represent each issue by average sum of related news
        issue_vectors = np.zeros((k_issue, doc_vectors.shape[1]))
        for i in range(k_issue):
            issue_vectors[i] = sum(event_vectors[kmeans_issue.labels_ == i])

        issue_labels = np.array([ kmeans_issue.labels_[kmeans_event.labels_[i]] for i in range(doc_vectors.shape[0]) ])
        
        return k_issue, k_event, issue_labels, kmeans_event.labels_
    
    elif clustering_method=='DBSCAN':
        
        # Hyperparameters
        doc_eps = 0.19
        doc_neighbors = 1
        event_eps = 0.40
        event_neighbors = 1
        
        '''
            Find best doc_eps and event_eps
        '''
        if evaluate:
            # Find best eps to group same document
            doc_eps_list = [ 0.15 + 0.001*i for i in range(1,101) ]
            doc_score = []
            doc_event = []
            doc_best_score = 0
            doc_best_eps = 0.0001
            for doc_eps in doc_eps_list:
                # Clustering event
                db_event = DBSCAN(eps=doc_eps, min_samples=doc_neighbors).fit(doc_vectors)
                # Number of clusters in labels, ignoring noise if present.
                n_events_ = len(set(db_event.labels_)) - (1 if -1 in db_event.labels_ else 0)
                if len(set(db_event.labels_)) >= 2 and len(set(db_event.labels_)) <= len(doc_vectors)-1:
                    score_ = silhouette_score(doc_vectors, db_event.labels_)
                else:
                    score_ = -1
                doc_event.append(n_events_)
                doc_score.append(score_)
                if score_ > doc_best_score:
                    doc_best_score = score_
                    doc_best_eps = doc_eps
            print("Best Silhouete score is {} at eps: {} and number of events: {}".format(doc_best_score, doc_eps, n_events_))
            fig = plt.figure()
            plt.plot(doc_eps_list, doc_score)
            fig.suptitle('Doc eps and Silhouette score', fontsize=20)
            plt.xlabel('eps', fontsize=18)
            plt.ylabel('Silhouette score', fontsize=16)
            plt.show()
            
            fig = plt.figure()
            plt.plot(doc_eps_list, doc_event)
            fig.suptitle('Doc eps and number of events', fontsize=20)
            plt.xlabel('eps', fontsize=18)
            plt.ylabel('number of events', fontsize=16)
            plt.show()
            
            # Set doc_eps to the best value
            doc_eps = doc_best_eps
            # Find best eps to group same event
            # Clustering event
            db_event = DBSCAN(eps=doc_eps, min_samples=doc_neighbors).fit(doc_vectors)
            n_events_ = len(set(db_event.labels_)) - (1 if -1 in db_event.labels_ else 0)
            event_labels = np.array(list(map(lambda x: n_events_ if x==-1 else x, db_event.labels_)))
            event_vectors = np.zeros((n_events_, doc_vectors.shape[1]))
            for i in range(n_events_+1):
                if np.sum(event_labels == i) != 0:
                    event_vectors[i] = np.sum(doc_vectors[event_labels == i], axis=0)/np.sum(event_labels == i)
            
            
#             # Clustering issue
#             event_eps_list = [ 0.2 + 0.001*i for i in range(1,401) ]
#             event_score = []
#             event_issue = []
#             event_best_score = 0
#             event_best_eps = 0.001
#             for event_eps in event_eps_list:
#                 db_issue = DBSCAN(eps=event_eps, min_samples=event_neighbors).fit(event_vectors)
#                 # Number of clusters in labels, ignoring noise if present.
#                 n_issues_ = len(set(db_issue.labels_)) - (1 if -1 in db_issue.labels_ else 0)
#                 if len(set(db_issue.labels_)) >= 2 and len(set(db_issue.labels_)) <= len(event_vectors)-1:
#                     score_ = silhouette_score(event_vectors, db_issue.labels_)
#                 else:
#                     score_ = -1
#                 event_issue.append(n_issues_)
#                 event_score.append(score_)
#                 if score_ > event_best_score:
#                     event_best_score = score_
#                     event_best_eps = event_eps
#             print("Best Silhouete score is {} at eps: {} and number of issues: {}".format(event_best_score, event_eps, n_issues_))
#             fig = plt.figure()
#             plt.plot(event_eps_list, event_score)
#             fig.suptitle('Event eps and Silhouette score', fontsize=20)
#             plt.xlabel('eps', fontsize=18)
#             plt.ylabel('Silhouette score', fontsize=16)
#             plt.show()
            
#             fig = plt.figure()
#             plt.plot(event_eps_list, event_issue)
#             fig.suptitle('Event eps and number of issues', fontsize=20)
#             plt.xlabel('eps', fontsize=18)
#             plt.ylabel('number of issues', fontsize=16)
#             plt.show()
            
            # Set event_eps to best value
            event_eps = 0.5
            # Clustering issue
            db_issue = DBSCAN(eps=event_eps, min_samples=event_neighbors).fit(event_vectors)
            # Number of clusters in labels, ignoring noise if present.
            n_issues_ = len(set(db_issue.labels_)) - (1 if -1 in db_issue.labels_ else 0)
            n_noise_ = list(db_issue.labels_).count(-1)
            print(n_issues_, n_noise_)
            # Represent each issue by average sum of related news
            issue_labels = np.array(list(map(lambda x: n_issues_ if x==-1 else x, db_issue.labels_)))
            issue_vectors = np.zeros((n_issues_, doc_vectors.shape[1]))
            for i in range(n_issues_+1):
                if np.sum(issue_labels == i) != 0:
                    issue_vectors[i] = np.sum(event_vectors[issue_labels == i], axis=0)/np.sum(issue_labels == i)
       
            issue_labels = np.array([ issue_labels[event_labels[i]] for i in range(doc_vectors.shape[0]) ])
        
        else:
            '''
            Clustering using specific value
            '''
            # Clustering event
            db_event = DBSCAN(eps=doc_eps, min_samples=doc_neighbors).fit(doc_vectors)
            # Number of clusters in labels, ignoring noise if present.
            n_events_ = len(set(db_event.labels_)) - (1 if -1 in db_event.labels_ else 0)
            n_noise_ = list(db_event.labels_).count(-1)
            print(n_events_, n_noise_)
            # Represent each event by average sum of related news
            event_labels = np.array(list(map(lambda x: n_events_ if x==-1 else x, db_event.labels_)))
            event_vectors = np.zeros((n_events_, doc_vectors.shape[1]))
            for i in range(n_events_+1):
                if np.sum(event_labels == i) != 0:
                    event_vectors[i] = np.sum(doc_vectors[event_labels == i], axis=0)/np.sum(event_labels == i)

            # Clustering issue
            db_issue = DBSCAN(eps=event_eps, min_samples=event_neighbors).fit(event_vectors)
            # Number of clusters in labels, ignoring noise if present.
            n_issues_ = len(set(db_issue.labels_)) - (1 if -1 in db_issue.labels_ else 0)
            n_noise_ = list(db_issue.labels_).count(-1)
            print(n_issues_, n_noise_)
            # Represent each issue by average sum of related news
            issue_labels = np.array(list(map(lambda x: n_issues_ if x==-1 else x, db_issue.labels_)))
            issue_vectors = np.zeros((n_issues_, doc_vectors.shape[1]))
            for i in range(n_issues_+1):
                if np.sum(issue_labels == i) != 0:
                    issue_vectors[i] = np.sum(event_vectors[issue_labels == i], axis=0)/np.sum(issue_labels == i)
        
            issue_labels = np.array([ issue_labels[event_labels[i]] for i in range(doc_vectors.shape[0]) ])
        
        return n_issues_, n_events_, issue_labels, event_labels
    
    elif clustering_method=='agglomerative':
        # Hyperparameters
        n_events = 10000
        n_issues = 6000
        
        # Clustering event
        agg_event = AgglomerativeClustering(distance_threshold=0, n_clusters=n_events).fit(doc_vectors)
        # Represent each event by average sum of related news
        event_vectors = np.zeros((n_events, doc_vectors.shape[1]))
        for i in range(n_events):
            event_vectors[i] = sum(doc_vectors[agg_event.labels_ == i])
        
        plt.title("Hierarchical Clustering Dendrogram")
        # plot the top three levels of the dendrogram
        plot_dendrogram(agg_event, truncate_mode="level", p=3)
        plt.xlabel("Number of points in node (or index of point if no parenthesis).")
        plt.show()
        
        # Clustering issue
        agg_issue = AgglomerativeClustering(distance_threshold=0, n_clusters=n_issues).fit(event_vectors)
        # Represent each issue by average sum of related news
        issue_vectors = np.zeros((n_issues, doc_vectors.shape[1]))
        for i in range(n_issues):
            issue_vectors[i] = sum(event_vectors[agg_issue.labels_ == i])

        issue_labels = np.array([ agg_issue.labels_[agg_event.labels_[i]] for i in range(doc_vectors.shape[0]) ])
        
        return agg_issue, agg_event, issue_labels, agg_event.labels_
    
    elif clustering_method=='LDA':
        
        pass
    
    else:
        assert("Doesn't support {}".format(clustering_method))   

In [7]:
def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

### Import the embedded documents

In [8]:
vects_2015 = joblib.load('data/tfidf_titlebody_2015.csv')
vects_2016 = joblib.load('data/tfidf_titlebody_2016.csv')
vects_2017 = joblib.load('data/tfidf_titlebody_2017.csv')

### Cluster

In [9]:
num_issue_2015, num_event_2015, issue_labels_2015, event_labels_2015 = document_clustering(vects_2015,
                                                                                           clustering_method='DBSCAN',
                                                                                           evaluate=False)

6919 0
4092 0


In [10]:
num_issue_2016, num_event_2016, issue_labels_2016, event_labels_2016 = document_clustering(vects_2016,
                                                                                           clustering_method='DBSCAN',
                                                                                           evaluate=False)

7335 0
4729 0


In [11]:
num_issue_2017, num_event_2017, issue_labels_2017, event_labels_2017 = document_clustering(vects_2017,
                                                                                           clustering_method='DBSCAN',
                                                                                           evaluate=False)

9000 0
5630 0


In [12]:
# Add label to each doc (which group it belongs to)
df2015['label'] = event_labels_2015
df2015.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,title,author,time,description,body,section,summarized_body,label
16613,"S. Korea, China establish military hotline",김영원,2015-12-31 16:52:00,South Korea and China set up a hotline between...,South Korea and China set up a hotline between...,Defense,Defense Minister Han Min-koo had his first tel...,0
16614,FM Yun defends 'comfort women' deal,Kim Young-won,2015-12-31 16:48:00,Foreign Minister Yun Byung-se stepped up effor...,Foreign Minister Yun Byung-se stepped up effor...,,Foreign Minister Yun Byung-se steps up efforts...,1
16615,Presidential office refutes rumors on Seoul-To...,KH디지털2,2015-12-31 15:52:00,"Cheong Wa Dae, South Korea's presidential offi...","Cheong Wa Dae, South Korea's presidential offi...",Politics,South Korea's presidential office refuted rumo...,2
16616,Assembly ends with little achieved,이주희,2015-12-31 15:33:00,The National Assembly held its last general as...,The National Assembly held its last general as...,Social affairs,Parliamentary Speaker Chung Ui-hwa said that h...,3
16617,Non-emergency patients to face ER fee hikes,이주희,2015-12-31 15:31:00,Non-emergency patients will face much higher m...,Non-emergency patients will face much higher m...,Social affairs,Non-emergency patients will face much higher m...,4


In [13]:
df2016['label'] = event_labels_2016
df2016.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,title,author,time,description,body,section,summarized_body,label
9128,Chinese airlines withdraw plans for chartered ...,조정은,2016-12-31 16:36:00,Chinese airlines have withdrawn plans to run c...,Chinese airlines have withdrawn plans to run c...,Diplomatic Circuit,Two Chinese air carriers rolled back requests ...,0
9129,Ex-health minister grilled over scandal involv...,조정은,2016-12-31 16:14:00,South Korea's special prosecutors on Saturday ...,South Korea's special prosecutors on Saturday ...,Social affairs,"Moon Hyung-pyo, now chief of the National Pens...",1
9130,Highly pathogenic strain of bird flu found in ...,조정은,2016-12-31 12:09:00,A highly pathogenic strain of bird flu was dis...,A highly pathogenic strain of bird flu was dis...,Social affairs,The H5N6 strain of avian influenza was found i...,2
9131,S. Koreans demand president's removal on New Y...,조정은,2016-12-31 11:40:00,"Even on New Year's Eve, large crowds of South ...","Even on New Year's Eve, large crowds of South ...",Social affairs,Hundreds of thousands were expected to partici...,3
9132,S. Korea's pension fund chief formally arreste...,조정은,2016-12-31 11:20:00,Special prosecutors on Saturday formally arres...,Special prosecutors on Saturday formally arres...,Politics,"Moon Hyung-pyo, chief of the National Pension ...",4


In [14]:
df2017['label'] = event_labels_2017
df2017.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,title,author,time,description,body,section,summarized_body,label
0,A snapshot of multiculturalism in South Korea,Lee Sun-young,2018-01-01 17:07:00,With birthrates persistently low and the senio...,With birthrates persistently low and the senio...,Social affairs,"As of 2016, more than 2 million foreign nation...",0
1,[Weekender] Korea’s dynamic 2017,Choi He-suk,2018-01-01 13:22:00,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs,The corruption scandal that broke out in 2016 ...,1
2,People's Party members support Ahn's push for ...,Yonhap,2017-12-31 16:18:00,The leader of the center-left People's Party g...,The leader of the center-left People's Party g...,Politics,"Ahn Cheol-soo, leader of the center-left Peopl...",2
3,[Newsmaker] Panamanian vessel probed over susp...,Yonhap,2017-12-31 14:55:00,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea,"The 5,100-ton KOTI is being held in the wester...",3
4,Hong Kong ship crew questioned in S. Korea for...,AFP,2017-12-30 15:44:00,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea,The crew of a Hong Kong-registered ship have b...,4


In [15]:
# get top n most frequent labels
n = 20
label_2015 = df2015['label'].value_counts()[:n].index.tolist()
label_2016 = df2016['label'].value_counts()[:n].index.tolist()
label_2017 = df2017['label'].value_counts()[:n].index.tolist()
print(label_2015)
print(label_2016)
print(label_2017)

[2957, 132, 6018, 5457, 5039, 3617, 5864, 1224, 1840, 5854, 6487, 2169, 5368, 2867, 2072, 2514, 713, 4335, 3506, 6596]
[16, 518, 5095, 705, 5782, 7166, 6234, 431, 4376, 7255, 5203, 5446, 7263, 6913, 2202, 6789, 6897, 4882, 4922, 5693]
[67, 617, 4735, 3708, 6654, 5, 4112, 911, 813, 5035, 6416, 3142, 6911, 3052, 3810, 8536, 5872, 2167, 5893, 7940]


In [18]:
def freq_label(df, labels):
    freqs = []
    for label in labels:
        freq = df.loc[df.label == label, 'label'].count()
        freqs.append(freq)
    return freqs
freq_label(df2015, label_2015)

[6, 5, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2]

In [25]:
# Concat the sentences in each group and make a summary
summary1 = []
for i in label_2015:
    s = ''
    for sentence in df2015[df2015['label'] == i]['summarized_body'].tolist():
        s += sentence
    summary1.append(s)
len(summary1)

20

In [26]:
summary2 = []
for i in label_2016:
    s = ''
    for sentence in df2016[df2016['label'] == i]['summarized_body'].tolist():
        s += sentence
    summary2.append(s)
len(summary2)

20

In [27]:
summary3 = []
for i in label_2017:
    s = ''
    for sentence in df2017[df2017['label'] == i]['summarized_body'].tolist():
        s += sentence
    summary3.append(s)
len(summary3)

20

In [22]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(device)
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

In [23]:
def summarize_text(s: str):
    inputs = tokenizer([s], max_length=1024, return_tensors='pt', truncation=True).to(device)
    summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=100, early_stopping=True)
    output = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    torch.cuda.empty_cache()
    return output[0]

In [28]:
for i in range(len(summary1)):
    summary1[i] = summarize_text(summary1[i])
summary1[:5]

['South Korea reported no additional cases of Middle East Respiratory Syndrome for the 14th straight day on Sunday. The number of people diagnosed with MERS in the country remained unchanged at 186 with the death toll also staying flat at 36. The disease has claimed over 530 lives globally, posting a fatality rate of over 36 percent.',
 'In October, some 36,900 babies were born, up 1.1 percent from the same month last year. The rebound follows newborn numbers falling 3.6 percent and 3.7 percent in August and September. South Korea has been trying to push up its birthrate to prevent a decline in the national workforce.',
 'Cho Hyun-ah, former vice president of Korean Air, sentenced to one year in prison. She caused a public uproar by forcing a cabin crew chief to disembark from a flight. The de facto heiress of the flag carrier was found to have ordered the taxiing plane to return to the gate.',
 'Seoul-Tokyo ties have plunged to lowest levels in recent years mainly due to the sex slave

In [29]:
for i in range(len(summary2)):
    summary2[i] = summarize_text(summary2[i])
summary2[:5]

["Radio Pyongyang, the North's state-run radio station, started broadcasting messages at 1:15 a.m. The content was the same as those transmitted in the early hours of Dec. 16. Broadcasts of mysterious numbers are considered a kind of book cipher. Spies could decode numbers to get orders by using a reference book.",
 'Some 200 chickens were found dead on Monday morning in Eumseong, South Chungcheong Province. A total of 15 poultry farms were confirmed with AI cases, while eight other farms are under examination. This is the first time that South Korea has reported an outbreak of the H5N6 strain of bird flu.',
 'Voter turnout in South Korea’s parliamentary election stood at 46.5 percent as of 3 p.m. The official voter turnout is higher than the previous general elections in 2012. It also marks the highest turnout in 12 years since 63.9 percent was recorded in the parliamentary elections in 1996.',
 'Walkway along Deoksu Palace in downtown Seoul will be restored 132 years after it was cut

In [30]:
for i in range(len(summary3)):
    summary3[i] = summarize_text(summary3[i])
summary3[:5]

['Temperatures across the country plummeted about 10 degrees Celsius below Tuesday’s. Cold wave advisories are in effect in most parts of Gangwon Province, Gyeonggi Province, North Chungcheong Province and northern parts of inland North Gyeongsang Province. Cold snap will peak Thursday, while the KMA expected temperature would rebound from Friday.',
 'A South Korean research team says it has uncovered a large skin impression on a sauropod dinosaur footprint. The find, measuring over 50 centimeters in diameter, is the largest-ever skin impression in a dinosaur footprint on record worldwide. South Korea confirmed to have more than 47,000 indigenous species of animals and plants for the first time in 20 years.',
 'South Korean scientists have developed an adhesive patch inspired by protuberances in the suction cups of octopuses. The patch can be attached and detached up to 1,000 times without the need for any adhesive material. A group of South Korean scientists has developed a convenient

In [33]:
# Export to csv
top20sum = pd.DataFrame(summary1, columns=[2015])
top20sum['label2015'] = label_2015
top20sum['freq2015'] = freq_label(df2015, label_2015)

top20sum[2016] = summary2
top20sum['label2016'] = label_2016
top20sum['freq2016'] = freq_label(df2016, label_2016)

top20sum[2017] = summary3
top20sum['label2017'] = label_2017
top20sum['freq2017'] = freq_label(df2017, label_2017)

top20sum.to_csv("data/top20sum.csv", index=False)
top20sum

Unnamed: 0,2015,label2015,freq2015,2016,label2016,freq2016,2017,label2017,freq2017
0,South Korea reported no additional cases of Mi...,2957,6,"Radio Pyongyang, the North's state-run radio s...",16,5,Temperatures across the country plummeted abou...,67,11
1,"In October, some 36,900 babies were born, up 1...",132,5,Some 200 chickens were found dead on Monday mo...,518,4,A South Korean research team says it has uncov...,617,7
2,"Cho Hyun-ah, former vice president of Korean A...",6018,4,Voter turnout in South Korea’s parliamentary e...,5095,4,South Korean scientists have developed an adhe...,4735,5
3,Seoul-Tokyo ties have plunged to lowest levels...,5457,4,Walkway along Deoksu Palace in downtown Seoul ...,705,3,"About 30,300 babies were born in May, down 11....",3708,5
4,South Korea's top financial regulator said Fri...,5039,4,Pigs at two swine farms in Nonsan in the centr...,5782,3,"The search for the missing South Korean ship, ...",6654,4
5,Activists from the Humane Society Internationa...,3617,3,Suh Ye-won is the director of the state-run Na...,7166,3,The H5N6-strain bird flu was detected on a far...,5,4
6,The worst winter seasonal yellow dust in five ...,5864,3,North Korea renewed its calls for peace treaty...,6234,3,"100 Seoul residents, art connoisseurs and city...",4112,3
7,101 Audi owners filed the suit with a Seoul d...,1224,3,Some 1.7 million people gathered in central Se...,431,3,Moon Jae-in will ask a parliamentary committee...,911,3
8,The body of a 47-year-old man was found 7 kilo...,1840,3,South Korea on Thursday released a set of meas...,4376,2,South Korean mixed martial arts fighter Bang T...,813,3
9,The 8.5 trillion won ($8.3 billion) project ca...,5854,3,The ejection test of a KN-11 missile from a su...,7255,2,A 33-year-old Korean woman was confirmed to ha...,5035,3


In [34]:
# Original data with label
df2015.to_csv("data/df2015.csv", index=False)
df2016.to_csv("data/df2016.csv", index=False)
df2017.to_csv("data/df2017.csv", index=False)