In [1]:
import pandas as pd
import numpy as np
import os
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score

import matplotlib.pyplot as plt

import spacy
from spacy.matcher import Matcher

import torch
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
from transformers import pipeline

from time import time

In [2]:
# Load the Korea Herald JSON shards (koreaherald_1517_0.json, _1.json, ...)
# from data/ and stack them into a single DataFrame named `df`.
data_dir = 'data/'
filename_prefix = 'koreaherald_1517_'
n_files = 8  # shards are numbered 0 .. n_files - 1

# Keep the per-file frames under their own name so that `df` is only ever a
# DataFrame; this also makes the cell safe to re-run.
frames = [
    pd.read_json(os.path.join(data_dir, filename_prefix + str(i) + '.json'))
    for i in range(n_files)
]

# ignore_index=True builds a fresh 0..N-1 row index directly, so no helper
# 'index' column is created and nothing has to be dropped afterwards.
df = pd.concat(frames, ignore_index=True)

# Remove leading/trailing whitespace from every column name.
df = df.rename(columns=str.strip)
df.shape

(23769, 6)

In [3]:
# Run on the first CUDA device when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# The tokenizer and the model are both loaded from the same pretrained checkpoint.
bart_checkpoint = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(bart_checkpoint)
model = BartForConditionalGeneration.from_pretrained(bart_checkpoint)
model = model.to(device)

In [4]:
def summarize_text(s: str, length=10):
    """Summarize ``s`` with the module-level BART ``model`` and ``tokenizer``.

    Args:
        s: Text to summarize. The tokenizer truncates it to at most 1024
            tokens before it is passed to the model.
        length: Forwarded to ``model.generate`` as ``max_length``, i.e. the
            upper bound on the generated sequence length in tokens.

    Returns:
        The decoded summary with special tokens removed. An empty string is
        returned when ``s`` is not a string (for example ``None`` or a NaN
        coming from a missing DataFrame value) or contains only whitespace.
    """
    # Nothing to summarize: skip tokenization and generation entirely instead
    # of feeding a non-string or blank input to the tokenizer/model.
    if not isinstance(s, str) or not s.strip():
        return ''

    inputs = tokenizer([s], max_length=1024, return_tensors='pt', truncation=True).to(device)
    summary_ids = model.generate(
        inputs['input_ids'],
        # Pass the mask explicitly rather than letting generate() infer it.
        attention_mask=inputs['attention_mask'],
        num_beams=4,
        max_length=length,
        early_stopping=True,
    )
    # A single input sequence was submitted, so only the first generated
    # sequence is decoded and returned.
    summary = tokenizer.decode(
        summary_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    # Hand cached GPU memory back after every call.
    torch.cuda.empty_cache()
    return summary

# df['summarized_body'] = df['body'][:5].apply(lambda x: summarize_text(x))

In [23]:
# [(67, 11),
#  (617, 7),
#  (3706, 5),
#  (4733, 5),
#  (5, 4),
#  (6652, 4),
#  (813, 3),
#  (911, 3),
#  (4110, 3),
#  (5033, 3),
#  (6414, 3),
#  (44, 2),
#  (113, 2),
#  (114, 2),
#  (190, 2),
#  (313, 2),
#  (334, 2),
#  (591, 2),
#  (606, 2),
#  (849, 2)]

from collections import Counter
# Row labels of the seven sample articles whose titles are combined below.
sample_ids = [630, 4376, 5734, 6953, 7630, 7954, 8286]
sample_titles = df['title'][sample_ids]
# Join the selected titles into one space-separated string.
st = ' '.join(sample_titles)

In [25]:
# Summarize the title of the article labelled 630 on its own, with the
# summary length limit set to 10.
title_630 = df['title'][630]
summarized_630 = summarize_text(title_630, length=10)
summarized_630

'Largest-ever skin impression'

In [26]:
# Summarize the concatenated sample titles held in `st`, with the summary
# length limit raised to 100.
summarized_sample = summarize_text(st, length=100)

In [27]:
# Display the summary produced from the combined sample titles.
summarized_sample

"Largest-ever skin impression on dinosaur footprint found in S. Korea. Korea confirmed to have 47,000 indigenous species of animals, plants. Seoul Zoo eager to restore Korean leopards. South Korea's Sewolâ€™s final trip Endangered brown long-eared bat found."

In [32]:
# Allow up to 2000 characters per cell when pandas renders a frame, so long
# text values are not cut off in the output.
# The fully qualified option name is used on purpose: abbreviated keys are
# resolved by pattern matching and can become ambiguous if pandas adds options.
pd.set_option('display.max_colwidth', 2000)

In [36]:
# Build a bag-of-words count matrix from the bodies of the seven sample articles.
sample_ids = [630, 4376, 5734, 6953, 7630, 7954, 8286]
sample = df['body'][sample_ids]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sample)

# Convert the result to a dense matrix so the counts are shown in the output.
X.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [1, 0, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 6, 1, 2],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [2, 1, 0, ..., 0, 0, 0]])

In [None]:
# # Load
# df['summarized_body'] = pd.read_csv('summarized_data.csv')
# chkpt = 380
# total_chkpt = 1000
# # Transform
# chnk = int(len(df['body'])/total_chkpt)


# for i in range(chkpt,total_chkpt):
#     if i != total_chkpt-1:
#         df['summarized_body'][i*chnk:(i+1)*chnk] = df['body'][i*chnk:(i+1)*chnk].apply(lambda x: summarize_text(x))
        
#     else:
#         df['summarized_body'][i*chnk:] = df['body'][i*chnk:].apply(lambda x: summarize_text(x))
#     df['summarized_body'].to_csv('summarized_data.csv', index=False)
#     print("Saved checkpoint {}".format(i))
    