In [1]:
import pandas as pd
import numpy as np
import os
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score

import matplotlib.pyplot as plt

import spacy
from spacy.matcher import Matcher

import torch
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
from transformers import pipeline

from time import time

In [2]:
# Read the eight JSON shards named <filename_prefix>0.json ... <filename_prefix>7.json
# from data_dir and stack them into one DataFrame.
data_dir = 'data/'
filename_prefix = 'koreaherald_1517_'
shard_paths = [os.path.join(data_dir, filename_prefix + str(i) + '.json') for i in range(8)]
# ignore_index=True renumbers the rows 0..n-1 during the concat, so there is no
# need to reset_index() in place and then drop the helper 'index' column.
df = pd.concat([pd.read_json(path) for path in shard_paths], ignore_index=True)
# Remove leading/trailing whitespace from every column name.
df.columns = df.columns.str.strip()
df.shape

(23769, 6)

In [3]:
# Run on the first CUDA device when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Load the pretrained tokenizer and the BART summarization model, then move
# the model onto the selected device.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
model = model.to(device)

In [5]:
def summarize_text(s: str) -> str:
    """Summarize one piece of text with the notebook-level BART model.

    The text is tokenized (truncated to at most 1024 tokens), a summary of at
    most 100 tokens is generated with 4-beam search, and the result is decoded
    back into a string. Relies on the notebook globals `tokenizer`, `model`
    and `device`.

    Args:
        s: The text to summarize.

    Returns:
        The decoded summary. An empty string is returned when `s` is not a
        string (for example a missing value) or contains only whitespace.
    """
    # A missing or blank body has nothing to summarize. Returning early keeps a
    # single bad row from aborting a long `Series.apply` run.
    if not isinstance(s, str) or not s.strip():
        return ""

    inputs = tokenizer([s], max_length=1024, return_tensors='pt', truncation=True).to(device)
    summary_ids = model.generate(
        inputs['input_ids'],
        # Pass the tokenizer's mask explicitly instead of letting generate() infer it.
        attention_mask=inputs['attention_mask'],
        num_beams=4,
        max_length=100,
        early_stopping=True,
    )
    # Exactly one text was submitted, so only the first generated sequence is decoded.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    # Release cached GPU blocks between calls.
    torch.cuda.empty_cache()
    return summary

# Trial run: summarize only the first five article bodies and store the result
# in a new 'summarized_body' column.
df['summarized_body'] = df['body'].iloc[:5].apply(summarize_text)

In [None]:
# Summarize every article body in chunks, writing the whole 'summarized_body'
# column to disk after each chunk so an interrupted run can be resumed.
checkpoint_path = 'summarized_data.csv'
chkpt = 0            # first chunk index to consider
total_chkpt = 1000   # number of chunks the frame is split into
# Transform
chnk = len(df) // total_chkpt  # rows per chunk; the last chunk also takes the remainder

# Load
if os.path.exists(checkpoint_path):
    # skip_blank_lines=False guarantees an empty row is never dropped, which
    # would shift every later summary onto the wrong article.
    saved_summaries = pd.read_csv(checkpoint_path, skip_blank_lines=False).iloc[:, 0]
    if len(saved_summaries) != len(df):
        raise ValueError(
            "{} has {} rows but df has {}; refusing to resume from a mismatched checkpoint".format(
                checkpoint_path, len(saved_summaries), len(df)))
    df['summarized_body'] = saved_summaries.to_numpy()
elif 'summarized_body' not in df.columns:
    # Fresh run with no earlier results: start from an all-missing column.
    df['summarized_body'] = None
# Keep the column as object dtype so strings can be written into rows that are
# currently missing (an all-NaN column read from CSV would otherwise be float).
df['summarized_body'] = df['summarized_body'].astype(object)

summary_col = df.columns.get_loc('summarized_body')
for i in range(chkpt, total_chkpt):
    start = i * chnk
    stop = len(df) if i == total_chkpt - 1 else start + chnk
    # Only rows without a summary still need work; chunks that are already
    # complete are skipped, which is what makes a rerun resume.
    missing = df['summarized_body'].iloc[start:stop].isna().to_numpy()
    if not missing.any():
        continue
    rows = start + np.flatnonzero(missing)
    # Write through df.iloc in a single indexing step. The chained form
    # df[col][a:b] = ... may assign into a temporary copy instead of df.
    df.iloc[rows, summary_col] = df['body'].iloc[rows].apply(summarize_text).to_numpy()
    # Persist first, then report, so the message is only printed once the file exists.
    df['summarized_body'].to_csv(checkpoint_path, index=False)
    print("Saved checkpoint {}".format(i))
    

Saved checkpoint 0
Saved checkpoint 1
Saved checkpoint 2
Saved checkpoint 3
Saved checkpoint 4
Saved checkpoint 5
Saved checkpoint 6
Saved checkpoint 7
Saved checkpoint 8
Saved checkpoint 9
Saved checkpoint 10
Saved checkpoint 11
Saved checkpoint 12
Saved checkpoint 13
Saved checkpoint 14


In [None]:
# Write the full frame to CSV, leaving out the row index.
df.to_csv(path_or_buf='preprocessed_data.csv', index=False)