https://huggingface.co/docs/transformers/en/model_doc/encoder-decoder

# Dataset

In [2]:
import os
import json
import pprint
import re
import pickle
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from wordcloud import WordCloud
import json

In [6]:
def load_json_files_from_directory(directory):
    """Load every ``*.json`` file found directly inside ``directory``.

    Files are read in sorted filename order so that the returned list (and
    anything derived from it, such as DataFrame row order) is the same on
    every platform; ``os.listdir`` alone returns entries in arbitrary order.

    Parameters
    ----------
    directory : str
        Path of the folder to scan. Sub-folders are not traversed.

    Returns
    -------
    list
        One parsed JSON object per file, ordered by filename.

    Raises
    ------
    OSError
        If ``directory`` cannot be listed or a file cannot be opened.
    json.JSONDecodeError
        If a ``.json`` file does not contain valid JSON.
    """
    data = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(directory, filename)
        # A sub-directory whose name ends in ".json" would make open() fail.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            data.append(json.load(file))
    return data

In [8]:
# Root of the local Liputan6 copy and the sub-folder that holds the splits.
# NOTE(review): this is a machine-specific absolute path; adjust it when
# running the notebook elsewhere.
folder_data = "E:\\Dataset\\liputan6_data2"
dataset_type = "dataset"

# One directory per split; the validation split is stored under "dev".
train_dir, valid_dir, test_dir = (
    os.path.join(folder_data, dataset_type, split_folder)
    for split_folder in ("train", "dev", "test")
)

In [10]:
# Display the resolved training directory to confirm the path was built as expected.
train_dir

'E:\\Dataset\\liputan6_data2\\dataset\\train'

In [12]:
# Read every JSON record of each split into memory (train, then dev, then test).
train_data, valid_data, test_data = (
    load_json_files_from_directory(split_dir)
    for split_dir in (train_dir, valid_dir, test_dir)
)

In [13]:
# Show the first loaded training record to inspect the structure of one example.
train_data[0]

{'id': 100007,
 'url': 'https://www.liputan6.com/news/read/100007/terlibat-pembalakan-kayu--kapuskopad-trikora-dinonaktifkan',
 'clean_article': [['Liputan6',
   '.',
   'com',
   ',',
   'Jayapura',
   ':',
   'Kolonel',
   'Richard',
   'Ginting',
   'dinonaktifkan',
   'dari',
   'jabatan',
   'Kepala',
   'Pusat',
   'Koperasi',
   'TNI',
   'Angkatan',
   'Darat',
   'Komando',
   'Daerah',
   'Militer',
   'XVII',
   'Trikora',
   '.'],
  ['Mantan',
   'Kepala',
   'Staf',
   'Komando',
   'Resor',
   'Militer',
   'Sorong',
   ',',
   'Papua',
   ',',
   'ini',
   'diduga',
   'terlibat',
   'dalam',
   'kegiatan',
   'pembalakan',
   'kayu',
   '.'],
  ['Hal',
   'itu',
   'diungkapkan',
   'Panglima',
   'Daerah',
   'Militer',
   'XVII',
   'Trikora',
   'Mayor',
   'Jenderal',
   'TNI',
   'Nurdin',
   'Zainal',
   'usai',
   'serah',
   'terima',
   'jabatan',
   'Komandan',
   'Korem',
   '172',
   'Praja',
   'Wirayakti',
   'di',
   'Jayapura',
   ',',
   'Papua',
   ','

In [16]:
# Turn each split's list of record dicts into its own DataFrame
# (one row per record), in the order train, valid, test.
train_set, valid_set, test_set = (
    pd.json_normalize(split_records)
    for split_records in (train_data, valid_data, test_data)
)

In [18]:
def normalize_text(tokenized_text):
    """Join a tokenized document back into a single plain-text string.

    Parameters
    ----------
    tokenized_text : iterable of iterables of str
        Sentences, each given as a sequence of tokens.

    Returns
    -------
    str
        All tokens of all sentences joined with single spaces, except that

        * closing marks (``. , : ; ! ? ) ]``) are attached to the preceding
          token and followed by a space, and
        * opening brackets (``(`` and ``[``) are attached to the following
          token.

        Leading and trailing whitespace is removed from the result.

    Notes
    -----
    A space is inserted after *every* closing mark, so token runs such as
    ``['Liputan6', '.', 'com']`` come out as ``'Liputan6. com'``.
    """
    attach_left = (".", ",", ":", ";", "!", "?", ")", "]")
    attach_right = ("(", "[")

    joined = ""
    for token in (tok for sent in tokenized_text for tok in sent):
        if token in attach_left:
            # Remove the space left by the previous token before gluing the mark on.
            joined = joined.rstrip() + token + " "
            continue
        if token in attach_right:
            # No trailing space: the next token must sit directly after the bracket.
            joined += token
            continue
        joined += token + " "
    return joined.strip()


# Add detokenized plain-text versions of the article and summary columns
# to the training frame (article first, then summary).
for tokenized_col, combined_col in (
    ('clean_article', 'combined_clean_article'),
    ('clean_summary', 'combined_clean_summary'),
):
    train_set[combined_col] = train_set[tokenized_col].apply(normalize_text)

In [20]:
# Preview the first rows of the training frame, including the combined_* text columns.
train_set.head()

Unnamed: 0,id,url,clean_article,clean_summary,extractive_summary,combined_clean_article,combined_clean_summary
0,100007,https://www.liputan6.com/news/read/100007/terl...,"[[Liputan6, ., com, ,, Jayapura, :, Kolonel, R...","[[Kolonel, Richard, Ginting, dinyatakan, terli...","[4, 5]","Liputan6. com, Jayapura: Kolonel Richard Ginti...",Kolonel Richard Ginting dinyatakan terlibat bi...
1,100013,https://www.liputan6.com/news/read/100013/penc...,"[[Liputan6, ., com, ,, Jakarta, :, Acong, dita...","[[Acong, dipergoki, rekan, sesama, satpam, mem...","[2, 4]","Liputan6. com, Jakarta: Acong ditangkap polisi...",Acong dipergoki rekan sesama satpam membawa ka...
2,100023,https://www.liputan6.com/news/read/100023/sby-...,"[[Liputan6, ., com, ,, Jakarta, :, Presiden, S...","[[Jamuan, makan, malam, bagi, peserta, KAA, di...","[0, 2]","Liputan6. com, Jakarta: Presiden Susilo Bamban...",Jamuan makan malam bagi peserta KAA diadakan d...
3,100071,https://www.liputan6.com/news/read/100071/peng...,"[[Liputan6, ., com, ,, Jakarta, :, Lebih, dari...","[[Mereka, berdemonstrasi, di, saat, para, dele...","[1, 8]","Liputan6. com, Jakarta: Lebih dari seratus pen...",Mereka berdemonstrasi di saat para delegasi me...
4,100077,https://www.liputan6.com/news/read/100077/mant...,"[[Liputan6, ., com, ,, Tangerang, :, Mantan, D...","[[TMM, ,, mantan, Deputi, Kepala, Badan, Penye...","[0, 1]","Liputan6. com, Tangerang: Mantan Deputi Kepala...","TMM, mantan Deputi Kepala Badan Penyehatan Per..."


In [22]:
# Spot-check one article/summary pair after detokenization. The fixed
# random_state keeps the displayed row identical across "Restart & Run All";
# change the seed to inspect a different example.
train_set.sample(1, random_state=42)[["combined_clean_article", "combined_clean_summary"]].values

array([['Liputan6. com, Jakarta: Pemerintah berencana menaikkan harga sejumlah komoditi dasar pada 2003. Komoditi yang akan dinaikkan harganya adalah bahan bakar minyak, listrik, telepon, tarif tol, dan pajak kendaraan bermotor. Kenaikan tarif komponen ekonomi itu berkisar antara 15 persen hingga 80 persen. Komoditi dasar ini menyumbangkan 30 persen pada laju kenaikan inflasi. Dengan demikian, tahun depan daya beli masyarakat akan turun cukup signifikan. Namun, menurut Direktur Badan Pusat Statistik (BPS) Ali Rosidi di Jakarta, pekan terakhir November ini, secara keseluruhan kenaikan inflasi langsung tidak terlalu signifikan. Ali mencontohkan, tarif listrik yang dinaikkan sebesar enam persen hanya akan meningkatkan inflasi sebesar 0, 14 persen [baca: Pengusaha Mengeluhkan Rencana Kenaikan TDL]. Selain itu, dampak kenaikan tarif untuk setiap daerah tergantung pada bobot komoditas tersebut. Kendati demikian, permasalahan yang lebih sulit justru menghitung dampak tidak langsung dari kenai

In [24]:
# Apply the same detokenization to the validation and test frames
# (validation first; within each frame article before summary).
for split_frame in (valid_set, test_set):
    for tokenized_col, combined_col in (
        ('clean_article', 'combined_clean_article'),
        ('clean_summary', 'combined_clean_summary'),
    ):
        split_frame[combined_col] = split_frame[tokenized_col].apply(normalize_text)

In [26]:
# Compare the (rows, columns) shape of the three splits.
train_set.shape, valid_set.shape, test_set.shape

((10000, 7), (2000, 7), (2000, 7))

In [28]:
# Output folder for the flattened splits. The trailing separator is required
# because file names are appended by plain string concatenation below.
# NOTE(review): machine-specific absolute path; this also rebinds `folder_data`.
folder_data = "E:\\Dataset\\liputan6_data2\\"

# Write each split to its own CSV file without the DataFrame index column.
for split_frame, csv_name in (
    (train_set, 'train_set.csv'),
    (valid_set, 'valid_set.csv'),
    (test_set, 'test_set.csv'),
):
    split_frame.to_csv(folder_data + csv_name, index=False)