In [4]:
import re
import pandas as pd
from newspaper import Article
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import nltk
from nltk.corpus import stopwords
from collections import Counter
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

In [None]:
# Read the saved article history; the file uses ";" as its field separator.
df = pd.read_csv("../data/history.csv", sep=";")
df.head()

Unnamed: 0,url,title,content,content language,summary,summary language
0,https://edition.cnn.com/world/live-news/israel...,Live updates: Israel carries out attack in Qat...,A man looking at smoke billowing after explosi...,English,Israel has launched a military strike on Qatar...,English
1,https://www.cnnindonesia.com/internasional/202...,"Kerusuhan di Nepal, KBRI Dhaka Pastikan WNI Aman",--\n\nAksi protes yang dilakukan oleh Generasi...,Indonesia,KBRI Dhaka mengeluarkan imbauan kepada seluruh...,Indonesia
2,https://www.cnnindonesia.com/internasional/202...,"Presiden Mundur Susul PM, Nepal Chaos Terancam...",--\n\nPresiden Nepal Ram Chandra Poudel mengun...,Indonesia,Presiden Nepal Ram Chandra Poudel mengundurkan...,Indonesia
3,https://edition.cnn.com/2025/09/09/europe/russ...,Russian aerial bomb kills at least 24 civilian...,Russia War in Ukraine See all topics Follow\n\...,English,This is a full transcript of the killings of a...,English
4,https://edition.cnn.com/2025/09/09/europe/fran...,France’s government has collapsed again. How d...,Paris —\n\nFrance’s prime minister has quit af...,English,Why do French governments keep collapsing? The...,English


### Keywords

In [51]:
# Collect the non-null article bodies from the "content" column
texts = df.loc[df["content"].notna(), "content"].tolist()

In [52]:
# Fetch the NLTK stopword corpus, then merge the Indonesian and English
# lists into one set so membership checks are cheap.
nltk.download("stopwords")
stopwords_id = stopwords.words("indonesian")
stopwords_en = stopwords.words("english")
stopwords_all = set(stopwords_id).union(stopwords_en)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Arvio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
def clean_text(text, stopword_set=None):
    """Normalize an article into a space-separated string of content words.

    The text is lowercased, every run of characters outside the Latin letter
    ranges matched by the pattern below is replaced by a single space, and
    tokens that are stopwords or shorter than three characters are dropped.

    Args:
        text: Raw article text (a ``str``).
        stopword_set: Optional collection of lowercase words to remove. When
            omitted, the notebook-level ``stopwords_all`` set is used, which
            keeps existing ``clean_text(text)`` calls working unchanged.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty string).
    """
    if stopword_set is None:
        # Looked up at call time so the function sees the current global set.
        stopword_set = stopwords_all

    text = text.lower()
    text = re.sub(r"[^a-zA-Z\u00C0-\u024F\u1E00-\u1EFF]+", " ", text)  # strip symbols
    tokens = [t for t in text.split() if len(t) > 2 and t not in stopword_set]
    return " ".join(tokens)

# Apply the cleaning step to every article body
texts_clean = list(map(clean_text, texts))

In [73]:
# Count token frequencies over all cleaned articles and list the most common.
# The header previously said "20" while only 10 entries were requested; it is
# now derived from the list that is actually printed.
top_n_words = 10
words = " ".join(texts_clean).split()
word_freq = Counter(words).most_common(top_n_words)

print(f"{len(word_freq)} kata paling sering muncul:")
for w, f in word_freq:
    print(w, f)

20 kata paling sering muncul:
apple 41
iphone 41
tariffs 30
new 26
court 26
trump 21
menteri 19
qatar 18
said 18
president 18


### Topic

In [None]:
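# One-time setup (kept commented out): download all-mpnet-base-v2 from the hub
# and save it locally so that later cells can load it from disk.
# embedding_model = SentenceTransformer("all-mpnet-base-v2")

# save_path = "../models/embedding/all-mpnet-base-v2"
# embedding_model.save(save_path)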

In [None]:
# Load the locally saved sentence-embedding model.
# NOTE(review): all-mpnet-base-v2 is trained on English data, but this corpus
# mixes English and Indonesian articles. Consider a multilingual checkpoint
# (e.g. paraphrase-multilingual-mpnet-base-v2) if Indonesian topics look off.
embedding_model = SentenceTransformer("../models/embedding/all-mpnet-base-v2")

# UMAP is stochastic: pinning random_state keeps the reduced embeddings, and
# therefore the resulting topics, stable across reruns of the notebook.
umap_model = UMAP(
    n_neighbors=15,
    n_components=10,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)
hdbscan_model = HDBSCAN(min_cluster_size=2, metric="euclidean", cluster_selection_method="eom")

# Fit BERTopic on the cleaned articles
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    nr_topics="auto"
)

topics, probs = topic_model.fit_transform(texts_clean)

In [None]:
def topics_user(topic_model, n_words=6, n_examples=1):
    """Print a human-readable overview of every non-outlier topic.

    For each topic the function prints its id and document count, up to
    ``n_words`` keywords, and up to ``n_examples`` representative documents
    (each cut to 120 characters). A closing separator line is printed last.

    Args:
        topic_model: Fitted topic model exposing ``get_topic_info()``,
            ``get_topic(topic_id)`` and ``get_representative_docs(topic_id)``.
        n_words: Maximum number of keywords shown per topic.
        n_examples: Maximum number of example documents shown per topic.

    Returns:
        None. Everything is written to stdout.
    """
    separator = "=" * 60
    topic_info = topic_model.get_topic_info()

    for topic_id, count in zip(topic_info["Topic"], topic_info["Count"]):
        # Normalise to a plain int so the per-topic lookups below always
        # receive the same type regardless of how the table stores ids.
        topic_id = int(topic_id)

        # Skip the outlier bucket (-1) to focus on real topics
        if topic_id == -1:
            continue

        # Either lookup may come back empty (None/False) for a topic; fall
        # back to an empty list instead of failing on the slice.
        topic_terms = topic_model.get_topic(topic_id) or []
        keywords = [w for w, _ in topic_terms[:n_words]]
        examples = (topic_model.get_representative_docs(topic_id) or [])[:n_examples]

        print(separator)
        print(f"📌 Topik {topic_id} (Jumlah dokumen: {count})")
        print(f"🔑 Kata kunci: {', '.join(keywords)}")
        print("📝 Contoh kalimat:")
        for ex in examples:
            print(f"   - {ex[:120]}...")  # truncated to keep the line short
    print(separator)

# Print the topic overview for the user
topics_user(topic_model, n_words=6, n_examples=2)

📌 Topik 0 (Jumlah dokumen: 6)
🔑 Kata kunci: apple, iphone, tariffs, new, court, trump
📝 Contoh kalimat:
   - paris france prime minister quit losing confidence vote toppled government plunging country new political crisis françoi...
   - tech giants corporate news tech news see topics follow new york apple announced first major redesign iphone years tuesda...
📌 Topik 1 (Jumlah dokumen: 6)
🔑 Kata kunci: menteri, nepal, prabowo, sri, mulyani, keuangan
📝 Contoh kalimat:
   - menteri keuangan menkeu purbaya yudhi sadewa memuji kinerja ekonomi pemerintahan presiden prabowo subianto mengucap syuk...
   - media asing reuters mengungkap detik detik presiden prabowo subianto memutuskan mencopot sri mulyani posisinya menteri k...


### Download and save the summarizer model

In [4]:
# Hub identifier of the summarization checkpoint and the local export directory
model_name = "csebuetnlp/mT5_multilingual_XLSum"
save_path_summarizer = "../models/summarizer/csebuetnlp/mT5_multilingual_XLSum"

# Fetch the tokenizer and the seq2seq weights for that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Write both artifacts into the export directory
tokenizer.save_pretrained(save_path_summarizer)
model.save_pretrained(save_path_summarizer)

print(f"Model saved to {save_path_summarizer}")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Model saved to ../models/summarizer/csebuetnlp/mT5_multilingual_XLSum


### Download and save the translator models

In [7]:
# Local export directories, one per translation direction
save_path_translator_en_id = "../models/translator/Helsinki-NLP/opus-mt-en-id"
save_path_translator_id_en = "../models/translator/Helsinki-NLP/opus-mt-id-en"

# English -> Indonesian: build the pipeline, then save its model and tokenizer
engtoid = pipeline("translation", model="Helsinki-NLP/opus-mt-en-id")
engtoid.model.save_pretrained(save_path_translator_en_id)
engtoid.tokenizer.save_pretrained(save_path_translator_en_id)

# Indonesian -> English: same steps for the reverse direction
idtoeng = pipeline("translation", model="Helsinki-NLP/opus-mt-id-en")
idtoeng.model.save_pretrained(save_path_translator_id_en)
idtoeng.tokenizer.save_pretrained(save_path_translator_id_en)

Device set to use cpu
Device set to use cpu


('../models/translator/Helsinki-NLP/opus-mt-id-en\\tokenizer_config.json',
 '../models/translator/Helsinki-NLP/opus-mt-id-en\\special_tokens_map.json',
 '../models/translator/Helsinki-NLP/opus-mt-id-en\\vocab.json',
 '../models/translator/Helsinki-NLP/opus-mt-id-en\\source.spm',
 '../models/translator/Helsinki-NLP/opus-mt-id-en\\target.spm',
 '../models/translator/Helsinki-NLP/opus-mt-id-en\\added_tokens.json')

In [63]:
def scrape_article(url):
    """Download and parse the article at ``url``.

    Args:
        url: Address of the article page.

    Returns:
        A ``(title, text)`` tuple taken from the parsed article.
    """
    page = Article(url)
    page.download()
    page.parse()
    return page.title, page.text

In [64]:
# Sample article used to try out the scraper
url = "https://www.cnnindonesia.com/nasional/20250904130746-12-1270094/kejagung-tetapkan-nadiem-makarim-tersangka-kasus-pengadaan-laptop"

title, content = scrape_article(url)
print(f"Title: {title}")
print(f"Content: {content}")

Title: Kejagung Tetapkan Nadiem Makarim Tersangka Kasus Pengadaan Laptop
Content: --

Eks Mendikbudristek Nadiem Makarim ditetapkan sebagai tersangka oleh Kejaksaan Agung (Kejagung) dalam kasus dugaan korupsi Program Digitalisasi Pendidikan di Kemendikbudristek periode 2019-2022.

"Dari hasil pendalaman dan alat bukti yang ada, pada sore ini telah menetapkan tersangka baru dengan inisial NAM (Nadiem Anwar Makarim)," ujar Kapuspen Kejagung Anang Supriatna dalam konferensi pers di kompleks Kejagung, Jakarta Selatan, Kamis (4/9).

Sebelumnya, pada Kamis pagi ini, Nadiem mendatangi Kejagung untuk diperiksa ketiga kalinya dalam kasus tersebut.

ADVERTISEMENT SCROLL TO CONTINUE WITH CONTENT

Pantauan CNNIndonesia.com, Nadiem datang bersama kuasa hukumnya Hotman Paris Hutapea. Eks bos Gojek itu membawa tas jinjing hitam ke dalam gedung Pidsus Kejagung dengan kemeja hijau.

Sebelum hari ini Nadiem telah dua kali diperiksa sebagai saksi oleh Kejagung yakni pada Senin (23/6) dan Selasa (15/7).



In [65]:
def WHITESPACE_HANDLER(k):
    """Trim ``k`` and collapse every run of whitespace (newlines included) into one space."""
    # Raw string: "\s" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions. A separate
    # newline substitution is unnecessary because \s already matches "\n".
    return re.sub(r"\s+", " ", k.strip())

In [67]:
# Load the summarization tokenizer and seq2seq model for this checkpoint id.
# NOTE(review): the same checkpoint is already loaded (and saved locally) in
# an earlier cell; this cell repeats that load and rebinds the same names.
model_name = "csebuetnlp/mT5_multilingual_XLSum"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [68]:
# Tokenize the whitespace-normalized article as a batch of one, padded and
# truncated to 512 tokens, and keep only the token ids.
encoded_article = tokenizer(
    [WHITESPACE_HANDLER(content)],
    max_length=512,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)
input_ids = encoded_article["input_ids"]

# Generate with 4 beams, max_length=84 and no repeated bigrams, then take the
# first (only) sequence of the batch.
generated = model.generate(
    input_ids=input_ids,
    num_beams=4,
    no_repeat_ngram_size=2,
    max_length=84,
)
output_ids = generated[0]

# Turn the generated ids back into text without special tokens
summary = tokenizer.decode(
    output_ids,
    clean_up_tokenization_spaces=False,
    skip_special_tokens=True,
)

In [69]:
# Show the generated summary
print(summary)

Eks Mendikbudristek Nadiem Makarim ditetapkan sebagai tersangka oleh Kejaksaan Agung (Kejagung) dalam kasus dugaan korupsi Program Digitalisasi Pendidikan di Kemendikbud periode 2019-2023.
