In [3]:
pip install datasets



In [4]:
pip install rouge_score



In [5]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.model_selection import train_test_split
from rouge_score import rouge_scorer
import matplotlib.pyplot as plt

# 1. Load Dataset
ds = load_dataset("csebuetnlp/xlsum", "indonesian")
data = pd.DataFrame(ds['train'])

data.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

xlsum.py:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/74.1M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/8.07M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/7.95M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/38242 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4780 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4780 [00:00<?, ? examples/s]

Unnamed: 0,id,url,title,summary,text
0,media-49647079,https://www.bbc.com/indonesia/media-49647079,"Gajah mengamuk saat upacara keagamaan, 17 oran...",Seekor gajah mendadak mengamuk saat prosesi ta...,"Dilaporkan dua orang terluka cukup serius, sem..."
1,indonesia-43826943,https://www.bbc.com/indonesia/indonesia-43826943,Apa alasan pemerintah pangkas 14 dalam daftar ...,Presiden Jokowi memutuskan untuk menghapus 14 ...,Proyek MRT Sudirman- Lebak Bulus tengah dikerj...
2,160404_dunia_israel_palestina,https://www.bbc.com/indonesia/dunia/2016/04/16...,Rumah warga Palestina pembunuh polisi Israel d...,Pihak keamanan Israel menghancurkan rumah tiga...,Israel secara berkala menghancurkan rumah kelu...
3,160819_majalah_australia_sedekah,https://www.bbc.com/indonesia/majalah/2016/08/...,"Beri uang pada pengemis, PM Australia picu per...",Ada istilah 'tidak ada perbuatan baik yang tid...,Sebagian memandang Turnbull pelit. Dia adalah ...
4,indonesia-43459438,https://www.bbc.com/indonesia/indonesia-43459438,"Dua anak Soeharto di panggung politik, indikas...",Diusulkannya Titiek Soeharto -putri mantan pre...,Tommy Soeharto dan Titiek Soeharto merupakan k...


In [6]:
# 2. Prepare Text and Summary Data
texts = data['text'].tolist()
summaries = data['summary'].tolist()

In [7]:
# 3. Data Preprocessing
vocab_size = 5000
max_length = 300
trunc_type = 'post'
padding_type = 'post'

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(texts + summaries)
text_sequences = tokenizer.texts_to_sequences(texts)
summary_sequences = tokenizer.texts_to_sequences(summaries)

text_padded = pad_sequences(text_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
summary_padded = pad_sequences(summary_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(text_padded, summary_padded, test_size=0.2, random_state=42)


In [8]:
# 4. Define SimpleRNN Model
embedding_dim = 64
rnn_units = 128

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    SimpleRNN(rnn_units, return_sequences=True),
    Dense(vocab_size, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Adjust target shape for training
y_train = np.expand_dims(y_train, axis=-1)
y_test = np.expand_dims(y_test, axis=-1)



In [9]:
# Train the model and save training history
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/5
[1m957/957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2667s[0m 3s/step - accuracy: 0.8995 - loss: 1.5267 - val_accuracy: 0.9298 - val_loss: 0.5478
Epoch 2/5
[1m957/957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2724s[0m 3s/step - accuracy: 0.9296 - loss: 0.5472 - val_accuracy: 0.9299 - val_loss: 0.5396
Epoch 3/5
[1m957/957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2650s[0m 3s/step - accuracy: 0.9296 - loss: 0.5407 - val_accuracy: 0.9296 - val_loss: 0.5422
Epoch 4/5
[1m957/957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2685s[0m 3s/step - accuracy: 0.9300 - loss: 0.5335 - val_accuracy: 0.9298 - val_loss: 0.5363
Epoch 5/5
[1m957/957[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2684s[0m 3s/step - accuracy: 0.9298 - loss: 0.5326 - val_accuracy: 0.9299 - val_loss: 0.5344


In [39]:
# Save training, validation accuracy, and loss history to CSV
train_results = pd.DataFrame({
    "accuracy": history.history['accuracy'],
    "val_accuracy": history.history['val_accuracy'],
    "loss": history.history['loss'],
    "val_loss": history.history['val_loss']
})
train_results.to_csv("/content/drive/MyDrive/NLP ALGHA/SUMMARY/train_results.csv", index=False)

# Print training results summary
print("\nTraining and Validation Results:")
print(train_results)


Training and Validation Results:
   accuracy  val_accuracy      loss  val_loss
0  0.921137      0.929759  0.864994  0.547770
1  0.929638      0.929894  0.544761  0.539643
2  0.929789      0.929649  0.538624  0.542215
3  0.929816      0.929828  0.535343  0.536326
4  0.929962      0.929876  0.531317  0.534400


In [40]:
# 5. Function to Generate Summary
def clean_summary(predicted_sequence):
    words = tokenizer.sequences_to_texts(predicted_sequence)
    summary = ' '.join(words).replace(' ', '').strip()
    return ' '.join(summary.split()[:50])  # Limit summary to first 50 tokens

def generate_summary(input_sequence):
    prediction = model.predict(np.array([input_sequence]))
    predicted_sequence = np.argmax(prediction, axis=-1)
    return clean_summary(predicted_sequence)

In [42]:
# 6. Evaluate Model with ROUGE Scores
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

# Compute ROUGE for a subset of test data
for i in range(100):
    predicted_summary = generate_summary(X_test[i])
    actual_summary = tokenizer.sequences_to_texts([y_test[i].flatten().tolist()])[0]

    # Calculate ROUGE scores
    scores = scorer.score(actual_summary, predicted_summary)

    rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
    rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
    rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)

# Calculate and save average ROUGE scores to CSV
average_rouge_scores = {key: np.mean(value) for key, value in rouge_scores.items()}
average_rouge_df = pd.DataFrame(list(average_rouge_scores.items()), columns=["ROUGE Type", "Average F1 Score"])
average_rouge_df.to_csv("/content/drive/MyDrive/NLP ALGHA/SUMMARY/average_rouge_scores.csv", index=False)

# Print Average ROUGE Scores
print("\nAverage ROUGE Scores:")
print(average_rouge_df)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41

In [43]:
# 7. Single Example Evaluation
index = 0
input_text = X_test[index]
reference_summary = tokenizer.sequences_to_texts([y_test[index].flatten().tolist()])[0]
generated_summary = generate_summary(input_text)

# Calculate ROUGE scores for the example
example_scores = scorer.score(reference_summary, generated_summary)
example_scores_formatted = {
    rouge_type: {
        "Precision": score.precision,
        "Recall": score.recall,
        "F1 Score": score.fmeasure
    } for rouge_type, score in example_scores.items()
}

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step


In [44]:
# Convert example scores to DataFrame and save as CSV
example_scores_df = pd.DataFrame.from_dict(example_scores_formatted, orient="index")
example_scores_df.to_csv("/content/drive/MyDrive/NLP ALGHA/SUMMARY/example_evaluation_scores.csv")

# Save reference and generated summaries as CSV for easy viewing
example_output_df = pd.DataFrame({
    "Reference Summary": [reference_summary],
    "Generated Summary": [generated_summary]
})
example_output_df.to_csv("/content/drive/MyDrive/NLP ALGHA/SUMMARY/example_summary.csv", index=False)

# Print Example Evaluation
print("\nExample Evaluation:")
print("Reference Summary:", reference_summary)
print("\nGenerated Summary:", generated_summary)
print("\nROUGE Scores for the Example:")
print(example_scores_df)

print("\nTraining history, ROUGE scores, and example evaluations have been saved as CSV files.")


Example Evaluation:
Reference Summary: festival film <OOV> di korea selatan akan segera <OOV> daftar 100 film asia yang dianggap terbaik sepanjang masa <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <