In [1]:
!pip install openai transformers datasets rouge_score nltk

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=a1884dafb169f4bd55799121075f2e6eac5a5b5a190e7ec35b25eb33fb39d833
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [2]:
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from rouge_score import rouge_scorer
import re
import json


In [3]:
# Single place to swap the checkpoint (e.g. another Qwen or a LLaMA model).
# The generator and the tokenizer are both built from this id so they cannot
# drift apart when the model is changed.
MODEL_NAME = "Qwen/Qwen1.5-4B"

# Load dataset (first 100 examples of the test split)
dataset = load_dataset("sobamchan/aclsum", split="test[:100]")

# Text-generation pipeline used to prompt the model in the inference loop.
summarizer = pipeline("text-generation", model=MODEL_NAME)

# Tokenizer matching the generator; loaded for convenience, not used by the
# cells visible in this notebook.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.jsonl: 0.00B [00:00, ?B/s]

val.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/50 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.91G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [4]:
def vanilla_prompt(sentences):
    """Build the plain (no reasoning scaffold) extractive-summarization prompt.

    Args:
        sentences: iterable of sentence strings; they are labelled
            "Sentence 1", "Sentence 2", ... in iteration order.

    Returns:
        The prompt string. It ends with the JSON format the model is asked to
        return: an object with a "selected_sentences" list.
    """
    document_block = "\n".join(
        "Sentence {}: {}".format(position, text)
        for position, text in enumerate(sentences, start=1)
    )
    instruction = (
        "You are an expert in extractive summarization. "
        "Your task is to select the most important sentences from the document."
    )
    answer_format = (
        "Return only a JSON object in the following format:\n"
        '{"selected_sentences": [list_of_sentence_numbers]}'
    )
    # Sections are separated by one blank line each.
    return "\n\n".join([instruction, "Input:\n" + document_block, answer_format])



def least_to_most_prompt(sentences):
    """Build the two-step ("least-to-most") extractive-summarization prompt.

    The prompt first asks the model to identify the paper's key contributions
    and then to pick the sentences that correspond to them.

    Args:
        sentences: iterable of sentence strings; they are labelled
            "Sentence 1", "Sentence 2", ... in iteration order.

    Returns:
        The prompt string, ending with the requested JSON answer format.
    """
    labelled = []
    for number, sentence in enumerate(sentences, start=1):
        labelled.append(f"Sentence {number}: {sentence}")

    prompt_lines = [
        "You are a logical assistant.",
        "",
        "Step 1: Identify the key contributions of the paper (e.g., Challenge, Approach, Outcome).",
        "Step 2: Select the most relevant sentences that correspond to those contributions.",
        "",
        "Input:",
        # Kept as a single element so an empty document still yields an empty line.
        "\n".join(labelled),
        "",
        "Now return only a valid JSON object in this format:",
        '{"selected_sentences": [list_of_sentence_numbers]}',
    ]
    return "\n".join(prompt_lines)


def self_ask_prompt(sentences):
    """Build the "self-ask" extractive-summarization prompt.

    The document is shown first; the model is then told to ask itself which
    contributions matter and which sentences express them before answering.

    Args:
        sentences: iterable of sentence strings; they are labelled
            "Sentence 1", "Sentence 2", ... in iteration order.

    Returns:
        The prompt string, ending with the requested JSON answer format.
    """
    labelled = []
    for offset, sentence in enumerate(sentences):
        labelled.append("Sentence " + str(offset + 1) + f": {sentence}")

    prompt = "You are a thoughtful assistant helping with extractive summarization.\n\n"
    prompt += "Input:\n" + "\n".join(labelled) + "\n\n"
    prompt += 'First ask yourself: "What are the most important contributions of this paper?"\n'
    prompt += 'Then ask: "Which specific sentences express or support those contributions?"\n\n'
    prompt += "Return only a valid JSON object like this:\n"
    prompt += '{"selected_sentences": [list_of_sentence_numbers]}'
    return prompt



In [5]:
# --- Utility ---
def get_sentences(doc):
    """Split a document's text into sentence-like fragments.

    The text under doc["document"] is stripped and then cut at every run of
    spaces that follows '.', '!' or '?'. Fragments of 10 characters or fewer
    are discarded.

    Args:
        doc: mapping with a "document" key holding the text to split.

    Returns:
        List of the remaining fragments, in document order.
    """
    text = doc["document"].strip()
    kept = []
    for fragment in re.split(r'(?<=[.!?]) +', text):
        # Very short fragments are dropped.
        if len(fragment) > 10:
            kept.append(fragment)
    return kept

# --- Evaluation containers ---
# One 0/1 vector per processed document is appended to each list by the
# inference loop (gold labels and model predictions respectively).
# Re-run this cell before re-running the inference loop, otherwise results
# from the previous run stay in the lists.
y_true_per_doc = list()
y_pred_per_doc = list()


In [None]:
# --- Inference Loop ---
def _first_json_object(text):
    """Return the first JSON object (dict) that can be decoded from `text`, or None.

    Every '{' is tried as a possible start, so text before or after the object
    and objects spanning several lines are both handled.
    """
    decoder = json.JSONDecoder()
    for start, char in enumerate(text):
        if char != "{":
            continue
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            continue
        if isinstance(candidate, dict):
            return candidate
    return None


def _coerce_sentence_numbers(value):
    """Normalise the model's "selected_sentences" value to a list of ints.

    Accepts a list of ints / digit strings, or a comma-separated string such
    as "1, 3, 5". Entries that are not plain non-negative integers are dropped.

    Raises:
        ValueError: if `value` is neither a list nor a string.
    """
    if isinstance(value, str):
        parts = (part.strip() for part in value.split(","))
        return [int(part) for part in parts if part.isdigit()]
    if isinstance(value, list):
        return [
            int(item)
            for item in value
            # bool is a subclass of int; JSON true/false is not a sentence number.
            if (isinstance(item, int) and not isinstance(item, bool))
            or (isinstance(item, str) and item.strip().isdigit())
        ]
    raise ValueError("selected_sentences must be a list or a comma-separated string")


for doc in dataset:
    sentences = get_sentences(doc)
    print(f"\n📄 {doc['id']}: {len(sentences)} sentences")

    # limit number of sentences
    # sentences = sentences[:20]

    # NOTE(review): used below as a collection of 0-based positions into
    # `sentences` — confirm that doc["challenge"] really holds sentence indices
    # and that they line up with get_sentences(), which drops short fragments.
    gold_indices = doc["challenge"]  # Change to 'approach' or 'outcome' as needed

    print("Building prompt...")
    prompt = self_ask_prompt(sentences)

    print("Prompting model...")

    try:
        # return_full_text=False: keep only the continuation. By default the
        # pipeline echoes the prompt, and the prompt itself contains the
        # placeholder {"selected_sentences": [list_of_sentence_numbers]},
        # which would be picked up instead of the model's answer.
        response = summarizer(
            prompt, max_new_tokens=256, return_full_text=False
        )[0]["generated_text"]
        print(f"Doc ID: {doc['id']}")
        print("Raw model output:", response)

        # Extract JSON object from model output
        prediction_json = _first_json_object(response)
        if prediction_json is None:
            print("No valid JSON object found in response.")
            pred_numbers = []
        else:
            try:
                pred_numbers = _coerce_sentence_numbers(
                    prediction_json.get("selected_sentences", [])
                )
            except (ValueError, TypeError) as e:
                print(f"JSON parsing failed for doc {doc['id']}:\n{prediction_json}\nError: {e}")
                pred_numbers = []

        # The prompt labels sentences starting at 1, so shift to 0-based
        # positions and ignore numbers that point outside the document.
        pred_indices = {n - 1 for n in pred_numbers if 1 <= n <= len(sentences)}

        # Binarize vectors for evaluation
        gold_vector = [1 if i in gold_indices else 0 for i in range(len(sentences))]
        pred_vector = [1 if i in pred_indices else 0 for i in range(len(sentences))]

        y_true_per_doc.append(gold_vector)
        y_pred_per_doc.append(pred_vector)

    except Exception as e:
        # NOTE: a failed document is skipped entirely, so the two result lists
        # can end up shorter than `dataset` and are then no longer aligned
        # with dataset positions.
        print(f"Failed on doc {doc['id']}: {e}")



📄 E09-1056: 47 sentences
🛠️ Building prompt...
🚀 Prompting model...


In [None]:

# --- Final Evaluation ---
from itertools import chain

# Micro-averaged sentence-level scores: all per-document 0/1 vectors are
# concatenated into one long label / prediction sequence.
y_true_all = list(chain.from_iterable(y_true_per_doc))
y_pred_all = list(chain.from_iterable(y_pred_per_doc))

print(f"Precision: {precision_score(y_true_all, y_pred_all, zero_division=0):.4f}")
print(f"Recall:    {recall_score(y_true_all, y_pred_all, zero_division=0):.4f}")
print(f"F1-score:  {f1_score(y_true_all, y_pred_all, zero_division=0):.4f}")


# ROUGE against abstractive summaries
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Prediction vectors are looked up by dataset position. If the inference loop
# skipped any document the positions no longer match, so say so up front.
if len(y_pred_per_doc) != len(dataset):
    print(
        f"Warning: {len(y_pred_per_doc)} prediction vectors for {len(dataset)} documents; "
        "ROUGE pairing by position may be misaligned."
    )

rouge_scores = []
for i, doc in enumerate(dataset):
    sentences = get_sentences(doc)
    try:
        pred_vector = y_pred_per_doc[i]
    except IndexError:
        continue  # no prediction stored at this position

    # A length mismatch means this vector belongs to a different document;
    # scoring it would select the wrong sentences (or index out of range).
    if len(pred_vector) != len(sentences):
        print(f"Skipping ROUGE for {doc['id']}: prediction vector length does not match sentence count.")
        continue

    pred_sentences = [sentence for sentence, flag in zip(sentences, pred_vector) if flag == 1]
    # NOTE(review): assumes the dataset exposes the reference summary under
    # "abstractive_challenge" — confirm against the dataset schema.
    gold_text = doc["abstractive_challenge"]
    pred_text = " ".join(pred_sentences)
    rouge = scorer.score(gold_text, pred_text)
    rouge_scores.append(rouge)

if rouge_scores:
    avg_rougeL = sum(r['rougeL'].fmeasure for r in rouge_scores) / len(rouge_scores)
    print(f"Avg ROUGE-L F1: {avg_rougeL:.4f}")
else:
    # Nothing was scored; avoid dividing by zero.
    avg_rougeL = float("nan")
    print("Avg ROUGE-L F1: n/a (no documents were scored)")