## 1. Environment Setup and Library Installation

In [28]:
%pip install -q \
  spacy==3.7.4 \
  spacy-transformers==1.3.4 \
  "transformers==4.39.3" \
  "tokenizers==0.15.2" \
  "datasets==2.21.0" \
  "tqdm==4.66.5" \
  "nltk==3.9.1" \
  "scikit-learn==1.5.2" \
  "numpy==1.26.4"
%pip install -q --no-cache-dir \
  https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl

[31mERROR: Cannot install spacy-transformers==1.3.4 and transformers==4.39.3 because these package versions have conflicting dependencies.[0m[31m
[0m[31mERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m357.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hNote: you may need to restart the kernel to use updated packages.


In [None]:
import re
import warnings
from functools import lru_cache

import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModel
from datasets import load_dataset, concatenate_datasets
from tqdm.notebook import tqdm
import nltk
from nltk.tag import pos_tag
from nltk.stem import PorterStemmer
from nltk.chunk import ne_chunk
from nltk.chunk.regexp import RegexpParser
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from spacy.lang.en import English
# Silence library warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Fetch the NLTK resources needed for tokenising, tagging and NE chunking.
for _nltk_package in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(_nltk_package)

print("Environment setup completed!")
# Report whether a CUDA device is visible and, if so, which one.
_cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ready}")
if _cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Environment setup completed!
CUDA available: True
GPU: Tesla P100-PCIE-16GB


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /usr/share/nltk_data...
[nltk_data]   Package words is already up-to-date!


## 2. Model and Dataset Loading

In [32]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load the SciBERT (uncased) tokenizer and encoder, move the encoder to the
# chosen device and put it in evaluation mode.
MODEL_NAME = 'allenai/scibert_scivocab_uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)
model.eval()

# Load Inspec and merge its three splits into a single evaluation set.
print("Loading Inspec dataset...")
dataset = load_dataset("taln-ls2n/inspec", trust_remote_code=True)
combined_dataset = concatenate_datasets(
    [dataset[split_name] for split_name in ('train', 'validation', 'test')]
)
print(f"Combined dataset size: {len(combined_dataset)} documents")
print("Sample document structure:")
print(combined_dataset[0].keys())

Using device: cuda
Loading Inspec dataset...
Combined dataset size: 2000 documents
Sample document structure:
dict_keys(['id', 'title', 'abstract', 'keyphrases', 'prmu'])


In [None]:
from nltk.stem import PorterStemmer
# One shared Porter stemmer instance, reused for every phrase.
stemmer = PorterStemmer()


def stem_phrase(phrase):
    """Return `phrase` with every whitespace-separated word Porter-stemmed.

    Words are re-joined with single spaces. A falsy `phrase` (empty string,
    None) yields an empty string.
    """
    if phrase:
        return " ".join(stemmer.stem(word) for word in phrase.split())
    return ""

## 3. Core Utility Functions

In [None]:
import re
import nltk
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
# NLTK data needed by the candidate extractor (tokenizer, POS tagger, WordNet,
# stop words). Recent NLTK releases -- including the 3.9.1 pinned in the install
# cell -- make word_tokenize load 'punkt_tab' instead of the pickled 'punkt', so
# both are requested. On an older NLTK that does not list a package,
# nltk.download is expected to report the miss rather than raise.
for _resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'omw-1.4',
    'stopwords',
):
    nltk.download(_resource)

# Shared lemmatizer and English stop-word set used when filtering candidates.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags starting with J, V, N and R map to adjective, verb, noun and adverb
    respectively; any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN

def preprocess_text(text):
    """Lower-case `text`, drop unsupported symbols and normalise whitespace.

    Only word characters, whitespace and the punctuation ``.,;:!?()-`` are kept.
    Whitespace is collapsed *after* the symbols are removed, so deleting a
    symbol that stood between two spaces cannot leave a double (or leading)
    space behind. That also makes the function idempotent, which matters
    because the pipeline applies it more than once to the same document.

    Args:
        text: Raw document text. A falsy value (empty string, None) returns "".

    Returns:
        The cleaned, single-spaced, stripped, lower-case string.
    """
    if not text:
        return ""
    text = text.lower()
    text = re.sub(r'[^\w\s.,;:!?()-]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 4. Candidate Phrase Extraction

In [None]:
def extract_candidate_phrases(text, min_length=1, max_length=5):
    """Extract noun-phrase keyphrase candidates from `text`.

    The text is cleaned with `preprocess_text`, tokenised, POS-tagged and
    chunked with an "optional adjectives/nouns followed by a noun" grammar.
    Chunks are de-duplicated on their lemmatised form; for each lemma the
    longest surface string and the smallest character offset are kept.

    Args:
        text: Document text.
        min_length: Minimum number of tokens in a candidate (inclusive).
        max_length: Maximum number of tokens in a candidate (inclusive).

    Returns:
        A list of dicts with keys 'phrase' (surface form), 'position'
        (character offset in the preprocessed text, always >= 0) and
        'length' (token count).
    """
    text = preprocess_text(text)
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    grammar = """  NP:
        {<NN.*|JJ>*<NN.*>}  # Adjective(s)(optional) + Noun(s)"""
    cp = RegexpParser(grammar)
    tree = cp.parse(pos_tags)
    candidates = {}
    for subtree in tree:
        # Plain (word, tag) leaves have no label(); only NP subtrees are candidates.
        if not (hasattr(subtree, 'label') and subtree.label() == 'NP'):
            continue
        chunk_leaves = subtree.leaves()
        if not (min_length <= len(chunk_leaves) <= max_length):
            continue
        surface_tokens = [w for w, t in chunk_leaves]
        surface_form = " ".join(surface_tokens)
        lemma_tokens = [
            lemmatizer.lemmatize(word, get_wordnet_pos(tag))
            for word, tag in chunk_leaves
        ]
        lemma_form = " ".join(lemma_tokens)
        # Discard chunks that begin or end with a stop word.
        if lemma_tokens[0] in stop_words or lemma_tokens[-1] in stop_words:
            continue
        # First occurrence of the surface string. This is a plain substring
        # search, so it can match inside a longer word.
        start_idx = text.find(surface_form)
        if start_idx < 0:
            # The space-joined tokens do not appear verbatim (the tokenizer split
            # or rewrote something). Rank the phrase as if it sat at the end of
            # the document instead of storing -1, which would make a downstream
            # 1 / (position + 1) divide by zero.
            start_idx = len(text)
        existing = candidates.get(lemma_form)
        if existing is None:
            candidates[lemma_form] = {
                'phrase': surface_form,
                'position': start_idx,
                'length': len(surface_tokens)
            }
        else:
            if len(surface_form) > len(existing['phrase']):
                existing['phrase'] = surface_form
            if start_idx < existing['position']:
                existing['position'] = start_idx

    return list(candidates.values())

# Smoke test: two sentences that share a noun phrase in plural and singular form.
sample_text = "Machine learning algorithms are powerful computational methods for data analysis. The machine learning algorithm performed well."
test_candidates = extract_candidate_phrases(sample_text)

print(f"Teks Asli: {sample_text}\n")
print("Hasil Ekstraksi Kandidat")
# List every candidate with a 1-based index, its character offset and token count.
for i in range(len(test_candidates)):
    candidate = test_candidates[i]
    print(f"{i+1}. '{candidate['phrase']}' (Posisi: {candidate['position']}, Panjang: {candidate['length']})")

Teks Asli: Machine learning algorithms are powerful computational methods for data analysis. The machine learning algorithm performed well.

Hasil Ekstraksi Kandidat (Deduplikasi Lemma & Bentuk Terpanjang):
1. 'machine learning algorithms' (Posisi: 0, Panjang: 3)
2. 'powerful computational methods' (Posisi: 32, Panjang: 3)
3. 'data analysis' (Posisi: 67, Panjang: 2)
4. 'machine' (Posisi: 0, Panjang: 1)
5. 'algorithm' (Posisi: 17, Panjang: 1)


In [None]:
from tqdm.notebook import tqdm

def check_candidate_metrics(dataset, extractor_function, stemmer_function, max_docs=None):
    """Measure how well the candidate extractor covers the gold keyphrases.

    For every document the gold keyphrases and the extracted candidate phrases
    are stemmed and compared as sets. Counts are accumulated over all documents
    (micro-average) and printed.

    Args:
        dataset: Iterable of dict-like documents with 'title', 'abstract' and
            'keyphrases'. Must offer `.select(...)` and `len()` when `max_docs`
            is given.
        extractor_function: Callable taking the document text and returning a
            list of dicts that each contain a 'phrase' key.
        stemmer_function: Callable mapping a phrase string to its stemmed form.
        max_docs: Optional cap on the number of documents; values larger than
            the dataset are clamped to its size.

    Returns:
        (recall_ceiling, precision) as floats in [0, 1]; (0.0, 0.0) when no
        gold keyphrase was seen.
    """
    total_ground_truth_count = 0
    total_found_in_candidates = 0
    total_candidates_generated = 0

    docs_to_process = dataset
    if max_docs is not None:
        # Clamp so that asking for more documents than exist does not raise in select().
        docs_to_process = dataset.select(range(min(max_docs, len(dataset))))

    print(f"Starting candidate metrics check on {len(docs_to_process)} documents...")

    for document in tqdm(docs_to_process, desc="Checking Candidates"):
        # Gold keyphrases; `or []` / `or ''` also cover fields stored as None,
        # matching how process_document reads the same fields.
        true_keyphrases = document.get('keyphrases') or []
        if not true_keyphrases:
            continue
        stemmed_true = set(stemmer_function(p) for p in true_keyphrases if p and p.strip())
        total_ground_truth_count += len(stemmed_true)
        title = document.get('title') or ''
        abstract = document.get('abstract') or ''
        document_text = '. '.join([t for t in [title, abstract] if t.strip()])

        if not document_text:
            continue
        try:
            candidates_list = extractor_function(document_text)
            candidate_phrases = [c['phrase'] for c in candidates_list]
            stemmed_candidates = set(stemmer_function(p) for p in candidate_phrases if p.strip())
            total_candidates_generated += len(stemmed_candidates)
            found_phrases = stemmed_true.intersection(stemmed_candidates)
            total_found_in_candidates += len(found_phrases)

        except Exception as e:
            # Best effort: report the failing document and keep going.
            tqdm.write(f"⚠️ Error processing document: {e}")
    if total_ground_truth_count == 0:
        return 0.0, 0.0

    recall_ceiling = total_found_in_candidates / total_ground_truth_count

    precision = 0.0
    if total_candidates_generated > 0:
        precision = total_found_in_candidates / total_candidates_generated

    print("\n--- Candidate Evaluation Results (Micro-Average) ---")
    print(f"Total Unique Ground Truth       : {total_ground_truth_count}")
    print(f"Total Candidates Generated      : {total_candidates_generated}")
    print(f"Total Match (Correct)           : {total_found_in_candidates}")
    print("-" * 40)
    print(f"Recall Ceiling : {recall_ceiling * 100:.2f}%")
    print(f"Precision      : {precision * 100:.2f}%")

    return recall_ceiling, precision
# Candidate-level recall ceiling and precision over the whole combined dataset.
recall_val, precision_val = check_candidate_metrics(
    dataset=combined_dataset,
    extractor_function=extract_candidate_phrases,
    stemmer_function=stem_phrase,
    max_docs=None,
)

Running Metrics Test (Recall & Precision)...
Starting candidate metrics check on 2000 documents...


Checking Candidates:   0%|          | 0/2000 [00:00<?, ?it/s]


--- Candidate Evaluation Results (Micro-Average) ---
Total Unique Ground Truth       : 19243
Total Candidates Generated      : 53315
Total Match (Correct)           : 11313
----------------------------------------
Recall Ceiling : 58.79%
Precision      : 21.22%


## 6. Additional Scores Implementation (Theme & Position)

In [None]:
def get_pooled_embeddings_batched(texts, tokenizer, model, device, BATCH_SIZE=64, pooling_strategy='mean'):
    """Encode `texts` in batches and return one pooled vector per text.

    Embeddings are taken from the second-to-last entry of the model's
    `hidden_states` (not the final layer) and pooled either by taking the
    first token ('cls') or by an attention-mask-weighted average ('mean').

    Args:
        texts: List of strings to encode.
        tokenizer: Callable tokenizer returning a batch with an
            'attention_mask' entry and a `.to(device)` method.
        model: Encoder called as `model(**inputs, output_hidden_states=True)`.
        device: Device the tokenised batch is moved to.
        BATCH_SIZE: Number of texts per forward pass.
        pooling_strategy: 'mean' or 'cls'.

    Returns:
        A 2-D numpy array with one row per input text, or an empty 1-D array
        when `texts` is empty.

    Raises:
        ValueError: If `pooling_strategy` is neither 'mean' nor 'cls'.
    """
    if pooling_strategy not in ('mean', 'cls'):
        # Previously an unknown strategy surfaced later as an unbound-variable error.
        raise ValueError(
            f"Unknown pooling_strategy {pooling_strategy!r}; expected 'mean' or 'cls'"
        )

    all_embeddings = []

    with torch.inference_mode():
        for i in range(0, len(texts), BATCH_SIZE):
            batch_texts = texts[i : i + BATCH_SIZE]
            inputs = tokenizer(
                batch_texts,
                # Pad only to the longest sequence in the batch. Padding every
                # batch to 512 tokens made short candidate phrases as expensive
                # as full documents; padded positions are masked out either way.
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            ).to(device)
            outputs = model(**inputs, output_hidden_states=True)
            hidden = outputs.hidden_states[-2]
            if pooling_strategy == 'cls':
                pooled_embeddings = hidden[:, 0, :]
            else:
                # Cast the mask to the activation dtype so the sum and the clamp
                # below are done in floating point, not on an integer tensor.
                mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
                sum_embeddings = (hidden * mask).sum(dim=1)
                token_counts = mask.sum(dim=1).clamp(min=1e-9)
                pooled_embeddings = sum_embeddings / token_counts
            all_embeddings.append(pooled_embeddings.cpu().numpy())

    if not all_embeddings:
        return np.array([])

    return np.vstack(all_embeddings)

In [None]:
def calculate_scores_batched(document_text, title, candidates, tokenizer, model, device, BATCH_SIZE=64, pooling_strategy='mean'):
    """Compute the global, theme and position score of every candidate phrase.

    * Global score: 1 - cosine similarity between the embedding of the original
      document and the embedding of the document with the phrase replaced by
      one "[MASK]" per word.
    * Theme score: cosine similarity between the CLS-pooled embeddings of the
      phrase and of the title, floored at 0.
    * Position score: 1 / (character offset + 1).

    Args:
        document_text: Text in which the candidates are masked.
        title: Document title used as the theme anchor.
        candidates: List of dicts with at least 'phrase' and 'position'.
        tokenizer, model, device: Forwarded to get_pooled_embeddings_batched.
        BATCH_SIZE: Batch size for encoding.
        pooling_strategy: Pooling used for the global score only; the theme
            score always uses 'cls'.

    Returns:
        (global_scores, theme_scores, position_scores): three dicts keyed by
        phrase. If an embedding step fails, the affected dict holds 0.0 for
        every phrase and a warning is written.
    """
    # --- 1. Text preparation ---
    original_text = document_text
    candidate_phrases = [c['phrase'] for c in candidates]
    masked_texts = []

    for phr in candidate_phrases:
        # One [MASK] per whitespace-separated word of the phrase.
        mask_seq = " ".join(["[MASK]"] * len(phr.split()))
        try:
            pattern = re.compile(r'\b' + re.escape(phr) + r'\b', re.IGNORECASE)
            masked_text = pattern.sub(mask_seq, original_text)
        except re.error:
            # Fall back to a literal replacement if the pattern cannot be built.
            masked_text = original_text.replace(phr, mask_seq)
        masked_texts.append(masked_text)

    # --- 2. Calculate GLOBAL SCORE ---
    global_scores = {phrase: 0.0 for phrase in candidate_phrases}
    try:
        all_pooled_embeddings_global = get_pooled_embeddings_batched(
            [original_text] + masked_texts, tokenizer, model, device, BATCH_SIZE, pooling_strategy
        )
        # Row 0 is the original document; the remaining rows are the masked variants.
        if all_pooled_embeddings_global.shape[0] > 1:
            similarities_global = cosine_similarity(
                all_pooled_embeddings_global[1:],
                all_pooled_embeddings_global[0:1]
            ).flatten()
            global_scores = {
                phrase: float(1.0 - sim)
                for phrase, sim in zip(candidate_phrases, similarities_global)
            }
    except Exception as e:
        tqdm.write(f"⚠️ Error in Global Score (CosineSim) calculation: {e}")
        global_scores = {phrase: 0.0 for phrase in candidate_phrases}  # Fallback

    # --- 3. Calculate THEME SCORE ---
    theme_scores = {phrase: 0.0 for phrase in candidate_phrases}
    try:
        all_pooled_embeddings = get_pooled_embeddings_batched(
            [title] + candidate_phrases, tokenizer, model, device,
            BATCH_SIZE=BATCH_SIZE, pooling_strategy='cls'
        )
        # Row 0 is the title; the remaining rows are the candidate phrases.
        if all_pooled_embeddings.shape[0] > 1:
            similarities_theme = cosine_similarity(
                all_pooled_embeddings[1:], all_pooled_embeddings[0:1]
            ).flatten()
            theme_scores = {
                phrase: max(0.0, float(sim))
                for phrase, sim in zip(candidate_phrases, similarities_theme)
            }
    except Exception as e:
        tqdm.write(f"⚠️ Error in Theme Score calculation: {e}")
        theme_scores = {phrase: 0.0 for phrase in candidate_phrases}

    # --- 4. Calculate POSITION SCORE ---
    position_scores = {}
    for c in candidates:
        position = c['position']
        if position < 0:
            # A negative offset means "not located". Treat it as end-of-document
            # so that -1 cannot turn the denominator into zero.
            position = len(original_text)
        position_scores[c['phrase']] = 1 / (position + 1)

    return global_scores, theme_scores, position_scores

✅ Fungsi 'calculate_scores_batched' (Cell 21) telah diperbarui.
   Global Score sekarang menggunakan Mean Pooling + Cosine Similarity.


## 7. Reciprocal Rank Fusion (RRF) Implementation

In [None]:
def reciprocal_rank_fusion(global_scores, theme_scores, position_scores, k=60):
    """Fuse three score dictionaries into one ranking with Reciprocal Rank Fusion.

    Each dictionary is ranked by descending score (rank 1 = best). A phrase
    receives 1 / (k + rank) from every dictionary it appears in, and phrases
    are returned by descending fused score.

    Ties, both inside one ranking and in the fused score, are resolved by
    first-seen order (global, then theme, then position keys). The phrases are
    deliberately not collected in a `set`: set iteration order of strings
    changes between interpreter sessions, which made tied phrases come out in
    a different order on every kernel restart.

    Args:
        global_scores, theme_scores, position_scores: Dicts mapping phrase to
            score (higher is better).
        k: RRF smoothing constant.

    Returns:
        List of phrases, best first.
    """
    score_maps = (global_scores, theme_scores, position_scores)

    # Register every phrase once, in deterministic first-seen order.
    rrf_scores = {}
    for scores in score_maps:
        for phrase in scores:
            rrf_scores.setdefault(phrase, 0)

    # Add each ranking's reciprocal-rank contribution.
    for scores in score_maps:
        ranking = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for rank, (phrase, _) in enumerate(ranking, start=1):
            rrf_scores[phrase] += 1 / (k + rank)

    # sorted() is stable, so equal fused scores keep their first-seen order.
    final_ranking = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
    return [phrase for phrase, score in final_ranking]
print("Reciprocal Rank Fusion (RRF) function implemented!")

Reciprocal Rank Fusion (RRF) function implemented!


## 8. Evaluation Function

In [None]:
def process_document(document, tokenizer, model, device, pooling_strategy='mean', rrf_k=40, top_n=15):
    """Run the full keyphrase pipeline on one document and score the result.

    Steps: build "title. abstract", preprocess it, extract candidates, compute
    the global/theme/position scores, fuse them with RRF and evaluate the
    ranking against the gold keyphrases.

    Args:
        document: Dict-like with 'title', 'abstract' and 'keyphrases'.
        tokenizer, model, device: Forwarded to calculate_scores_batched.
        pooling_strategy: Pooling used for the global score (was the fixed
            local constant 'mean').
        rrf_k: Smoothing constant passed to reciprocal_rank_fusion (was the
            literal 40).
        top_n: Number of top-ranked phrases returned alongside the metrics
            (was the literal 15).

    Returns:
        (evaluation_results, top_predictions), or (None, None) when the
        document has no text, no gold keyphrases, no candidates, or when
        processing raised an exception (which is reported, not re-raised).
    """
    title = document.get('title', '') or ""
    abstract = document.get('abstract', '') or ""
    keyphrases = document.get('keyphrases', []) or []
    raw_text = '. '.join([t for t in [title, abstract] if t.strip()])
    if not raw_text or not keyphrases:
        return None, None
    document_text = preprocess_text(raw_text)
    try:
        candidates = extract_candidate_phrases(document_text)
        if not candidates:
            return None, None

        global_scores, theme_scores, position_scores = calculate_scores_batched(
            document_text, title, candidates, tokenizer, model, device,
            pooling_strategy=pooling_strategy
        )
        final_ranking = reciprocal_rank_fusion(
            global_scores, theme_scores, position_scores, k=rrf_k
        )
        evaluation_results = evaluate(final_ranking, keyphrases)
        return evaluation_results, final_ranking[:top_n]
    except Exception as e:
        # Best effort: report the failing document and let the caller skip it.
        tqdm.write(f"⚠️ Error on document: {e}")
        return None, None

In [None]:


def evaluate(predicted_phrases, true_keyphrases, k_values=[5, 10, 15]):
    """Compute precision, recall and F1 at several cut-offs on stemmed phrases.

    Gold phrases and the top-k predictions are stemmed and compared as sets.
    Precision is divided by the number of distinct stemmed predictions in the
    top k, recall by the number of distinct stemmed gold phrases.

    Returns:
        Dict with keys 'P@k', 'R@k' and 'F1@k' for every k in `k_values`.
    """
    gold_stems = {stem_phrase(phrase) for phrase in true_keyphrases if phrase}
    results = {}
    for k in k_values:
        predicted_stems = {stem_phrase(phrase) for phrase in predicted_phrases[:k] if phrase}
        hits = len(predicted_stems & gold_stems)

        precision = hits / len(predicted_stems) if predicted_stems else 0
        recall = hits / len(gold_stems) if gold_stems else 0
        pr_sum = precision + recall
        f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0

        results.update({f'P@{k}': precision, f'R@{k}': recall, f'F1@{k}': f1})
    return results

# Sanity check of evaluate() on a hand-made example (2 of 3 gold phrases predicted).
test_predicted = ['machine learning', 'data analysis', 'neural networks', 'deep learning', 'artificial intelligence']
test_true = ['machine learning', 'data mining', 'neural networks']
test_results = evaluate(test_predicted, test_true, [3, 5])
print("Sample evaluation results:")
for metric in test_results:
    value = test_results[metric]
    print(f"  {metric}: {value:.3f}")

Sample evaluation results:
  P@3: 0.667
  R@3: 0.667
  F1@3: 0.667
  P@5: 0.400
  R@5: 0.667
  F1@5: 0.500


## 9. Main Pipeline Implementation

## 10. Full Evaluation Loop

In [None]:
from tqdm.notebook import tqdm

def run_full_evaluation(dataset, tokenizer, model, device, max_documents=None):
    """Run process_document over the dataset and macro-average the metrics.

    Args:
        dataset: Iterable of documents; must offer `.select(...)` and `len()`
            when `max_documents` is given.
        tokenizer, model, device: Forwarded to process_document.
        max_documents: Optional cap on the number of documents; values larger
            than the dataset are clamped to its size.

    Returns:
        (average_metrics, processed_count): per-metric averages over the
        documents that produced a result, and how many documents that was.
        All averages are 0 when no document could be processed.
    """
    # 1. Initialise the metric accumulators.
    metric_sums = {
        'P@5': 0, 'R@5': 0, 'F1@5': 0,
        'P@10': 0, 'R@10': 0, 'F1@10': 0,
        'P@15': 0, 'R@15': 0, 'F1@15': 0
    }

    processed_count = 0
    if max_documents is not None:
        # Clamp so that asking for more documents than exist does not raise in select().
        docs_to_process = dataset.select(range(min(max_documents, len(dataset))))
    else:
        docs_to_process = dataset

    # 2. Evaluation loop.
    for document in tqdm(docs_to_process, desc="Evaluating Documents"):
        results, _ = process_document(document, tokenizer, model, device)
        if results is None:
            # Skipped or failed document: it does not count towards the average.
            continue
        for metric, value in results.items():
            # .get() keeps the loop working if a metric key is not pre-registered above.
            metric_sums[metric] = metric_sums.get(metric, 0) + value
        processed_count += 1

    # 3. Compute the averages.
    if processed_count > 0:
        average_metrics = {metric: value / processed_count for metric, value in metric_sums.items()}
    else:
        average_metrics = {metric: 0 for metric in metric_sums.keys()}

    return average_metrics, processed_count
# Set TEST_MODE to True to evaluate only the first 100 documents.
TEST_MODE = False
max_docs = None
if TEST_MODE:
    max_docs = 100

final_results, processed_docs = run_full_evaluation(
    combined_dataset,
    tokenizer,
    model,
    device,
    max_documents=max_docs,
)

# One summary line per cut-off.
for k in (5, 10, 15):
    p, r, f1 = (final_results.get(f'{prefix}@{k}', 0) for prefix in ('P', 'R', 'F1'))
    print(f"Top-{k:<2} | Precision: {p:.4f} | Recall: {r:.4f} | F1-Score: {f1:.4f}")

Mulai menjalankan evaluasi...


Evaluating Documents:   0%|          | 0/2000 [00:00<?, ?it/s]

## 12. Results Analysis and Visualization

In [None]:
import matplotlib.pyplot as plt
def plot_results(results):
    """Draw a grouped bar chart of precision, recall and F1 at K = 5, 10, 15.

    `results` must contain the keys 'P@k', 'R@k' and 'F1@k' for each of those
    cut-offs. Each bar is annotated with its value to three decimals.
    """
    k_values = [5, 10, 15]
    width = 0.25
    x = range(len(k_values))

    def metric_series(prefix):
        # Values of one metric across all cut-offs, in k order.
        return [results[f'{prefix}@{k}'] for k in k_values]

    fig, ax = plt.subplots(figsize=(10, 6))

    # Precision left of the tick, recall on it, F1 right of it.
    bar_groups = [
        ax.bar([i - width for i in x], metric_series('P'), width, label='Precision', alpha=0.8),
        ax.bar(x, metric_series('R'), width, label='Recall', alpha=0.8),
        ax.bar([i + width for i in x], metric_series('F1'), width, label='F1-Score', alpha=0.8),
    ]

    ax.set_xlabel('K Value')
    ax.set_ylabel('Score')
    ax.set_title('Enhanced MDERank Performance on Inspec Dataset')
    ax.set_xticks(x)
    ax.set_xticklabels([f'K={k}' for k in k_values])
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Write each bar's height just above it.
    for bars in bar_groups:
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                    f'{height:.3f}', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()
# Chart first, then the same numbers as a table rounded to four decimals.
plot_results(final_results)

summary_ks = [5, 10, 15]
results_df = pd.DataFrame({
    'K': summary_ks,
    'Precision': [final_results[f'P@{k}'] for k in summary_ks],
    'Recall': [final_results[f'R@{k}'] for k in summary_ks],
    'F1-Score': [final_results[f'F1@{k}'] for k in summary_ks],
})
print("\nResults Summary Table:")
print(results_df.round(4))

## 13. Analisis Kualitatif Hasil Prediksi

Di bagian ini, kita akan memuat file `.json` yang berisi hasil prediksi dan menganalisisnya secara kualitatif.