In [None]:
%pip install transformers
%pip install thefuzz
%pip install fuzzywuzzy
%pip install transformers datasets peft accelerate bitsandbytes
%pip install --upgrade sentence-transformers

# Setup

- https://en.wikipedia.org/wiki/Judo

- https://en.wikipedia.org/wiki/List_of_judo_techniques

- https://en.wikipedia.org/wiki/List_of_judoka

- https://martialarts.fandom.com/wiki/Judo

- https://chas-ma.com/JudoManual/Chapter_2%28HistoryofJudo%29.pdf

- https://www.ijf.org/history

- https://blackbelttrek.com/judo-vs-jiu-jitsu-the-ultimate-comparison/

In [None]:
import os
from transformers import pipeline

# Load the pre-trained extractive question-answering model
model_name = "deepset/roberta-base-squad2"
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)

data_dir = "data-sources"
context_file = "complete_context.txt"

# Read every .txt file found in data_dir.
# os.listdir() returns names in arbitrary order, so sort them: the combined
# corpus (and anything derived from chunk positions in it) is then the same
# on every run and on every machine.
context_parts = []
for filename in sorted(os.listdir(data_dir)):
    if filename.endswith(".txt"):
        file_path = os.path.join(data_dir, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            context_parts.append(f.read())

# Join all texts into a single corpus string
context = "\n".join(context_parts)

# Persist the combined corpus so later cells can reload it from disk
with open(context_file, "w", encoding="utf-8") as f:
    f.write(context)

# Funktion zum Evaluieren

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained SBERT model (only once)
sbert = SentenceTransformer('all-MiniLM-L6-v2')

def evaluate_qa(data: list,
                context: str,
                nlp_callable,
                threshold: float) -> float:
    """Score a QA callable against reference answers via SBERT similarity.

    For every item the callable is asked the item's question over ``context``;
    its answer counts as correct when the cosine similarity between the SBERT
    embeddings of the predicted and the expected answer, scaled to 0-100,
    reaches ``threshold``. A per-question report and a final summary line are
    printed.

    Args:
        data: QA items; each must provide the keys ``"question"`` and
            ``"answer"``.
        context: Text handed to the callable as context for every question.
        nlp_callable: Called with ``{"question": ..., "context": ...}``; must
            return a mapping with an ``"answer"`` entry and, optionally,
            ``"start"``/``"end"`` offsets into ``context``.
        threshold: Minimum similarity (0-100 scale) for a correct answer.

    Returns:
        Accuracy in percent as a float; ``0.0`` when ``data`` is empty.
    """
    total = len(data)
    correct = 0

    for item in data:
        question = item["question"]
        expected = item["answer"]

        result = nlp_callable({"question": question, "context": context})
        pred = result["answer"]

        # Re-slice the context with the reported offsets so the printed span
        # can be compared with the answer string the callable returned.
        start = result.get("start")
        end = result.get("end")
        span_text = context[start:end] if start is not None and end is not None else "N/A"

        # Semantic similarity between prediction and reference, scaled by 100
        emb_pred = sbert.encode(pred, convert_to_tensor=True)
        emb_exp = sbert.encode(expected, convert_to_tensor=True)
        sim_score = util.cos_sim(emb_pred, emb_exp).item() * 100

        # Decide once, then reuse for both the report and the counter
        is_correct = sim_score >= threshold
        status = "✅" if is_correct else "❌"
        print(f"{status} Question: {question}")
        print(f"   Expected: {expected}         Received: {pred}")
        print(f"   Span: '{span_text}' (start: {start}, end: {end})\n")

        if is_correct:
            correct += 1

    # 0.0 (not int 0) keeps the return type consistent with the annotation
    accuracy = (correct / total) * 100 if total else 0.0
    print(f"\n✅ Accuracy: {accuracy:.2f}% ({correct}/{total} correct)")
    return accuracy


In [None]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Make sure the corpus is in memory; fall back to the saved context file
if 'context' not in globals():
    with open(context_file, "r", encoding="utf-8") as f:
        context = f.read()
print('Context length (chars): ', len(context))

tokens = tokenizer.tokenize(context)

# Average token length in characters, with any 'Ġ' characters stripped from
# both ends of each token first. An empty corpus yields no tokens, so guard
# the division instead of raising ZeroDivisionError.
avg_length = (
    sum(len(token.strip('Ġ')) for token in tokens) / len(tokens)
    if tokens else 0.0
)
print(f"Avg Token length: {avg_length:.2f}")
print(f"Total Tokens: {len(tokens)}")

# Basic Pipeline ausführen
### Fragen aus Datensatz auswählen

In [None]:
import json
import random

# Similarity cut-off on the 0-100 scale, passed to evaluate_qa as `threshold`
THRESHOLD = 60

# Difficulty-rated question set
with open("question-sets/q_v3_rated.json", "r", encoding="utf-8") as rated_file:
    questions_data = json.load(rated_file)

# Second question set (v2)
with open("question-sets/q_v2.json", "r", encoding="utf-8") as v2_file:
    all_questions_data = json.load(v2_file)

# One question list per difficulty level
easy_q, medium_q, hard_q = (
    questions_data[level] for level in ('easy', 'medium', 'hard')
)

# Fixed seed, then 15 random questions per level, drawn in the order
# easy -> medium -> hard so the random stream is consumed identically each run
random.seed(0)
sampled_easy, sampled_medium, sampled_hard = (
    random.sample(pool, 15) for pool in (easy_q, medium_q, hard_q)
)

In [None]:
# Re-create the baseline QA pipeline so this cell measures the original model
# even if `nlp` has been rebound in the session (the fine-tuning section of
# this notebook assigns a different pipeline to the same name).
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)

# Baseline: all "easy" questions answered over the complete corpus.
# NOTE(review): the following two cells assign their results to
# ACC_FullContext as well, so the comparison plot shows whichever of these
# cells ran last — confirm which difficulty level is meant to be plotted.
ACC_FullContext = evaluate_qa(
    easy_q,     # data: list of QA dicts
    context,    # context string
    nlp,        # QA pipeline / callable
    THRESHOLD   # similarity threshold (0-100)
)


In [None]:
# Baseline: all "medium" questions answered over the complete corpus.
# NOTE(review): rebinds ACC_FullContext, replacing the value of the previous cell.
ACC_FullContext = evaluate_qa(
    data=medium_q,
    context=context,
    nlp_callable=nlp,
    threshold=THRESHOLD,
)

In [None]:
# Baseline: all "hard" questions answered over the complete corpus.
# NOTE(review): rebinds ACC_FullContext, replacing the value of the previous cell.
ACC_FullContext = evaluate_qa(
    data=hard_q,
    context=context,
    nlp_callable=nlp,
    threshold=THRESHOLD,
)

# Textcorpus reduzieren
### Korpus in Chunks teilen und nur die top_k relevantesten Chunks im Context behalten -> Schnellere Ausführung

In [None]:
# Semantic Chunk Ranking and Context Reduction Cell

from sentence_transformers import SentenceTransformer
import torch
from torch.nn.functional import cosine_similarity
import json

# 1) Load all evaluation questions
# NOTE(review): chunks are ranked against the same questions that are later
# used for evaluation, so the reduced context is tuned to the test set —
# confirm this is intended before comparing accuracies.
all_questions = easy_q + medium_q + hard_q
print(f"Total questions: {len(all_questions)}")

# 2) Split the corpus on blank lines and drop very short chunks
context_chunks = context.split("\n\n")
print(f"Total chunks: {len(context_chunks)}")

min_words = 20
filtered_chunks = [ch for ch in context_chunks if len(ch.split()) >= min_words]
print(f"Chunks ≥ {min_words} words: {len(filtered_chunks)}")
if not filtered_chunks:
    # Fail with a clear message instead of an obscure tensor-shape error below
    raise ValueError(f"No chunk has at least {min_words} words; nothing to rank.")

# 3) Compute semantic embeddings for chunks and questions
embedder = SentenceTransformer("all-MiniLM-L6-v2")
print("Computing chunk embeddings...")
chunk_embeds = embedder.encode(filtered_chunks, convert_to_tensor=True, show_progress_bar=True)

print("Computing question embeddings...")
question_texts = [q["question"] for q in all_questions]
question_embeds = embedder.encode(question_texts, convert_to_tensor=True, show_progress_bar=True)

# 4) Aggregate all question embeddings into one query vector (mean pooling)
query_embed = torch.mean(question_embeds, dim=0)

# 5) Cosine similarity of the query vector against every chunk.
#    cosine_similarity broadcasts the (1, dim) query over the (n_chunks, dim)
#    matrix, so no explicit repeat() is required.
sims = cosine_similarity(query_embed.unsqueeze(0), chunk_embeds, dim=1)
#######################
# torch.topk raises when k exceeds the number of candidates, so never ask
# for more chunks than survived the length filter.
top_k = min(50, len(filtered_chunks))
#######################
top_indices = torch.topk(sims, k=top_k).indices.tolist()

print(f"Selected top {top_k} chunks (by semantic relevance):")
for rank, idx in enumerate(top_indices, 1):
    print(f"  {rank}. Chunk #{idx} — Cosine Score: {sims[idx]:.4f}")

# 6) Build the reduced context (chunks in relevance order) and save it
reduced_chunks = [filtered_chunks[i] for i in top_indices]
reduced_context = "\n\n".join(reduced_chunks)
print(f"Reduced context char length: {len(reduced_context)}")
with open("reduced_context.txt", "w", encoding="utf-8") as f:
    f.write(reduced_context)
print("Reduced context saved to 'reduced_context.txt'")

# 7) Report reduction percentage (by character count)
orig_len = len(context)
new_len = len(reduced_context)
print(f"Context reduced by {(1 - new_len / orig_len) * 100:.2f}%")


# Pipeline mit dem reduzierten Context-file ausführen

In [None]:
with open('reduced_context.txt', "r", encoding="utf-8") as f:
    reduced_context = f.read()

# `sampled_all` was referenced here without ever being assigned, which raises
# NameError on a fresh kernel. Build it from the per-difficulty random
# samples drawn in the question-selection cell.
sampled_all = sampled_easy + sampled_medium + sampled_hard

# Evaluate the sampled questions against the reduced context
ACC_ReducedContext = evaluate_qa(
    sampled_all,      # data: list of QA dicts
    reduced_context,  # context string
    nlp,              # QA pipeline / callable
    THRESHOLD         # similarity threshold (0-100)
)

# Finetuning auf Contextfile/Textkorpus
## Low-Rank Adaptation (LoRA) als effiziente Finetune-Methode

In [None]:
%pip install "transformers>=4.39.0" "peft==0.15.2"

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch

# Load base model and tokenizer
# NOTE(review): this rebinds the notebook-level names `model_name` and
# `tokenizer`, which earlier cells also assign.
model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): the checkpoint name suggests a SQuAD2 question-answering
# model; loading it through AutoModelForMaskedLM presumably leaves the
# masked-LM head newly initialised — check the load-time warning before
# trusting the MLM training loss.
model = AutoModelForMaskedLM.from_pretrained(model_name)

# Prepare model for LoRA
# NOTE(review): prepare_model_for_kbit_training is named for k-bit (quantised)
# models and no quantisation config is passed above — confirm this call is needed.
model = prepare_model_for_kbit_training(model)

# Add LoRA to the model
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query", "value"],  # Common for Transformer attention
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"  # NOTE(review): model is a masked LM, not a causal LM — confirm this task type is appropriate
)
model = get_peft_model(model, peft_config)

# Load your context data
# (same file name the setup cell writes the combined corpus to)
with open("complete_context.txt", "r", encoding="utf-8") as f:
    context_text = f.read()

# Chunk into smaller samples for training
chunk_size = 512
# Tokenise the whole corpus in one pass with truncation disabled and take the
# first (only) row of ids. NOTE(review): rebinds `tokens`, which the token
# statistics cell also assigns.
tokens = tokenizer(context_text, return_tensors="pt", truncation=False)["input_ids"][0]
# Consecutive, non-overlapping windows of chunk_size ids; the length filter
# drops a trailing window that is shorter than chunk_size.
chunks = [tokens[i:i+chunk_size] for i in range(0, len(tokens), chunk_size) if len(tokens[i:i+chunk_size]) == chunk_size]

# Prepare dataset
dataset = Dataset.from_dict({"input_ids": chunks})
def tokenize(batch):
    # Despite the name, no tokenisation happens here: the ids are passed
    # through and duplicated under "labels".
    # NOTE(review): map() below is called without batched=True, so `batch`
    # is presumably a single example — confirm.
    return {"input_ids": batch["input_ids"], "labels": batch["input_ids"]}
dataset = dataset.map(tokenize)

# Data collator for masked language modeling (mlm enabled, probability 0.15)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Training setup
training_args = TrainingArguments(
    output_dir="./lora-roberta-context-finetuned",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    save_total_limit=1,
    logging_steps=10,
    save_steps=50,
    fp16=torch.cuda.is_available(),  # half precision only when CUDA is available
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train
trainer.train()

# Save LoRA fine-tuned model
# The target directory has the same name as TrainingArguments.output_dir above.
# NOTE(review): for a PEFT-wrapped model this presumably stores only the
# adapter weights — confirm the QA pipeline in the next section can load
# this directory as a question-answering model.
model.save_pretrained("lora-roberta-context-finetuned")
tokenizer.save_pretrained("lora-roberta-context-finetuned")

# Testen des Modells nach Finetuning

In [None]:
from transformers import pipeline
# Build a QA pipeline from the directory written by the fine-tuning cell.
# This rebinds `nlp`, so cells run afterwards use the fine-tuned pipeline.
nlp = pipeline('question-answering', model='lora-roberta-context-finetuned', tokenizer='lora-roberta-context-finetuned')

# `sampled_questions` was referenced here without ever being assigned, which
# raises NameError on a fresh kernel. Use the per-difficulty random samples
# drawn in the question-selection cell.
# NOTE(review): assumed to be the intended evaluation set — confirm.
sampled_questions = sampled_easy + sampled_medium + sampled_hard

# Evaluate the sampled questions over the complete corpus
ACC_FT = evaluate_qa(
    sampled_questions,  # data: list of QA dicts
    context,            # context string
    nlp,                # QA pipeline / callable
    THRESHOLD           # similarity threshold (0-100)
)

# Vergleich der Performances

In [None]:
import matplotlib.pyplot as plt

# Accuracies (in percent) produced by the three evaluation sections above
labels = ['Full Context', 'Reduced Context', 'Fine-Tuned']
accuracies = [ACC_FullContext, ACC_ReducedContext, ACC_FT]

# Explicit figure/axes objects instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(labels, accuracies)
ax.set_ylabel('Accuracy (%)')
ax.set_ylim(0, 100)
ax.set_title('QA Model Performance Comparison')
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Print each value slightly above its bar
for i, v in enumerate(accuracies):
    ax.text(i, v + 1, f"{v:.1f}%", ha='center')

fig.tight_layout()
plt.show()

# Finetuning auf Beispiel-QA Fragen (keine Fragen aus Testset)