In [1]:
# Redirect cache directories into /workspace/.cache.
# The XDG Base Directory spec names the cache root XDG_CACHE_HOME; the
# previous XDG_CACHE is not a variable defined by that spec.
%env XDG_CACHE_HOME=/workspace/.cache
# Hugging Face libraries read HF_HOME for their own cache location.
%env HF_HOME=/workspace/.cache/huggingface


env: XDG_CACHE=/workspace/.cache
env: HF_HOME=/workspace/.cache/huggingface


In [2]:
from datasets import load_dataset, Dataset
import datasets
import pandas as pd
import random


In [3]:
# Load the CatalanQA training split and view it as a DataFrame
dataset = load_dataset("projecte-aina/catalanqa")
df = dataset['train'].to_pandas()

# Parameters
seed = 42
few_shot_count = 5  # solved examples per prompt; one extra sample supplies the open question
random.seed(seed)  # seeded for reproducibility, although nothing in this cell draws random numbers

# Keep only the contexts that appear more than few_shot_count times, i.e. those
# with at least few_shot_count + 1 questions (enough for a full few-shot prompt).
context_counts = df.groupby('context')['id'].count()
filter_min_few_shot = context_counts > few_shot_count
filtered_df = df[df['context'].isin(context_counts[filter_min_few_shot].index)].reset_index(drop=True)

# Build the dataset from the FILTERED frame. Passing the unfiltered `df` here
# made the filter above a no-op and let under-populated contexts through.
# The index was reset above so that from_pandas does not add an index column.
dataset = Dataset.from_pandas(filtered_df)

# Tracker for samples: context text -> first few_shot_count + 1 examples seen
context_samples = {}

for example in dataset:
    bucket = context_samples.setdefault(example['context'], [])
    if len(bucket) < few_shot_count + 1:
        bucket.append(example)

# Every surviving context has more than few_shot_count rows, so each bucket
# must have been filled completely.
assert all(len(bucket) == few_shot_count + 1 for bucket in context_samples.values()), \
    "every tracked context should hold exactly few_shot_count + 1 samples"

# Now, process the tracked samples
def create_prompt_and_answer(context):
    """Build a few-shot prompt and its expected answer for one context.

    The samples are read from the module-level ``context_samples`` mapping.
    The prompt consists of the context text, one solved
    ``Pregunta:/Resposta:`` section for every sample except the last, and a
    final section with the last sample's question and an open ``Resposta:``.
    Sections are separated by ``"\\n----\\n"``.

    Args:
        context: Key into ``context_samples``.

    Returns:
        A dict with keys ``'context'`` (the key passed in), ``'prompt'``
        (the assembled text) and ``'answer'`` (the text of the first answer
        of the last sample).

    Raises:
        KeyError: If ``context`` is not a key of ``context_samples``.
        IndexError: If the sample list is empty or a used sample has an
            empty ``answers`` list.
    """
    samples = context_samples[context]

    # Header: the context text as stored on the first sample.
    sections = [f"{samples[0]['context']}"]

    # Solved examples: every sample except the last one.
    sections.extend(
        f"Pregunta: {sample['question']}\nResposta: {sample['answers'][0]['text']}"
        for sample in samples[:-1]
    )

    # The last sample is the question the model is asked to answer.
    last = samples[-1]
    sections.append(f"Pregunta: {last['question']}\nResposta:")

    # Joining once avoids an empty section (two consecutive separators)
    # when a context has only a single sample and thus no solved examples.
    return {
        'context': context,
        'prompt': "\n----\n".join(sections),
        'answer': last['answers'][0]['text']
    }

# Build one prompt/answer record per tracked context
processed_dataset = list(map(create_prompt_and_answer, context_samples))

# Write the records to CSV without the 'context' column and without the row index
prompts_frame = pd.DataFrame(processed_dataset)
prompts_frame = prompts_frame.drop(columns="context")
prompts_frame.to_csv("catalanqa.csv", index=False)
