<a href="https://colab.research.google.com/github/baraki-weldat/Cypher-Generation/blob/main/Code_LLama_2_Final_Fine_Tuning_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing Necessary Tools and Packages


In [1]:
%%capture
%pip install accelerate peft bitsandbytes transformers trl datasets
%pip install nltk rouge-score

# Import Implementation Packages


In [2]:
# Import the most important packages
import pandas as pd
import os
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import  BitsAndBytesConfig, TrainingArguments
from transformers import  Trainer

from peft import get_peft_model
from peft import LoraConfig
import bitsandbytes as bnb

# Import the Evaluation Metrics
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
# Dataset spliting and formatting libraries
from sklearn.model_selection import train_test_split
from datasets import Dataset
import string
from google.colab import drive
from sklearn.model_selection import train_test_split
from trl import SFTTrainer

# Data Preparation and Cleaning

In [4]:
drive.mount("/content/drive", force_remount=True)
# Import the datasets
EvalCompanies = pd.read_excel("/content/drive/MyDrive/R and D from ABE/Raw Datasets/Companies Dataset.xlsx")
EvalMovies = pd.read_excel("/content/drive/MyDrive/R and D from ABE/Raw Datasets/Movies Dataset.xlsx")
EvalNetwork = pd.read_excel("/content/drive/MyDrive/R and D from ABE/Raw Datasets/Network Datasets.xlsx")

Mounted at /content/drive


In [5]:
# Drop unnecessary Columns
print(EvalCompanies.columns, EvalNetwork.columns, EvalMovies.columns)
EvalCompanies = EvalCompanies[['Natural Language Question', 'Cypher Query']]
EvalMovies = EvalMovies[['question', 'cypher']]
EvalNetwork = EvalNetwork[['Natural Language Question', 'Cypher Query']]
print(EvalCompanies.columns, EvalNetwork.columns, EvalMovies.columns)
# Rename Column names of the movies
EvalMovies= EvalMovies.rename(columns={'question': 'Natural Language Question', 'cypher': 'Cypher Query'})
print("The number of Evaluation datasets")
EvalCompanies.count(),EvalMovies.count(), EvalNetwork.count()
print("Total Evaluation datasets are:", len(EvalCompanies),len(EvalMovies),len(EvalNetwork))


Index(['Natural Language Question', 'Cypher Query', 'Unnamed: 2'], dtype='object') Index(['Natural Language Question', 'Cypher Query'], dtype='object') Index(['question', 'cypher', 'validated_cypher', 'vote', 'has_answer',
       'database', 'database.1', 'Status'],
      dtype='object')
Index(['Natural Language Question', 'Cypher Query'], dtype='object') Index(['Natural Language Question', 'Cypher Query'], dtype='object') Index(['question', 'cypher'], dtype='object')
The number of Evaluation datasets
Total Evaluation datasets are: 139 265 69


## Import the Synthetic datasets generated by GPT-4 Turbo  


In [6]:
# Import the Synthetic datasets
SyntheticDatasets = pd.read_csv("/content/drive/MyDrive/R and D from ABE/text2cypher-main/datasets/synthetic_gpt4turbo_demodbs/text2cypher_gpt4turbo.csv")

In [7]:
# Filter the required datasets
SyntheticCompanies = SyntheticDatasets[SyntheticDatasets["database"] == 'companies']
SyntheticMovies = SyntheticDatasets[SyntheticDatasets["database"] == 'movies']
SyntheticNetwork = SyntheticDatasets[SyntheticDatasets["database"] == 'network']

# Reset the index with dropping the values
SyntheticCompanies.reset_index(drop=True)
SyntheticMovies.reset_index(drop=True)
SyntheticNetwork.reset_index(drop=True)

# Drop unneccessary Columns from the  datasets
SyntheticCompanies.drop(columns=["type", "database",	"syntax_error", "timeout", "returns_results", "false_schema"],axis=1, inplace= True)
SyntheticMovies.drop(columns=["type", "database",	"syntax_error", "timeout", "returns_results", "false_schema"],axis=1, inplace= True)
SyntheticNetwork.drop(columns=["type", "database",	"syntax_error", "timeout", "returns_results", "false_schema"],axis=1, inplace= True)

# The columns of the datasets
SyntheticCompanies.columns, SyntheticMovies.columns, SyntheticNetwork.columns

# Rename Columns
SyntheticCompanies.columns= ["Natural_Language_Question", "Cypher_Query"]
SyntheticMovies.columns  = ["Natural_Language_Question", "Cypher_Query"]
SyntheticNetwork.columns = ["Natural_Language_Question", "Cypher_Query"]

# The number of entries of each of the datasets
len(SyntheticCompanies)+len(SyntheticMovies)+len(SyntheticNetwork)
# Drop entries with null values
EvalNetwork = EvalNetwork.dropna()
EvalMovies = EvalMovies.dropna()
EvalCompanies = EvalCompanies.dropna()
EvalNetwork.isnull().sum(), EvalMovies.isnull().sum(),EvalCompanies.isnull().sum()

# Rename Columns for Conformity
EvalMovies.columns =  ["Natural_Language_Question", "Cypher_Query"]
EvalCompanies.columns= ["Natural_Language_Question", "Cypher_Query"]
EvalNetwork.columns = ["Natural_Language_Question", "Cypher_Query"]
# Data Preprocessing
def Preprocess(text):
    PreProcessedText = text.lower().translate(str.maketrans('', '', string.punctuation))    # Convert to lowercase

    return PreProcessedText
# Preprocess the evaluation datasets
EvalMovies["Natural_Language_Question"]  = EvalMovies["Natural_Language_Question"].apply(Preprocess)
EvalCompanies["Natural_Language_Question"]  = EvalCompanies["Natural_Language_Question"].apply(Preprocess)
EvalNetwork["Natural_Language_Question"]  = EvalNetwork["Natural_Language_Question"].apply(Preprocess)

# Preprocessing the training datasets
SyntheticCompanies["Natural_Language_Question"] = SyntheticCompanies["Natural_Language_Question"].apply(Preprocess)
SyntheticMovies["Natural_Language_Question"]  =SyntheticMovies["Natural_Language_Question"].apply(Preprocess)
SyntheticNetwork["Natural_Language_Question"]  = SyntheticNetwork["Natural_Language_Question"].apply(Preprocess)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SyntheticCompanies.drop(columns=["type", "database",	"syntax_error", "timeout", "returns_results", "false_schema"],axis=1, inplace= True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SyntheticMovies.drop(columns=["type", "database",	"syntax_error", "timeout", "returns_results", "false_schema"],axis=1, inplace= True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SyntheticNetwork.drop(columns=["type", "database",	"syntax_error", "timeout", "returns_results", "false

In [14]:
# Split Validation and Training datasets
SyntheticMoviesTrain, SyntheticMoviesValid = train_test_split( SyntheticMovies,test_size=0.05, random_state=42)
SyntheticNetworkTrain, SyntheticNetworkValid = train_test_split( SyntheticNetwork,test_size=0.05, random_state=42)
SyntheticCompaniesTrain, SyntheticCompaniesValid = train_test_split( SyntheticCompanies,test_size=0.05, random_state=42)

# 1. Code LLama on Movies Datasets

In [9]:
# Model Configuration
base_model = "codellama/CodeLlama-7b-hf"
new_model = "code-llama-7b-Moviesfinetuned"

# Load dataset
train_dataset = Dataset.from_pandas(SyntheticMoviesTrain)
test_dataset = Dataset.from_pandas(SyntheticMoviesValid)
Eval_dataset = Dataset.from_pandas(EvalMovies)

# Tokenizer setup
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Preprocessing function to tokenize the input data
def preprocess_function(examples):
    inputs = examples["Natural_Language_Question"]
    targets = examples["Cypher_Query"]

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(text_target=targets, max_length=512, truncation=True, padding="max_length")["input_ids"]

    model_inputs["labels"] = labels
    return model_inputs

# Tokenize the dataset
train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)
Eval_dataset   = Eval_dataset.map(preprocess_function, batched=True)
# 4-bit quantization configuration
# dtype = getattr(torch, "float16")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Loading Code Llama model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Defining LoRA parameters (Adapter Layer )
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=32,
    bias="none",
    task_type="CAUSAL_LM",
)

# Attach trainable adapters
model = get_peft_model(model, peft_params)

# Training Parameters
training_params = TrainingArguments(
    output_dir="./CodeLlama2Movies",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    eval_steps=0.2,
    warmup_steps=10,
    optim="paged_adamw_32bit",
    logging_strategy="steps",
    save_steps=25,
    logging_steps=1,
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="tensorboard"
)

# Model fine-tuning
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_params,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_params,
    packing= False,
)

trainer.train()

# Save the model
trainer.save_model(new_model)
tokenizer.save_pretrained(new_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Map:   0%|          | 0/728 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

Map:   0%|          | 0/265 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



Step,Training Loss,Validation Loss
73,1.8615,2.055156
146,2.7852,1.856403
219,1.6081,1.743119




Step,Training Loss,Validation Loss
73,1.8615,2.055156
146,2.7852,1.856403
219,1.6081,1.743119
292,1.6195,1.684338




('code-llama-7b-Moviesfinetuned/tokenizer_config.json',
 'code-llama-7b-Moviesfinetuned/special_tokens_map.json',
 'code-llama-7b-Moviesfinetuned/tokenizer.model',
 'code-llama-7b-Moviesfinetuned/added_tokens.json',
 'code-llama-7b-Moviesfinetuned/tokenizer.json')

In [None]:
# Evaluation on test dataset
def generate_predictions(model, tokenizer, test_data):
    predictions = []
    references = []
    for example in test_data:
        question = example["Natural_Language_Question"]
        reference = example["Cypher_Query"]
        inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
        outputs = model.generate(**inputs, max_length=512, pad_token_id=tokenizer.eos_token_id)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(prediction)
        references.append(reference)
    return predictions, references

# Prepare the test data for generation
Eval_dataset = Eval_dataset
predictions, references = generate_predictions(model, tokenizer, Eval_dataset)

# Calculate BLEU score
bleu_score = corpus_bleu([[ref] for ref in references], predictions)
print(f"BLEU score: {bleu_score:.4f}")

# Calculate ROUGE scores
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
rouge1_scores = []
rougeL_scores = []
for ref, pred in zip(references, predictions):
    scores = scorer.score(ref, pred)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

average_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
average_rougeL = sum(rougeL_scores) / len(rougeL_scores)
print(f"ROUGE-1 score: {average_rouge1:.4f}")
print(f"ROUGE-L score: {average_rougeL:.4f}")

# Save the predictions and references
results = pd.DataFrame({
    'Question': [example["Natural_Language_Question"] for example in Eval_dataset],
    'Reference': references,
    'Prediction': predictions
})

# Export the dataframe to a CSV file
results.to_csv('predictions_references.csv', index=False)

# 2. Fine Tuning the Companies Dataset



In [15]:
# Model Configuration
base_model = "codellama/CodeLlama-7b-hf"
new_model = "code-llama-7b-Companiesfinetuned"

# Load dataset
train_dataset = Dataset.from_pandas(SyntheticCompaniesTrain)
test_dataset = Dataset.from_pandas(SyntheticCompaniesValid)
Eval_dataset = Dataset.from_pandas(EvalCompanies)

# Tokenizer setup
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Preprocessing function to tokenize the input data
def preprocess_function(examples):
    inputs = examples["Natural_Language_Question"]
    targets = examples["Cypher_Query"]

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(text_target=targets, max_length=512, truncation=True, padding="max_length")["input_ids"]

    model_inputs["labels"] = labels
    return model_inputs

# Tokenize the dataset
train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)
Eval_dataset   = Eval_dataset.map(preprocess_function, batched=True)
# 4-bit quantization configuration
# dtype = getattr(torch, "float16")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Loading Code Llama model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Defining LoRA parameters (Adapter Layer )
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=32,
    bias="none",
    task_type="CAUSAL_LM",
)

# Attach trainable adapters
model = get_peft_model(model, peft_params)

# Training Parameters
training_params = TrainingArguments(
    output_dir="./CodeLlama2Companies",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    eval_steps=0.2,
    warmup_steps=10,
    optim="paged_adamw_32bit",
    logging_strategy="steps",
    save_steps=25,
    logging_steps=1,
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="tensorboard"
)

# Model fine-tuning
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_params,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_params,
    packing= False,
)

trainer.train()

# Save the model
trainer.save_model(new_model)
tokenizer.save_pretrained(new_model)

Map:   0%|          | 0/950 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Step,Training Loss,Validation Loss
95,1.9061,2.02306
190,2.4081,1.771935
285,1.4053,1.60061
380,1.2968,1.486978
475,0.995,1.402317




('code-llama-7b-Companiesfinetuned/tokenizer_config.json',
 'code-llama-7b-Companiesfinetuned/special_tokens_map.json',
 'code-llama-7b-Companiesfinetuned/tokenizer.model',
 'code-llama-7b-Companiesfinetuned/added_tokens.json',
 'code-llama-7b-Companiesfinetuned/tokenizer.json')

In [None]:
# Evaluation on test dataset
def generate_predictions(model, tokenizer, test_data):
    predictions = []
    references = []
    for example in test_data:
        question = example["Natural_Language_Question"]
        reference = example["Cypher_Query"]
        inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
        outputs = model.generate(**inputs, max_length=512, pad_token_id=tokenizer.eos_token_id)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(prediction)
        references.append(reference)
    return predictions, references

# Prepare the test data for generation
Eval_dataset = Eval_dataset
predictions, references = generate_predictions(model, tokenizer, Eval_dataset)

# Calculate BLEU score
bleu_score = corpus_bleu([[ref] for ref in references], predictions)
print(f"BLEU score: {bleu_score:.4f}")

# Calculate ROUGE scores
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
rouge1_scores = []
rougeL_scores = []
for ref, pred in zip(references, predictions):
    scores = scorer.score(ref, pred)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

average_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
average_rougeL = sum(rougeL_scores) / len(rougeL_scores)
print(f"ROUGE-1 score: {average_rouge1:.4f}")
print(f"ROUGE-L score: {average_rougeL:.4f}")

# Save the predictions and references
results = pd.DataFrame({
    'Question': [example["Natural_Language_Question"] for example in Eval_dataset],
    'Reference': references,
    'Prediction': predictions
})

# Export the dataframe to a CSV file
results.to_csv('predictions_references.csv', index=False)

# 3. Networks Dataset

In [None]:
# Model Configuration
base_model = "codellama/CodeLlama-7b-hf"
new_model = "code-llama-7b-Networkfinetuned"

# Load dataset
train_dataset = Dataset.from_pandas(SyntheticNetworkTrain)
test_dataset = Dataset.from_pandas(SyntheticNetworkValid)
Eval_dataset = Dataset.from_pandas(EvalNetwork)

# Tokenizer setup
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Preprocessing function to tokenize the input data
def preprocess_function(examples):
    inputs = examples["Natural_Language_Question"]
    targets = examples["Cypher_Query"]

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(text_target=targets, max_length=512, truncation=True, padding="max_length")["input_ids"]

    model_inputs["labels"] = labels
    return model_inputs

# Tokenize the dataset
train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)
Eval_dataset   = Eval_dataset.map(preprocess_function, batched=True)
# 4-bit quantization configuration
# dtype = getattr(torch, "float16")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Loading Code Llama model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Defining LoRA parameters (Adapter Layer )
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=32,
    bias="none",
    task_type="CAUSAL_LM",
)

# Attach trainable adapters
model = get_peft_model(model, peft_params)

# Training Parameters
training_params = TrainingArguments(
    output_dir="./CodeLlama2Network",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    eval_steps=0.2,
    warmup_steps=10,
    optim="paged_adamw_32bit",
    logging_strategy="steps",
    save_steps=25,
    logging_steps=1,
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="tensorboard"
)

# Model fine-tuning
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_params,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_params,
    packing= False,
)

trainer.train()

# Save the model
trainer.save_model(new_model)
tokenizer.save_pretrained(new_model)

In [None]:
# Evaluation on test dataset
def generate_predictions(model, tokenizer, test_data):
    predictions = []
    references = []
    for example in test_data:
        question = example["Natural_Language_Question"]
        reference = example["Cypher_Query"]
        inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
        outputs = model.generate(**inputs, max_length=512, pad_token_id=tokenizer.eos_token_id)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(prediction)
        references.append(reference)
    return predictions, references

# Prepare the test data for generation
Eval_dataset = Eval_dataset
predictions, references = generate_predictions(model, tokenizer, Eval_dataset)

# Calculate BLEU score
bleu_score = corpus_bleu([[ref] for ref in references], predictions)
print(f"BLEU score: {bleu_score:.4f}")

# Calculate ROUGE scores
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
rouge1_scores = []
rougeL_scores = []
for ref, pred in zip(references, predictions):
    scores = scorer.score(ref, pred)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

average_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
average_rougeL = sum(rougeL_scores) / len(rougeL_scores)
print(f"ROUGE-1 score: {average_rouge1:.4f}")
print(f"ROUGE-L score: {average_rougeL:.4f}")

# Save the predictions and references
results = pd.DataFrame({
    'Question': [example["Natural_Language_Question"] for example in Eval_dataset],
    'Reference': references,
    'Prediction': predictions
})

# Export the dataframe to a CSV file
results.to_csv('predictions_references.csv', index=False)