In [6]:
!pip install datasets
!pip install penman
!wget -O amr3.0.tar.gz https://amr.isi.edu/download/amr-bank-3.0.txt
!mkdir amr_data
!tar -xvzf amr3.0.tar.gz -C amr_data
!pip install transformers datasets torch

  pid, fd = os.forkpty()


--2024-12-08 11:35:41--  https://amr.isi.edu/download/amr-bank-3.0.txt
Resolving amr.isi.edu (amr.isi.edu)... failed: Name or service not known.
wget: unable to resolve host address 'amr.isi.edu'
mkdir: cannot create directory 'amr_data': File exists

gzip: stdin: unexpected end of file
tar: Child returned status 1
tar: Error is not recoverable: exiting now


### AMR dataset

In [7]:
from datasets import load_dataset

# Load AMR dataset
dataset = load_dataset("tverous/anli-amr", split="train")

# View the data
print(dataset[0])
print(dataset.shape)

{'uid': '2093cfb3-a15f-4282-81e3-0cb793ffd0d7', 'premise': 'TOKYO, Dec 18 (Reuters) - Japan’s Shionogi & Co said on Tuesday that it has applied to health regulators in the United States, Canada and Europe for approval of its HIV drug Dolutegravir. Shionogi developed Dolutegravir with a Viiv Healthcare, an AIDS drug joint venture between GlaxoSmithKline and Pfizer, in exchange for its rights to the drug.', 'hypothesis': 'The article was written on December 18th.', 'label': 0, 'reason': 'TOKYO, Dec 18 (Reuters) is when the article was written as it states in the first words of the sentence', 'claim_cleaned_amr': '( z0 write :ARG1 ( z1 article ) :time ( z2 date-entity :day 18 :month 12 ) )', 'amr_penman': '(z0 / write-01\n    :ARG1 (z1 / article)\n    :time (z2 / date-entity\n              :day 18\n              :month 12))', 'amr_tokens': ['The', 'article', 'was', 'written', 'on', 'December', '18th', '.'], 'amr_nodes': "{'z1': 'article', 'z0': 'write-01', 'z2': 'date-entity', '0': '12', 

In [8]:
from datasets import load_dataset
import pandas as pd



# Function to extract AMR graph and text
def extract_amr_and_text(data):
    amr_text_pairs = []
    for row in data:
        amr_graph = row.get("amr_penman", None)
        text = row.get("hypothesis", None)
        if amr_graph and text:
            amr_text_pairs.append({"amr_graph": amr_graph, "text": text})
    return amr_text_pairs

# Extract AMR graphs and texts for all rows
amr_text_pairs = extract_amr_and_text(dataset)

amrs = []
texts = []

for i in range(100459):
    amrs.append(amr_text_pairs[i]['amr_graph'])
    texts.append(amr_text_pairs[i]['text'])

# Creating DataFrame with 'amr_graph' and 'text' columns
data_amr = pd.DataFrame({
    'amr_graph': amrs,
    'text': texts
})

data_amr.head()  # Displaying the first few rows to verify

Unnamed: 0,amr_graph,text
0,(z0 / write-01\n :ARG1 (z1 / article)\n ...,The article was written on December 18th.
1,(z0 / urge-01\n :ARG0 (z1 / person\n ...,Gillum was on TV urging residents to stay out ...
2,(z0 / and\n :op1 (z1 / beat-03\n ...,Carlton beat Melbourne in 2016 and will attemp...
3,(z0 / close-01\n :ARG1 (z1 / road)\n :du...,The road was closed for more than two hours af...
4,(z0 / advise-01\n :ARG2 (z1 / slow-down-03)),Its advisible to slow down


### Transformer Finetuning

In [9]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Convert the DataFrame to a Hugging Face dataset
dataset = Dataset.from_pandas(data_amr)

# Select a smaller subset if needed
small_dataset = dataset.select([i for i in range(25000)])

# Define prompt and answer templates
prompt_template = """Translate from Graph to Text- Instruction: {instruction}\n Response:"""
answer_template = """{response}"""

# Define function to add keys in the dictionary for prompt, answer, and combined text
def _add_text(rec):
    instruction = rec["amr_graph"]  # Use amr_graph as instruction
    response = rec["text"]  # Use text as response
    
    # Check if both exist; raise error if not
    if not instruction:
        raise ValueError(f"Expected an instruction (amr_graph) in: {rec}")
    if not response:
        raise ValueError(f"Expected a response (text) in: {rec}")
    
    # Create prompt, answer, and combined text
    rec["prompt"] = prompt_template.format(instruction=instruction)
    rec["answer"] = answer_template.format(response=response)
    rec["text"] = rec["prompt"] + rec["answer"]
    return rec

# Apply the function to the dataset
small_dataset = small_dataset.map(_add_text)

# Print the first item to check
print(small_dataset[0])


Using device: cuda


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

{'amr_graph': '(z0 / write-01\n    :ARG1 (z1 / article)\n    :time (z2 / date-entity\n              :day 18\n              :month 12))', 'text': 'Translate from Graph to Text- Instruction: (z0 / write-01\n    :ARG1 (z1 / article)\n    :time (z2 / date-entity\n              :day 18\n              :month 12))\n Response:The article was written on December 18th.', 'prompt': 'Translate from Graph to Text- Instruction: (z0 / write-01\n    :ARG1 (z1 / article)\n    :time (z2 / date-entity\n              :day 18\n              :month 12))\n Response:', 'answer': 'The article was written on December 18th.'}


In [11]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq
from typing import Dict, List
from functools import partial
import copy
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from datasets import DatasetDict


model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

# Set the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

MAX_LENGTH = 256

# Function to generate token embeddings from the text part of the batch
def _preprocess_batch(batch: Dict[str, List]):  
    model_inputs = tokenizer(batch["text"], max_length=MAX_LENGTH, truncation=True, padding='max_length')    
    model_inputs["labels"] = copy.deepcopy(model_inputs['input_ids'])
    return model_inputs

_preprocessing_function = partial(_preprocess_batch)



# Define the split ratios
train_test_split = small_dataset.train_test_split(test_size=0.2)  # Split off 20% as test set
train_valid_split = train_test_split['train'].train_test_split(test_size=0.1)  # From train, split 10% as validation

# Combine splits into a DatasetDict
dataset_dict = DatasetDict({
    'train': train_valid_split['train'],
    'validation': train_valid_split['test'],
    'test': train_test_split['test']
})

# Print the size of each split to verify
print(f"Train set size: {len(dataset_dict['train'])}")
print(f"Validation set size: {len(dataset_dict['validation'])}")
print(f"Test set size: {len(dataset_dict['test'])}")

# Example check for first item in each split
print("Sample from train:", dataset_dict['train'][0])
print("Sample from validation:", dataset_dict['validation'][0])
print("Sample from test:", dataset_dict['test'][0])


# Apply the preprocessing function to each batch in the dataset
encoded_train_dataset = dataset_dict['train'].map(
    _preprocessing_function,
    batched=True,
    remove_columns=["amr_graph", "text", "prompt", "answer"],
)

encoded_validation_dataset = dataset_dict['validation'].map(
    _preprocessing_function,
    batched=True,
    remove_columns=["amr_graph", "text", "prompt", "answer"],
)

encoded_test_dataset = dataset_dict['test'].map(
    _preprocessing_function,
    batched=True,
    remove_columns=["amr_graph", "text", "prompt", "answer"],
)
processed_train_dataset = encoded_train_dataset.filter(lambda rec: len(rec["input_ids"]) <= MAX_LENGTH)
processed_validation_dataset = encoded_validation_dataset.filter(lambda rec: len(rec["input_ids"]) <= MAX_LENGTH)
processed_test_dataset = encoded_test_dataset.filter(lambda rec: len(rec["input_ids"]) <= MAX_LENGTH)







# print(processed_train_dataset)
# print(processed_validation_dataset)
# print(processed_test_dataset)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Train set size: 18000
Validation set size: 2000
Test set size: 5000
Sample from train: {'amr_graph': '(z0 / like-01\n    :ARG0 (z1 / person\n              :name (z2 / name\n                        :op1 "Scott"\n                        :op2 "Lighty"))\n    :ARG1 (z3 / car))', 'text': 'Translate from Graph to Text- Instruction: (z0 / like-01\n    :ARG0 (z1 / person\n              :name (z2 / name\n                        :op1 "Scott"\n                        :op2 "Lighty"))\n    :ARG1 (z3 / car))\n Response:Scott Lighty likes cars.', 'prompt': 'Translate from Graph to Text- Instruction: (z0 / like-01\n    :ARG0 (z1 / person\n              :name (z2 / name\n                        :op1 "Scott"\n                        :op2 "Lighty"))\n    :ARG1 (z3 / car))\n Response:', 'answer': 'Scott Lighty likes cars.'}
Sample from validation: {'amr_graph': '(z0 / have-degree-91\n    :ARG1 (z1 / get-01\n              :ARG1 (z2 / plant\n                        :ARG1-of (z3 / root-02\n                  

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/18000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [12]:
train_valid_split['train']

Dataset({
    features: ['amr_graph', 'text', 'prompt', 'answer'],
    num_rows: 18000
})

In [14]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import DataCollatorForSeq2Seq

# Enable W&B dry run mode
os.environ["WANDB_MODE"] = "dryrun"

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer and model for T5
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir='/mnt/disks/disk1/results',
    evaluation_strategy='epoch',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # Accumulate gradients for 4 steps
    warmup_steps=50,
    learning_rate=5e-3,        # Lowered learning rate
    weight_decay=0.01,          # Reduced weight decay to prevent over-penalizing weights
    logging_dir='/mnt/disks/disk1/logs'
)

# Initialize the data collator for sequence-to-sequence tasks
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Initialize Trainer with the data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_validation_dataset,
    data_collator=data_collator
)

# Train the model
trainer.train()

# Save the model and tokenizer explicitly
model_output_dir = '/mnt/disks/disk1/results'
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)


Using device: cuda




Epoch,Training Loss,Validation Loss
1,0.0081,0.001092
2,0.004,0.000594
3,0.0017,0.0002
4,0.0006,6.4e-05
5,0.0003,2.6e-05


('/mnt/disks/disk1/results/tokenizer_config.json',
 '/mnt/disks/disk1/results/special_tokens_map.json',
 '/mnt/disks/disk1/results/spiece.model',
 '/mnt/disks/disk1/results/added_tokens.json',
 '/mnt/disks/disk1/results/tokenizer.json')

In [15]:
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

('/mnt/disks/disk1/results/tokenizer_config.json',
 '/mnt/disks/disk1/results/special_tokens_map.json',
 '/mnt/disks/disk1/results/spiece.model',
 '/mnt/disks/disk1/results/added_tokens.json',
 '/mnt/disks/disk1/results/tokenizer.json')

In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def get_model_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    return total_params

def main(input_text):
    # Load the tokenizer and model from the saved directory
    model_path = '/mnt/disks/disk1/results'
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

    # Calculate the number of parameters in the model being used for inference
    total_params = get_model_parameters(model)
    print(f"Total number of parameters: {total_params}")

    # Prepare the input text for generation
    inputs = tokenizer(input_text, return_tensors='pt')

    # Generate text
    outputs = model.generate(**inputs, max_length=100, num_return_sequences=1)

    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Generated text:", generated_text)

# Example input for inference
example_input = "(z0 / dilligent\n:domain (z1 / doctor\n:name (z2 / name\n:op1 'Henry'\n :op2 'Friesen')))"
main(example_input)


Total number of parameters: 60506624
Generated text: Translate from :domain (z1 / doctor :name (z2 / name :op1 'Henry' :op2 'Friesen')))


In [17]:
print(dataset_dict['train'][10]['amr_graph'])
print("________________________________________")
print(dataset_dict['train'][10]['answer'])

(z0 / contain-01
    :ARG0 (z1 / state
              :name (z2 / name
                        :op1 "Texas"))
    :ARG1 (z3 / person
              :ARG1-of (z4 / name-01
                           :ARG2 (z5 / dean))))
________________________________________
Texas contains a person named dean


In [18]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import re

def get_model_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    return total_params

def main(input_text):
    # Load the tokenizer and model from the saved directory
    model_path = '/mnt/disks/disk1/results'
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

    # Calculate the number of parameters in the model being used for inference
    total_params = get_model_parameters(model)
    #print(f"Total number of parameters: {total_params}")

    # Prepare the input text for generation
    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Generate text
    outputs = model.generate(**inputs, max_length=500, num_return_sequences=1)

    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # print(generated_text)
    # Extract only the response part by splitting based on "Response:"
    match = re.search(r"Response:\s*(.*)", generated_text)
    # print(generated_text)
    if match:
        response_text = match.group(1)
        # Remove extra spaces between sentences
        response_text = re.sub(r'\s{2,}', ' ', response_text)
        # Keep only up to the first sensible sentence-ending punctuation
        response_text = re.split(r'[.!?]', response_text)[0].strip() + '.'
        #print("Response text:", response_text)
        return response_text
    else:
        #print("Response not found in generated text")
        return "Response not found in generated text"

# Example input for inference
example_input = """
translate from Graph to Text: (z0 / group
    :name (z1 / name
              :op1 "Demix")
    :ARG0-of (z2 / experiment-01
                 :polarity -))
"""
output = main(example_input)
print(output)


Response not found in generated text


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_bleu(predicted_text, ground_truth_text):
    # Tokenize the texts into lists of words
    reference = [ground_truth_text.split()]  # BLEU expects a list of references
    hypothesis = predicted_text.split()
    
    # Calculate BLEU score with smoothing
    smoothie = SmoothingFunction().method4  # Use smoothing to handle short texts
    bleu_score = sentence_bleu(reference, hypothesis, smoothing_function=smoothie)
    
    return bleu_score

# Example texts
predicted_text = "The two balls went behind 8 balls"
ground_truth_text = "The 2-ball goes directly behind the 8-ball"

# Calculate and print BLEU score
bleu = calculate_bleu(predicted_text, ground_truth_text)
print(f"BLEU score: {bleu}")


## Average bleu score calculations

In [None]:
import pandas as pd

# Initialize variables for BLEU score calculation
bleu_score = 0
valid_count = 0  # Counter for valid BLEU scores
k = 10  # Define the number of samples to evaluate

# Lists to store generated and ground truth text, and AMR graphs for valid entries
generated_texts = []
ground_truth_texts = []
valid_amr_graphs = []

# List to store indices where BLEU score is invalid or input length > 500
invalid_indices = []

# Loop through the dataset
for i in range(k):
    example_input = dataset_dict['test'][i]['amr_graph']
    ground_truth_text = dataset_dict['test'][i]['answer']
    
    # Tokenize and check input length
    tokenized_input = tokenizer(example_input, return_tensors='pt')
    input_length = tokenized_input['input_ids'].shape[1]
    
    # Skip examples with input length greater than 500
    if input_length > 500:
        invalid_indices.append(i)  # Store the index where input length is greater than 500
        continue

    # Generate model output and calculate BLEU score
    model_output_text = main(example_input)
    bleu = calculate_bleu(model_output_text, ground_truth_text)

    # Store generated and ground truth text only if BLEU score is valid
    if bleu > 0:
        bleu_score += bleu
        valid_count += 1  # Increment count of valid scores
        generated_texts.append(model_output_text)
        ground_truth_texts.append(ground_truth_text)
        valid_amr_graphs.append(example_input)  # Include AMR graph for valid entries
    else:
        invalid_indices.append(i)  # Store the index where BLEU score is invalid (<= 0)

# Calculate the average BLEU score only if there are valid scores
if valid_count > 0:
    avg_bleu_score = bleu_score / valid_count
else:
    avg_bleu_score = 0.0  # Set average to zero if no valid scores were found

# Print the average BLEU score and invalid indices
print("Average BLEU score:", avg_bleu_score)
print("Invalid indices (input length > 500 or BLEU score <= 0):", invalid_indices)

In [None]:

# Create a DataFrame to store the generated, ground truth text, and AMR graphs for valid entries
df = pd.DataFrame({
    'Generated Text': generated_texts,
    'Ground Truth Text': ground_truth_texts,
    'AMR Graph': valid_amr_graphs  # Include the AMR graph only for valid entries
})

# Optionally, save to a CSV file
df.to_csv('generated_vs_ground_truth_with_amr.csv', index=False)

# Print the first few rows of the DataFrame
print(df.head())


In [None]:
df['AMR Graph']

### Finetuned Model

In [None]:
import re
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Path to the fine-tuned model directory
model_path = "/kaggle/input/graph2sequencefinetune-model/checkpoint-3375"

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")  # Assuming standard GPT-2 tokenizer; change if custom

# Load the model
model = GPT2LMHeadModel.from_pretrained(
    model_path,  # Path to the directory containing the checkpoint
    local_files_only=True  # Load files from the local path
)

# Set the model to evaluation mode
model.eval()

# Text for inference
input_text = """
(z0 / go-01
    :ARG1 (z1 / ball
              :quant 2)
    :ARG1-of (z2 / direct-02)
    :ARG4 (z3 / behind
              :op1 (z4 / ball
                       :quant 8)))
"""

# Tokenize the input
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate predictions
outputs = model.generate(
    input_ids,
    max_length=256,  # Set the maximum length of the generated text
    num_beams=5,    # Beam search for better quality
    no_repeat_ngram_size=2,  # Avoid repetitive n-grams
    early_stopping=True
)

# Decode the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract only the response part
match = re.search(r"Response:\s*(.*)", generated_text)
if match:
    response_text = match.group(1)
    # Remove extra spaces between sentences
    response_text = re.sub(r'\s{2,}', ' ', response_text)
    # Keep only up to the first sensible sentence-ending punctuation
    response_text = re.split(r'[.!?]', response_text)[0].strip() + '.'
    print("Generated text:", response_text)
else:
    print("Response not found in generated text.")
