In [1]:
#Import libraries and initialize the model
import os
import torch
import time
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import (
    AlbertConfig,
    AlbertForQuestionAnswering,
    AlbertTokenizer,
    squad_convert_examples_to_features
)

from transformers.data.processors.squad import SquadResult, SquadV2Processor, SquadExample

from transformers.data.metrics.squad_metrics import compute_predictions_logits

# READER NOTE: Set this flag to use own model, or use pretrained model in the Hugging Face repository
use_own_model = False

if use_own_model:
  model_name_or_path = "/content/model_output"
else:
  model_name_or_path = "ktrapeznikov/albert-xlarge-v2-squad-v2"

output_dir = ""

# Config
n_best_size = 1
max_answer_length = 100
do_lower_case = True
null_score_diff_threshold = 0.0

def to_list(tensor):
    return tensor.detach().cpu().tolist()

# Setup model
config_class, model_class, tokenizer_class = (
    AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer)
config = config_class.from_pretrained(model_name_or_path)
tokenizer = tokenizer_class.from_pretrained(
    model_name_or_path, do_lower_case=True)
model = model_class.from_pretrained(model_name_or_path, config=config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)


Some weights of the model checkpoint at ktrapeznikov/albert-xlarge-v2-squad-v2 were not used when initializing AlbertForQuestionAnswering: ['albert.pooler.weight', 'albert.pooler.bias']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AlbertForQuestionAnswering(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=2048, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=2048, out_features=2048, bias=True)
                (key): Linear(in_features=2048, out_features=2048, bias=True)
                (value): Linear(in_features=2048, out_features=

# Define Function to run prediction 


In [2]:

def run_prediction(question, context_text):
    """Setup function to compute predictions"""
    examples = []

    for i, question_text in enumerate(question):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        )

        examples.append(example)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs).values()

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    output_prediction_file = "predictions.json"
    output_nbest_file = "nbest_predictions.json"
    output_null_log_odds_file = "null_predictions.json"

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )

    return predictions

# Reading Context 

In [3]:
with open('data2.txt', 'r', encoding='utf-8') as file:
        text = file.read()


# Preparing metrics

In [4]:
#model_evaluation.
import warnings
warnings.filterwarnings("ignore")

import nltk
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import single_meteor_score
from nltk.translate.meteor_score import single_meteor_score
from nltk.translate.bleu_score import sentence_bleu ,SmoothingFunction
from rouge_score import rouge_scorer
import pandas as pd
def calculate_rouge2_score(reference, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return scores['rouge2'].fmeasure

def calculate_bleu_score(reference, hypothesis):
    # Tokenize the reference and hypothesis translations
    reference_tokens = nltk.word_tokenize(reference.lower())
    hypothesis_tokens = nltk.word_tokenize(hypothesis.lower())

    # Calculate BLEU score using NLTK's corpus BLEU implementation
    # We use weights=(1, 0, 0, 0) for unigram precision (BLEU-1)
    bleu_score = nltk.translate.bleu_score.sentence_bleu([reference_tokens], hypothesis_tokens, weights=(1, 0, 0, 0))
    
    return bleu_score
def load_existing_results(file_path):
    try:
        existing_df = pd.read_csv(file_path)
        return existing_df
    except FileNotFoundError:
        return pd.DataFrame(columns=["Question", "True Answer", "Predicted Answer", "BLEU Score", "ROUGE-2 Score"])
def save_dataframe_to_csv(dataframe, file_path):
    dataframe.to_csv(file_path, index=False)   

# Runing Prediction

In [5]:
result_file_path = "output2.csv"  # Adjust the file path as needed
import warnings
warnings.filterwarnings("ignore")

existing_results_df = load_existing_results(result_file_path)
while True:
    # Get user input
    user_question = input("Enter your question (or type 'exit' to quit): ")
    
    if user_question.lower() == "exit":
        print("Exiting...")
        break
    
    true_answer = input("Enter the true answer or type 'skip': ")
    
    if true_answer.lower() == "skip":
        # Run prediction
        predictions = run_prediction([user_question], text)
        
        # Get predicted answer
        predicted_answer = list(predictions.values())[0]
        
        # Print only the predicted answer
        print("Predicted Answer:", predicted_answer)
    else:
        # Run prediction
        predictions = run_prediction([user_question], text)
        
        # Get predicted answer and calculate scores
        predicted_answer = list(predictions.values())[0]
        bleu_score = calculate_bleu_score(predicted_answer.capitalize(), true_answer.capitalize())
        rouge2_score = calculate_rouge2_score(predicted_answer.capitalize(), true_answer.capitalize())
        new_data = {
                "Question": user_question.capitalize(),
                "Predicted Answer": predicted_answer.capitalize(),
                "True Answer": true_answer.capitalize(),
                "BLEU Score": bleu_score,
                "ROUGE-2 Score": rouge2_score
            }
            
        existing_results_df = pd.concat([existing_results_df, pd.DataFrame([new_data])], ignore_index=True)
        save_dataframe_to_csv(existing_results_df, result_file_path)
            
        # Print results
        print("Predicted Answer:", predicted_answer)
        print("True Answer:", true_answer)
        print("BLEU Score:", bleu_score)
        print("ROUGE Score:", rouge2_score)



Enter your question (or type 'exit' to quit): exit
Exiting...


# ALBERT FINAL RESULT

In [6]:
df=pd.read_csv("output2.csv")  
df.head(100)

Unnamed: 0,Question,True Answer,Predicted Answer,BLEU Score,ROUGE-2 Score
0,What is etl,A data integration process that combines data ...,,0.000000,0.000000
1,What is sla,A contract between a service provider and its ...,,0.000000,0.000000
2,What does etl stand for,"Extract , transform , load","Extract, transform, and load,",0.670320,0.400000
3,What does sla stand for,Service-level agreement,Service-level agreement,1.000000,1.000000
4,A sla is a contract between who and who,A service provider and its customers,A service provider and its customers,1.000000,1.000000
...,...,...,...,...,...
85,How might the aggregation rule's implementatio...,It could change to be a business rule in advan...,It could change to be a business rule,0.727273,0.823529
86,Where will the unaccounted events dashboard be...,The unaccounted events dashboard will appear u...,In the second position under the accounting ma...,0.375000,0.516129
87,how can the user select multiple entities for...,Using the check-boxes displayed besides the ev...,Using the check-boxes displayed besides the ev...,0.882497,1.000000
88,Who define the accounting period duration,The airline,The accounting period duration is a parameter ...,0.002479,0.142857


# Randomly picking 10 rows from the output


In [7]:
import random
df = pd.read_csv('output2.csv')

# Randomly select five rows
num_samples = 10
random_indices = random.sample(range(len(df)), num_samples)
random_rows = df.iloc[random_indices]

random_rows.head(10)

Unnamed: 0,Question,True Answer,Predicted Answer,BLEU Score,ROUGE-2 Score
38,What is the purpose of manual task management ...,Manual task management methods allow either fo...,Allow either for operators to pick tasks from ...,0.878788,0.928571
19,"When the settlement is considered ""settled""",When it has been successfully matched to one o...,When it has been successfully matched to one o...,0.392857,0.555556
8,What is load test,Load test is the objective is to ensure that l...,Load test is the objective is to ensure that l...,0.96364,1.0
79,What is the primary objective of initiating th...,The primary objective of initiating the disput...,This step aims to facilitate a negotiation or ...,0.583333,0.666667
86,Where will the unaccounted events dashboard be...,The unaccounted events dashboard will appear u...,In the second position under the accounting ma...,0.375,0.516129
85,How might the aggregation rule's implementatio...,It could change to be a business rule in advan...,It could change to be a business rule,0.727273,0.823529
61,How is the settlement of each form of payment ...,Is settled by a specific acquirers,Settled by a specific acquirers,0.833333,0.888889
53,Where will the unaccounted events dashboard be...,Under the name unaccounted events and will be ...,Second position under the accounting main title,0.388889,0.521739
43,What are the components that make up the relat...,"The account, the account name, signing, amount...","The account, the account name, signing, amount...",1.0,1.0
30,What protocol is the file's transmission based...,Sftp,Sftp,1.0,1.0


# Checking duplicated rows

In [8]:
duplicated_rows = df[df.duplicated()]

# Display duplicated rows
print("Duplicated Rows:")
print(duplicated_rows)

Duplicated Rows:
Empty DataFrame
Columns: [Question, True Answer, Predicted Answer, BLEU Score, ROUGE-2 Score]
Index: []


# Creating dataframe for comparison


In [9]:
import pandas as pd

# List of model names and their respective CSV files with pre-calculated scores
model_files = {
    'ALBERT': 'output2.csv',
    'BERT': 'output1.csv',
    'T5': 'output3.csv'
}

# Initialize lists to store data
model_names = []
avg_bleu_scores = []
avg_rouge_scores = []

# Calculate average BLEU and ROUGE scores for each model
for model, file in model_files.items():
    df = pd.read_csv(file)
    
    avg_bleu = df['BLEU Score'].mean()
    avg_rouge = df['ROUGE-2 Score'].mean()
    
    model_names.append(model)
    avg_bleu_scores.append(avg_bleu)
    avg_rouge_scores.append(avg_rouge)

# Create a new DataFrame with the aggregated data
result_df = pd.DataFrame({
    'Model name': model_names,
    'AVG BLEU score': avg_bleu_scores,
    'AVG ROUGE-2 score': avg_rouge_scores
})

# Save the new dataset to a CSV file
result_df.to_csv('model_comparison.csv', index=False)


# Final final result

In [10]:
cmp=pd.read_csv('model_comparison.csv')
cmp.head()

Unnamed: 0,Model name,AVG BLEU score,AVG ROUGE-2 score
0,ALBERT,0.711513,0.777917
1,BERT,0.404006,0.373474
2,T5,0.505599,0.602931


# Final final result using tabulate form


In [12]:
from tabulate import tabulate
#importing the dataframe
cmp_final1=pd.read_csv('model_comparison.csv')
# Convert DataFrame to a formatted table with lines between columns
formatted_table1 = tabulate(cmp_final1, headers='keys', tablefmt='grid')

# Print the formatted table
print(formatted_table1)

+----+--------------+------------------+---------------------+
|    | Model name   |   AVG BLEU score |   AVG ROUGE-2 score |
|  0 | ALBERT       |         0.711513 |            0.777917 |
+----+--------------+------------------+---------------------+
|  1 | BERT         |         0.404006 |            0.373474 |
+----+--------------+------------------+---------------------+
|  2 | T5           |         0.505599 |            0.602931 |
+----+--------------+------------------+---------------------+
