In [1]:

import requests
import json
import re
import torch
from transformers import BertForQuestionAnswering, BertTokenizer
import os
from sklearn.metrics import f1_score
def get_longest_word(text):
    words = [word for word in text.split() if word.strip()]  # Filter out spaces and empty strings
    return max(words, key=len) if words else ""
def get_second_largest_word(text):
    words = [word for word in text.split() if word.strip()]  # Filter out spaces and empty strings
    sorted_words = sorted(words, key=len, reverse=True)
    return sorted_words[1] if len(sorted_words) >= 2 else ""
def exact_match_score(pred_answer, true_answer):
    pred_answer = pred_answer.lower()
    true_answer = true_answer.lower()

    score = 0.0

    # Check if the length of pred_answer equals the true_answer
    if len(pred_answer) == len(true_answer):
        score += 0.1

    # Check if the first and last letter of pred_answer equal the true_answer
    if pred_answer and pred_answer[0] == true_answer[0]:  #or pred_answer[-1] == true_answer[-1]:
        score += 0.3

    # Check if the longest word in pred_answer and true_answer are equal
    pred_longest_word = get_longest_word(pred_answer)
    true_longest_word = get_longest_word(true_answer)
    if pred_longest_word == true_longest_word:
        score += 0.3

    # Check if pred_answer exactly matches the true_answer
    if pred_answer == true_answer:
        score = 1.0

    return score

def remove_extra_chars(text):
    # Remove '##' characters
    text = text.replace(" ##", "").replace("##", "")
    text = re.sub(r'\s+', ' ', text).strip()  
    return text



def evaluate(question, text, true_answer, model_path=None):
    if model_path:
        os.chdir(model_path)

    model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

    input_ids = tokenizer.encode(question, text, add_special_tokens=True, truncation=True, max_length=512)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_idx + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

    start_logits, end_logits = outputs.start_logits, outputs.end_logits
    answer_start = torch.argmax(start_logits)
    answer_end = torch.argmax(end_logits)

    pred_answer = " ".join(tokens[answer_start:answer_end + 1])
    correct_answer=remove_extra_chars(pred_answer)
    em_score = exact_match_score(correct_answer, pred_answer)
    
    return pred_answer, em_score



# Example usage

# Rest of the code...
if __name__ == '__main__':
    question = 'what is ETL'
    with open('data.txt', 'r', encoding='utf-8') as file:
        text = file.read()
    true_answer ="Hot file , amex file"
    model_path = ''  # Adjust the model path if needed
    
    pred_answer, em_score = evaluate(question, text, true_answer, model_path)

    print("\nQuestion:\n{}".format(question.capitalize()))
    print("\nPredicted Answer:\n{}".format(pred_answer.capitalize()))
    print("\nCleaned Answer:\n{}".format(remove_extra_chars(pred_answer.capitalize())))
    print("\nTrue Answer:\n{}".format(true_answer.capitalize()))



Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Question:
What is etl

Predicted Answer:
A data integration process that combines data from multiple data sources into a single on , the consistent data store that is loaded into a data warehouse or other target system

Cleaned Answer:
A data integration process that combines data from multiple data sources into a single on , the consistent data store that is loaded into a data warehouse or other target system

True Answer:
Hot file , amex file


# Testing and Preparing metrics 

In [2]:
#model_evaluation
import nltk
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import single_meteor_score
from nltk.translate.meteor_score import single_meteor_score
from nltk.translate.bleu_score import sentence_bleu ,SmoothingFunction
from rouge_score import rouge_scorer

def get_longest_word(text):
    words = [word for word in text.split() if word.strip()]  # Filter out spaces and empty strings
    return max(words, key=len) if words else ""
def get_second_largest_word(text):
    words = [word for word in text.split() if word.strip()]  # Filter out spaces and empty strings
    sorted_words = sorted(words, key=len, reverse=True)
    return sorted_words[1] if len(sorted_words) >= 2 else ""
def exact_match_score(pred_answer, true_answer):
    pred_answer = pred_answer.lower()
    true_answer = true_answer.lower()

    score = 0.0

    # Check if the length of pred_answer equals the true_answer
    if len(pred_answer) == len(true_answer):
        score += 0.1

    # Check if the first and last letter of pred_answer equal the true_answer
    if pred_answer and pred_answer[0] == true_answer[0]  or pred_answer[-1] == true_answer[-1]:
        score += 0.1

    # Check if the longest word in pred_answer and true_answer are equal
    pred_longest_word = get_longest_word(pred_answer)
    true_longest_word = get_longest_word(true_answer)
    if pred_longest_word == true_longest_word:
        score += 0.25
    pred_longest_second_word = get_second_largest_word(pred_answer)
    true_longest_second_word = get_second_largest_word(true_answer)
    if pred_longest_second_word == true_longest_second_word:
        score += 0.25
    # Check if pred_answer exactly matches the true_answer
    if pred_answer == true_answer:
        score = 1.0

    return score

def calculate_bleu_score(reference, hypothesis):
    # Tokenize the reference and hypothesis translations
    reference_tokens = nltk.word_tokenize(reference.lower())
    hypothesis_tokens = nltk.word_tokenize(hypothesis.lower())

    # Calculate BLEU score using NLTK's corpus BLEU implementation
    # We use weights=(1, 0, 0, 0) for unigram precision (BLEU-1)
    bleu_score = nltk.translate.bleu_score.sentence_bleu([reference_tokens], hypothesis_tokens, weights=(1, 0, 0, 0))
    
    return bleu_score






def compute_f1(predicted_answer, true_answer):
    predicted_tokens = set(predicted_answer.lower().split())
    true_tokens = set(true_answer.lower().split())
    common_tokens = predicted_tokens.intersection(true_tokens)
    precision = len(common_tokens) / (len(predicted_tokens) + 1e-8)
    recall = len(common_tokens) / (len(true_tokens) + 1e-8)
    f1 = 2 * (precision * recall) / (precision + recall + 1e-8)
    return f1






# Example usage

# Rest of the code...
if __name__ == '__main__':
    question = 'what is  example of hot files'
    with open('data.txt', 'r', encoding='utf-8') as file:
        text = file.read()
    true_answer ="Hot file , amex file"
    model_path = ''  # Adjust the model path if needed
    
    pred_answer, em_score = evaluate(question, text, true_answer, model_path)

    print("\nQuestion:\n{}".format(question.capitalize()))
    print("\nPredicted Answer:\n{}".format(pred_answer.capitalize()))
    print("\nCleaned Answer:\n{}".format(remove_extra_chars(pred_answer.capitalize())))
    print("\nTrue Answer:\n{}".format(true_answer.capitalize()))
    

def calculate_rouge2_score(reference, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return scores['rouge2'].fmeasure








em_score = exact_match_score(remove_extra_chars(pred_answer.capitalize()), true_answer.capitalize())
f1_score = compute_f1(remove_extra_chars(pred_answer.capitalize()), true_answer.capitalize())
bleu_score = calculate_bleu_score(remove_extra_chars(pred_answer.capitalize()), true_answer.capitalize())
rouge2_score = calculate_rouge2_score(remove_extra_chars(pred_answer.capitalize()), true_answer.capitalize())



print(f"Predicted answer: {remove_extra_chars(pred_answer.capitalize())}")
print(f"True answer: {true_answer.capitalize()}")
print(f"Exact Match (EM) score: {em_score}")
print(f"F1 score: {f1_score}")
print(f"BLEU score: {bleu_score}")
print(f"ROUGE-2 score(F1_score): {rouge2_score}")
print("\n")


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Question:
What is  example of hot files

Predicted Answer:
Hot file , am ##ex file

Cleaned Answer:
Hot file , amex file

True Answer:
Hot file , amex file
Predicted answer: Hot file , amex file
True answer: Hot file , amex file
Exact Match (EM) score: 1.0
F1 score: 0.9999999925
BLEU score: 1.0
ROUGE-2 score(F1_score): 1.0




# BERT BASED BOT

In [3]:
# Rest of the code...
import pandas as pd
from transformers import pipeline, logging
import transformers
def get_longest_word(text):
    words = [word for word in text.split() if word.strip()]  # Filter out spaces and empty strings
    return max(words, key=len) if words else ""
def get_second_largest_word(text):
    words = [word for word in text.split() if word.strip()]  # Filter out spaces and empty strings
    sorted_words = sorted(words, key=len, reverse=True)
    return sorted_words[1] if len(sorted_words) >= 2 else ""
def exact_match_score(pred_answer, true_answer):
    pred_answer = pred_answer.lower()
    true_answer = true_answer.lower()

    score = 0.0

    # Check if the length of pred_answer equals the true_answer
    if len(pred_answer) == len(true_answer):
        score += 0.1

    # Check if the first and last letter of pred_answer equal the true_answer
    #if pred_answer and pred_answer[0] == true_answer[0]:  #or pred_answer[-1] == true_answer[-1]:
        #score += 0.3

    # Check if the longest word in pred_answer and true_answer are equal
    pred_longest_word = get_longest_word(pred_answer)
    true_longest_word = get_longest_word(true_answer)
    if pred_longest_word == true_longest_word:
        score += 0.3

    # Check if pred_answer exactly matches the true_answer
    if pred_answer == true_answer:
        score = 1.0

    return score
def save_dataframe_to_csv(dataframe, file_path):
    dataframe.to_csv(file_path, index=False)
def remove_extra_chars(text):
    # Remove '##' characters
    text = text.replace(" ##", "").replace("##", "")
    text = re.sub(r'\s+', ' ', text).strip()  
    return text
def evaluate(question, text, true_answer, model_path=None):
    if model_path:
        os.chdir(model_path)

    model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

    input_ids = tokenizer.encode(question, text, add_special_tokens=True, truncation=True, max_length=512)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_idx + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

    start_logits, end_logits = outputs['start_logits'], outputs['end_logits']
    answer_start = torch.argmax(start_logits)
    answer_end = torch.argmax(end_logits)

    # Get the predicted answer from the tokens
    pred_answer = tokenizer.decode(input_ids[answer_start:answer_end + 1], skip_special_tokens=True)

    # Clean the answer and the true answer
    pred_answer = remove_extra_chars(pred_answer)
    correct_answer=remove_extra_chars(pred_answer)

    # Calculate the exact match score
    em_score = exact_match_score(correct_answer, pred_answer)

    return pred_answer, em_score
def evaluate(question, text, true_answer, model_path=None):
    if model_path:
        os.chdir(model_path)

    model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

    input_ids = tokenizer.encode(question, text, add_special_tokens=True, truncation=True, max_length=512)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_idx + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

    start_logits, end_logits = outputs['start_logits'], outputs['end_logits']
    answer_start = torch.argmax(start_logits)
    answer_end = torch.argmax(end_logits)

    # Get the predicted answer from the tokens
    pred_answer = tokenizer.decode(input_ids[answer_start:answer_end + 1], skip_special_tokens=True)

    # Clean the answer and the true answer
    pred_answer = remove_extra_chars(pred_answer)
    correct_answer=remove_extra_chars(pred_answer)

    # Calculate the exact match score
    em_score = exact_match_score(correct_answer, pred_answer)

    return pred_answer, em_score
def load_existing_results(file_path):
    try:
        existing_df = pd.read_csv(file_path)
        return existing_df
    except FileNotFoundError:
        return pd.DataFrame(columns=["Question", "True Answer", "Predicted Answer", "BLEU Score", "ROUGE-2 Score"])

def save_dataframe_to_csv(dataframe, file_path):
    dataframe.to_csv(file_path, index=False)

if __name__ == '__main__':
    model_path = ''  # Adjust the model path if needed
    result_file_path = "output1.csv"  # Adjust the file path as needed
    with open('data2.txt', 'r', encoding='utf-8') as file:
        contexts = file.read()
    
    while True:
        question = input("Enter your question (type 'exit' to end): ")

        if question.lower() == 'exit':
            break

        true_answer = input("Enter the true answer for evaluation (type 'skip' to skip evaluation): ")
        if true_answer.lower() == 'skip':
            true_answer = ''
        pred_answer, em_score = evaluate(question, contexts, true_answer, model_path)
        print("\nQuestion:\n{}".format(question.capitalize()))
        print("\nPredicted Answer:\n{}".format(pred_answer.capitalize()))

        if true_answer:
            bleu_score = calculate_bleu_score(remove_extra_chars(pred_answer.capitalize()), true_answer.capitalize())
            rouge2_score = calculate_rouge2_score(remove_extra_chars(pred_answer.capitalize()), true_answer.capitalize())
            print("\nTrue Answer:\n{}".format(true_answer.capitalize()))
            print(f"BLEU score: {bleu_score}")
            print(f"ROUGE-2 score(F1_score): {rouge2_score}")
            
            new_data = {
                "Question": question.capitalize(),
                "Predicted Answer": pred_answer.capitalize(),
                "True Answer": true_answer.capitalize(),
                "BLEU Score": bleu_score,
                "ROUGE-2 Score": rouge2_score
            }
            existing_results_df = load_existing_results(result_file_path)
            existing_results_df = pd.concat([existing_results_df, pd.DataFrame([new_data])], ignore_index=True)
            save_dataframe_to_csv(existing_results_df, result_file_path)
            
            print("Results updated and saved.")


Enter your question (type 'exit' to end): how the categorized disputes are uploaded into the database ?
Enter the true answer for evaluation (type 'skip' to skip evaluation): the categorized disputes are uploaded into the database at specified intervals


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evalua


Question:
How the categorized disputes are uploaded into the database ?

Predicted Answer:
At specified intervals

True Answer:
The categorized disputes are uploaded into the database at specified intervals
BLEU score: 0.2727272727272727
ROUGE-2 score(F1_score): 0.33333333333333337
Results updated and saved.
Enter your question (type 'exit' to end): exit


# BERT FINAL RESULT 

In [4]:
df=pd.read_csv("output1.csv") 
df.head(100)

Unnamed: 0,Question,Predicted Answer,True Answer,BLEU Score,ROUGE-2 Score
0,What is etl,A data integration process that combines data ...,"Extract , transform , load",0.000308,0.0
1,What does etl stand for,"Extract, transform, and load","Extract , transform , load",0.57893,0.4
2,What is sla,Service - level agreement ( sla ) is a contrac...,S a contract between a service provider and it...,0.76406,0.760563
3,How the categorized disputes are uploaded into...,At specified intervals,The categorized disputes are uploaded into the...,0.272727,0.333333


# Randomly picking 10 rows from the output

In [5]:
import random
df = pd.read_csv('output3.csv')

# Randomly select five rows
num_samples = 10
random_indices = random.sample(range(len(df)), num_samples)
random_rows = df.iloc[random_indices]

random_rows.head(10)


Unnamed: 0,Question,True Answer,Predicted Answer,BLEU Score,ROUGE-2 Score
21,"When do the status ""pending"" apply",When no settlement has been linked to a sale,When no settlement has been linked to a sale,1.0,1.0
13,What is unit test in a clean architecture,"In clean architecture, a unit test is a type o...",A unit test is a type of automated test that i...,0.615385,0.810811
22,What happened when when the capture status of ...,"The ""n/a"" (not applicable) status is assigned","""n/a"" status is assigned",0.545455,0.545455
9,What format should the file name respect,Ax_merchant_number_epa_amex_$utc_date$_utc_time,Ax_merchant_number_epa_amex_$utc,0.4,0.769231
11,What is scaling plan,A scaling plan is a set of rules and costs ass...,A scaling plan is a set of rules and costs ass...,0.319149,0.491228
14,Example of naming convention,Salesreport_{airlinecode}{salesreportid}{recep...,Salesreport_airlinecodesalesreportidreceptiondate,0.0,0.0
7,What is load test,Load test is the objective is to ensure that l...,Load test is the objective is to ensure that l...,0.481481,0.631579
1,What is sla,Is a contract between a service provider and i...,Service-level agreement (sla) is a contract be...,0.3,0.4
23,What does soc stand for,Summary of charge,Summary of charge,1.0,1.0
18,What is the purpose of matching rules module,The matching rules module enables the user to ...,Matching rules module enables the user to chan...,0.888889,0.967742
