# Baseline model

In [75]:
import nltk
from nltk.corpus import stopwords
import re

nltk.download('stopwords')  

# Hyperparameter set based on the answer lengths of the dataset

desired_answer_length = 700 


def remove_stopwords(passage):
    stop_words = set(stopwords.words('english'))
    words = passage.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    filtered_passage = " ".join(filtered_words)
    return filtered_passage


def baseline_answer(question, passage, desired_answer_length, remove_stop_words = True):
    
    question = re.sub('[?]', '', question.lower())
    question_words = set(question.split())
    sentences = re.split('[.,!,?]', passage.lower())
    relevant_sentences = []
    
    # Perform score matching between the question and passage
    
    for sentence in sentences:
        if remove_stop_words:
            filtered_question_words = set(remove_stopwords(question).split())
            sentence_score = sum(1 for word in remove_stopwords(sentence).split() if word in filtered_question_words)
        else:   
            sentence_score = sum(1 for word in sentence.split() if word in question_words)
        relevant_sentences.append((sentence_score, len(sentence.split()), sentence))

    relevant_sentences.sort(key = lambda x: x[0], reverse = True)
    cur_answer_length = 0
    ranked_sentences = []
    
    # Select the top sentences based on the number of words you want in the answer 
    
    for position, (_, length, sentence) in enumerate(relevant_sentences):
        if cur_answer_length > desired_answer_length:
            break
        ranked_sentences.append((position, sentence))
        cur_answer_length += length 

    ranked_sentences.sort(key = lambda x: x[0])
    
    # Rank the sentences based on the order in which they appear in the passage
    
    answer = [sentence[1] for sentence in ranked_sentences]
    answer = ". ".join(answer)

    return answer

question = "What is the main idea of the passage?"
passage = "This is a sample passage. It contains several sentences. The main idea of the passage is to explain the importance of NLP. the the the the the the the the the the the the."
answer = baseline_answer(question, passage, desired_answer_length)
print(answer)


    

 the main idea of the passage is to explain the importance of nlp. this is a sample passage.  it contains several sentences.  the the the the the the the the the the the the. 


[nltk_data] Downloading package stopwords to C:\Users\Panini
[nltk_data]     Bhamidipati\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Baseline Performance Evaluation

In [76]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [77]:
from pathlib import Path
from bs4 import BeautifulSoup
import requests
from bs4.element import Comment
import urllib.request

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def parse(url):
    body = urllib.request.urlopen(url)
    soup = BeautifulSoup(body, "html.parser")
    texts = soup.findAll(string=True)
    visible_texts = filter(tag_visible, texts)
    return "\p ".join(t.strip() for t in visible_texts if len(t.strip())>0)

In [78]:
import xml.etree.ElementTree as ET
import pandas as pd
import os
import warnings

warnings.filterwarnings('ignore')

xml_file = 'test_data.xml'

output_df = pd.DataFrame(columns=['Question', 'Answer', 'Context'])
qa_dict = {}
tree = ET.parse(xml_file)
root = tree.getroot()
for qa_pair in root.findall("QAPairs")[0]:
  qa_dict['Question'] = qa_pair.find('Question').text
  qa_dict['Answer'] = qa_pair.find('Answer').text
  url = root.attrib['url']
  qa_dict['Context'] = parse(url)
  output_df = output_df.append(qa_dict, ignore_index=True)

In [79]:
output_df

Unnamed: 0,Question,Answer,Context
0,What is (are) Adult Acute Lymphoblastic Leukem...,Key Points\n - Adult acute ...,Skip to main content\p An official website of ...
1,What are the symptoms of Adult Acute Lymphobla...,"Signs and symptoms of adult ALL include fever,...",Skip to main content\p An official website of ...
2,How to diagnose Adult Acute Lymphoblastic Leuk...,Tests that examine the blood and bone marrow a...,Skip to main content\p An official website of ...
3,What is the outlook for Adult Acute Lymphoblas...,Certain factors affect prognosis (chance of re...,Skip to main content\p An official website of ...
4,Who is at risk for Adult Acute Lymphoblastic L...,Previous chemotherapy and exposure to radiatio...,Skip to main content\p An official website of ...
5,What are the stages of Adult Acute Lymphoblast...,Key Points\n - Once adult A...,Skip to main content\p An official website of ...
6,What are the treatments for Adult Acute Lympho...,Key Points\n - There are di...,Skip to main content\p An official website of ...


In [82]:
output_df['Baseline_Answer'] = output_df.apply(lambda row: baseline_answer(row['Question'], row['Context'], desired_answer_length), axis=1)

In [83]:
output_df

Unnamed: 0,Question,Answer,Context,Baseline_Answer
0,What is (are) Adult Acute Lymphoblastic Leukem...,Key Points\n - Adult acute ...,Skip to main content\p An official website of ...,skip to main content\p an official website of ...
1,What are the symptoms of Adult Acute Lymphobla...,"Signs and symptoms of adult ALL include fever,...",Skip to main content\p An official website of ...,skip to main content\p an official website of ...
2,How to diagnose Adult Acute Lymphoblastic Leuk...,Tests that examine the blood and bone marrow a...,Skip to main content\p An official website of ...,skip to main content\p an official website of ...
3,What is the outlook for Adult Acute Lymphoblas...,Certain factors affect prognosis (chance of re...,Skip to main content\p An official website of ...,skip to main content\p an official website of ...
4,Who is at risk for Adult Acute Lymphoblastic L...,Previous chemotherapy and exposure to radiatio...,Skip to main content\p An official website of ...,skip to main content\p an official website of ...
5,What are the stages of Adult Acute Lymphoblast...,Key Points\n - Once adult A...,Skip to main content\p An official website of ...,skip to main content\p an official website of ...
6,What are the treatments for Adult Acute Lympho...,Key Points\n - There are di...,Skip to main content\p An official website of ...,skip to main content\p an official website of ...


In [90]:
length = []
for i in range(len(output_df['Answer'])):
    length.append(len(output_df['Answer'].iloc[i].split()))

In [91]:
length

[436, 127, 445, 66, 114, 468, 1828]

In [92]:
sum(length)/len(length)

497.7142857142857