In [None]:
import json
from urllib.parse import quote, quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from datasets import load_dataset

#### Dataset Loading and Cleaning

In [None]:
def clean_text(text):
    """Normalise a string, or every item of a list, to a canonical form.

    A string is lower-cased and stripped, newlines become a single space and
    carriage returns are removed. A list is cleaned item by item (nested
    lists included) and returned as a new list.

    Args:
        text: A string, a list, or any other value.

    Returns:
        The cleaned string, a new list of cleaned items, or ``None`` when
        ``text`` is neither a string nor a list. Inside a list, items that
        are neither strings nor lists are likewise mapped to ``None``.
    """
    if isinstance(text, str):
        lowered = text.lower().strip()
        return lowered.replace('\n', ' ').replace('\r', '').strip()
    if isinstance(text, list):
        # Delegate to the scalar branch so both paths share one rule set;
        # a non-string item (e.g. None) yields None instead of raising
        # AttributeError on ``.lower()``.
        return [clean_text(item) for item in text]
    return None

def load_truthfulQA():
    """Load the TruthfulQA "generation" validation split as a cleaned DataFrame.

    The question, source, best_answer, correct_answers and incorrect_answers
    columns are each passed through ``clean_text``.

    Returns:
        pandas.DataFrame built from the ``validation`` split.
    """
    validation_split = load_dataset("truthfulqa/truthful_qa", "generation")['validation']
    frame = pd.DataFrame(validation_split)
    text_columns = ('question', 'source', 'best_answer', 'correct_answers', 'incorrect_answers')
    for column in text_columns:
        frame[column] = frame[column].apply(clean_text)
    return frame

def load_HaluEval():
    """Load the HaluEval "qa" data split as a cleaned DataFrame.

    The knowledge, question, answer and hallucination columns are each
    passed through ``clean_text``.

    Returns:
        pandas.DataFrame built from the ``data`` split.
    """
    qa_split = load_dataset("pminervini/HaluEval", "qa")['data']
    frame = pd.DataFrame(qa_split)
    for column in ('knowledge', 'question', 'answer', 'hallucination'):
        frame[column] = frame[column].apply(clean_text)
    return frame

def load_fever():
    """Load the FEVER v1.0 train split as a DataFrame with cleaned text columns.

    The claim, evidence_wiki_url and label columns are passed through
    ``clean_text``. All other columns (id, evidence_id,
    evidence_sentence_id, evidence_annotation_id, ...) are returned exactly
    as loaded.

    Returns:
        pandas.DataFrame built from the ``train`` split.
    """
    dataset = load_dataset("fever", "v1.0")
    df = pd.DataFrame(dataset['train'])
    # Only free-text columns need normalising. The former self-assignments
    # of the id columns were no-ops and the debug print of the column names
    # has been dropped.
    # NOTE(review): clean_text returns None for non-string values, so this
    # assumes `label` is stored as text - confirm against the dataset schema.
    for column in ('claim', 'evidence_wiki_url', 'label'):
        df[column] = df[column].apply(clean_text)
    return df

#### Wiki search for article links/titles

In [None]:
def get_wikipedia_search(url, no_of_links=5, timeout=30):
    """Scrape article links and titles from a Wikipedia search-results page.

    Args:
        url: Full URL of the search-results page to fetch.
        no_of_links: Maximum number of results to return.
        timeout: Seconds to wait for the HTTP response; without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        A list with at most ``no_of_links`` dicts of the form
        ``{'link': <absolute URL>, 'title': <article title>}``, or the
        string ``"error"`` when fetching or parsing fails. Callers must
        check for that sentinel before iterating the result.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        search_results = []
        for div in soup.find_all('div', class_='mw-search-result-heading'):
            a_tag = div.find('a')
            if not a_tag:
                continue

            # str.replace returns a new string, so the result must be kept.
            link = a_tag.get('href', '').replace(' ', '_')
            if link.startswith('/'):
                # Site-relative href -> absolute English-Wikipedia URL.
                link = f"https://en.wikipedia.org{link}"

            search_results.append({
                'link': link,
                'title': a_tag.get('title', '')
            })

        return search_results[:no_of_links]

    except Exception as e:
        # `Exception` rather than a bare except: a kernel interrupt
        # (KeyboardInterrupt) must still be able to stop a long scrape.
        print(f"Wikipedia search failed for {url}: {e}")
        return "error"

#### Extract Wikipedia content from a URL

In [None]:
def _append_element_text(element, content):
    """Append the text of one <div>/<p>/<ul> tag to ``content``; ignore other tags."""
    if element.name == 'div' and element.get('class'):
        classes = element.get('class')
        if 'mw-heading2' in classes:
            content.append(f"\n## {element.get_text()}\n")
        elif 'mw-heading3' in classes:
            content.append(f"\n### {element.get_text()}\n")
        else:
            text = element.get_text()
            if text:
                content.append(text + "\n")
    elif element.name == 'p':
        text = element.get_text()
        if text:
            content.append(text + "\n")
    elif element.name == 'ul':
        # List items are numbered from 1 within each <ul>.
        for i, li in enumerate(element.find_all('li'), 1):
            text = li.get_text()
            if text:
                content.append(f"{i}. {text}\n")


def _is_break_heading(element, break_tags):
    """Return True for a level-2/3 heading div whose text starts with a break tag."""
    if element.name != 'div':
        return False
    classes = element.get('class')
    if not classes:
        return False
    if 'mw-heading2' not in classes and 'mw-heading3' not in classes:
        return False
    heading_text = element.get_text()
    return any(heading_text.startswith(break_tag) for break_tag in break_tags)


def _format_wikipedia_lines(content, title):
    """Join the collected fragments into the final text under a '# title' header."""
    text = ''.join(content).replace('[edit]', '')
    final_results = []
    seen = set()  # O(1) membership test instead of scanning the result list
    for line in text.split('\n'):
        if line.strip() == "":
            continue

        # Re-space heading lines so they stand out in the joined output.
        if line.startswith('###'):
            line = f"\n{line}"
        elif line.startswith('##'):
            line = f"\n{line}\n"
        elif line.startswith('#'):
            line = f"{line}\n\n"

        # Keep only the first occurrence of every (formatted) line.
        if line not in seen:
            seen.add(line)
            final_results.append(line)

    return "# " + title + "\n\n" + '\n'.join(final_results)


def extract_wikipedia_content(title, url, timeout=30):
    """Download a Wikipedia article and return its text in a markdown-like layout.

    The lead section (everything before the table-of-contents marker) is read
    from the direct children of the content div. The body is read by walking
    the elements after the marker until a heading starting with one of
    References / Sources / External links / See also / Further reading /
    Notes is reached. Headings become ``##`` / ``###`` lines, paragraphs and
    other classed divs become plain lines, and ``<ul>`` items become numbered
    lines. ``[edit]`` markers, blank lines and repeated lines are removed.

    Args:
        title: Article title; used for the ``# title`` header and in messages.
        url: URL of the article page.
        timeout: Seconds to wait for the HTTP response; without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        The extracted text, or ``None`` if the request fails or the content
        div is not found (a message is printed in both cases).
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        content_div = soup.find('div', class_='mw-content-ltr mw-parser-output')
        if not content_div:
            print(f"{title}: Content div not found!")
            return None

        content = []
        break_tags = ['References', 'Sources', 'External links', 'See also', 'Further reading', 'Notes']

        # Lead section: direct children up to the TOC meta tag.
        for element in content_div.children:
            if not hasattr(element, 'name'):
                continue
            if element.name == 'meta' and element.get('property') == 'mw:PageProp/toc':
                break
            _append_element_text(element, content)

        # Body: everything that follows the TOC meta tag.
        # NOTE(review): find_next() is not limited to siblings, so tags nested
        # in an already-processed div are presumably visited again; the
        # de-duplication in _format_wikipedia_lines drops the repeated lines.
        toc_meta = content_div.find('meta', property='mw:PageProp/toc')
        if toc_meta:
            current_element = toc_meta.find_next()
            while current_element:
                if hasattr(current_element, 'name'):
                    if _is_break_heading(current_element, break_tags):
                        break
                    _append_element_text(current_element, content)
                current_element = current_element.find_next()

        return _format_wikipedia_lines(content, title)

    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None

#### Save and load JSON files

In [None]:
def save_dict_to_json(dictionary, filename):
    """Write a JSON-serialisable object to ``filename`` as indented UTF-8 JSON.

    Errors are reported with a printed message instead of being raised.

    Args:
        dictionary: Object to serialise (non-ASCII characters are written
            verbatim, not escaped).
        filename: Path of the file to create or overwrite.
    """
    try:
        # Serialise before opening the file: opening with 'w' truncates it,
        # so a non-serialisable object would otherwise destroy an existing
        # file and leave partial JSON behind.
        payload = json.dumps(dictionary, indent=4, ensure_ascii=False)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(payload)
        print(f"Dictionary saved to {filename}")
    except Exception as e:
        print(f"Error saving dictionary: {e}")

def load_dict_from_json(filename):
    """Read a UTF-8 JSON file and return the decoded object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or ``None`` if reading or decoding fails
        (the error is printed).
    """
    try:
        with open(filename, mode='r', encoding='utf-8') as source:
            loaded = json.load(source)
    except Exception as err:
        print(f"Error loading dictionary: {err}")
        return None
    return loaded

## Truthful QA

Load dataset

In [None]:
# The domain of each question was extracted with an LLM beforehand and saved,
# together with the full dataset, in this CSV file.
df = pd.read_csv('truthfulQA_domain.csv')

# Every domain label in row order.
domains = df['domain'].tolist()

# Unique domains in order of first appearance. Compared with list(set(...))
# this is reproducible across kernel restarts, and missing values are dropped
# so that later string operations on each domain cannot fail on NaN.
unique_domains = df['domain'].dropna().unique().tolist()

Get Wikipedia articles for each domain

In [None]:
# Map every domain to the top Wikipedia search results for that domain.
domain_pages = {}
for dm in unique_domains:
    if not isinstance(dm, str):
        # A missing domain (NaN) has no search terms.
        continue
    # quote_plus joins the words with '+' and percent-encodes characters such
    # as '&', '#' or '?' that would otherwise corrupt the query string.
    query = quote_plus(' '.join(dm.split()))
    url = f"https://en.wikipedia.org/w/index.php?search={query}&title=Special%3ASearch&profile=advanced&fulltext=1&ns0=1"
    domain_pages[dm] = get_wikipedia_search(url)

Extract Wikipedia article content for each domain

In [None]:
# Download the article text for every search hit and store it under 'doc'.
for i, page in enumerate(domain_pages, 1):
    articles = domain_pages[page]
    if not isinstance(articles, list):
        # get_wikipedia_search returns the string "error" on failure;
        # iterating that string would yield single characters and crash.
        print(f'Skipping {page}: search failed')
        continue
    for j, article in enumerate(articles, 1):
        print(f'Processing: {i}-{j}')
        article['doc'] = extract_wikipedia_content(article['title'], article['link'])
        if article['doc'] is None:
            print(article['title'], ": Error")

Save documents to JSON

In [None]:
# Persist the per-domain search results, including each article's 'doc' text.
save_dict_to_json(dictionary=domain_pages, filename='truthfulqa_domain_docs.json')

## HaluEval

Load dataset

In [None]:
df = load_HaluEval()

# Collect the knowledge passage of every row that has one, keyed by the row
# index. Checking the type (instead of comparing str(value) with "nan") also
# skips None, which would otherwise be stored as the text "none".
halueval_knowledge = [
    {'id': index, 'knowledge': knowledge.lower()}
    for index, knowledge in df['knowledge'].items()
    if isinstance(knowledge, str)
]

Save documents to JSON

In [None]:
# Persist the list of {'id', 'knowledge'} records.
save_dict_to_json(dictionary=halueval_knowledge, filename='halueval_knowledge.json')

## Fever

Load dataset

In [None]:
df = load_fever()

# Unique, non-empty evidence Wikipedia URLs in order of first appearance.
# dropna() removes missing values before the string conversion, so None/NaN
# can no longer slip through as the text 'None'. The vectorised form also
# avoids a Python-level loop over every row of the train split.
evidence_urls = df['evidence_wiki_url'].dropna().astype(str).str.strip()
unique_wiki = evidence_urls[(evidence_urls != '') & (evidence_urls != 'nan')].unique().tolist()

In [None]:
# Percent-encodings for characters that are reserved or unsafe in a URL query.
# '%' must stay the FIRST key: when the replacements are applied one after
# another in dict order, encoding '%' after any other character would
# re-encode the '%' of escapes already inserted (' ' -> '%20' -> '%2520').
special_chars_encoding = {
    '%': '%25',
    ' ': '%20',
    '!': '%21',
    '"': '%22',
    '#': '%23',
    '$': '%24',
    '&': '%26',
    "'": '%27',
    '(': '%28',
    ')': '%29',
    '*': '%2A',
    '+': '%2B',
    ',': '%2C',
    '/': '%2F',
    ':': '%3A',
    ';': '%3B',
    '<': '%3C',
    '=': '%3D',
    '>': '%3E',
    '?': '%3F',
    '@': '%40',
    '[': '%5B',
    '\\': '%5C',
    ']': '%5D',
    '^': '%5E',
    '`': '%60',
    '{': '%7B',
    '|': '%7C',
    '}': '%7D',
    '~': '%7E'
}

Get the Wikipedia article for each evidence URL

In [None]:
# Map every evidence page name to its top Wikipedia search hit.
fever_wiki_pages = {}
for article in unique_wiki:
    # quote() escapes each character exactly once. Replacing characters one
    # at a time from a table double-encodes as soon as '%' is handled after
    # another character (' ' -> '%20' -> '%2520'). safe='' also escapes '/'.
    article_new = quote(article, safe='')
    url = f"https://en.wikipedia.org/w/index.php?search={article_new}&title=Special%3ASearch&profile=advanced&fulltext=1&ns0=1"
    fever_wiki_pages[article] = get_wikipedia_search(url, 1)

Extract Wikipedia article content for each evidence URL

In [None]:
# Download the article text for every search hit and store it under 'doc'.
for i, page in enumerate(fever_wiki_pages, 1):
    articles = fever_wiki_pages[page]
    if not isinstance(articles, list):
        # get_wikipedia_search returns the string "error" on failure;
        # iterating that string would yield single characters and crash.
        print(f'Skipping {page}: search failed')
        continue
    for j, article in enumerate(articles, 1):
        print(f'Processing: {i}-{j}')
        article['doc'] = extract_wikipedia_content(article['title'], article['link'])
        if article['doc'] is None:
            print(article['title'], ": Error")

Save documents to JSON

In [None]:
# Persist the per-page search results, including each article's 'doc' text.
save_dict_to_json(dictionary=fever_wiki_pages, filename='fever_wiki_pages.json')