# Load packages

In [1]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# VA datasets

# VA.gov pages to scrape, in processing order. Every address appears exactly
# once so that no page is downloaded or turned into dataset rows twice.
urls = [
    "https://www.va.gov/disability/after-you-file-claim/",
    "https://www.va.gov/change-address/",
    "https://www.va.gov/change-direct-deposit/",
    "https://www.va.gov/claim-or-appeal-status/",
    "https://www.va.gov/decision-reviews/",
    "https://www.va.gov/disability/dependency-indemnity-compensation/",
    "https://www.va.gov/disability/eligibility/hazardous-materials-exposure/",
    "https://www.va.gov/disability/eligibility/illnesses-within-one-year-of-discharge/",
    "https://www.va.gov/disability/eligibility/ptsd/",
    "https://www.va.gov/disability/eligibility/special-claims/",
    "https://www.va.gov/disability/eligibility/",
    "https://www.va.gov/disability/file-disability-claim-form-21-526ez/",
    "https://www.va.gov/disability/how-to-file-claim/additional-forms/",
    "https://www.va.gov/disability/how-to-file-claim/evidence-needed/fully-developed-claims/",
    "https://www.va.gov/disability/how-to-file-claim/when-to-file/",
    "https://www.va.gov/disability/how-to-file-claim/evidence-needed/",
    "https://www.va.gov/disability/how-to-file-claim/",
    "https://www.va.gov/disability/upload-supporting-evidence/",
    "https://www.va.gov/va-payment-history/",
    "https://www.va.gov/disability/view-disability-rating/",
]

In [None]:
# Save the VA websites as HTML
def save_url_to_html(url, output_dir='./dataset/va_html', timeout=30):
    """Download one VA page and save its raw HTML under ``output_dir``.

    The file name is the URL path after ``www.va.gov/`` with slashes and
    hyphens turned into underscores, plus the ``.html`` suffix.

    Args:
        url: Page address; it must contain ``www.va.gov/``.
        output_dir: Directory that receives the HTML file. It is created,
            including missing parents, when it does not exist yet.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Success and failure are reported with ``print``.
    """
    try:
        # Without a timeout a single unresponsive page would hang the run.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report and move on, like the non-200 branch below, so one bad page
        # does not abort the download of the remaining ones.
        print(f"Failed to retrieve {url}. Error: {exc}")
        return

    if response.status_code != 200:
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        return

    # Extract filename
    filename = url.split("www.va.gov/")[1].strip('/').replace('/', '_').replace('-', '_') + '.html'
    filepath = f"{output_dir}/{filename}"

    # open() does not create directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    # Save the HTML
    with open(filepath, 'w', encoding='utf-8') as file:
        file.write(response.text)

    print(f"Saved {url} to {filepath}")


# Download every page listed in `urls`; each call writes one HTML file or
# prints a failure message for that page.
for url in urls:
    save_url_to_html(url)

### Analyse and clean the data

In [None]:


# Accumulates one dict per extracted page intro or section; it is turned into
# a DataFrame once the URL loop below has finished.
data = []

# Tags whose text extract_text collects.
_TEXT_TAGS = frozenset({'p', 'li', 'strong', 'span', 'h3', 'va-button'})


def _inside_collected_tag(node, root):
    """Return True when an ancestor of ``node`` below ``root`` is a collected tag."""
    ancestor = node.parent
    while ancestor is not None and ancestor is not root:
        if ancestor.name in _TEXT_TAGS:
            return True
        ancestor = ancestor.parent
    return False


# extract text content
def extract_text(element):
    """Return the text of the collected tags found inside ``element``.

    Walks ``element.descendants`` and gathers the text of every tag named in
    ``_TEXT_TAGS``. A matching tag nested inside another matching tag is
    skipped, because the outer tag's text already includes it; collecting both
    would repeat the inner text. Strings inside one tag are joined with a
    single space, the same way the page intro is built from
    ``stripped_strings``, so adjacent words are not glued together.

    Args:
        element: A parsed node exposing ``descendants``; each descendant is
            expected to expose ``name``, ``parent`` and ``get_text``.

    Returns:
        The collected pieces joined by single spaces, or ``''`` when nothing
        matched.
    """
    pieces = []
    for child in element.descendants:
        if child.name not in _TEXT_TAGS:
            continue
        if _inside_collected_tag(child, element):
            continue
        piece = child.get_text(separator=' ', strip=True)
        if piece:  # tags without text would only add stray spaces
            pieces.append(piece)
    return ' '.join(pieces).strip()

# clean context from URL
def extract_context_from_url(url):
    """Turn the path of a VA.gov URL into a space-separated phrase.

    Everything after ``www.va.gov/`` is taken, surrounding slashes are
    dropped, and the remaining slashes and hyphens become spaces.
    """
    path = url.split("www.va.gov/")[1]
    trimmed = path.strip('/')
    return trimmed.replace('/', ' ').replace('-', ' ')

# Process each URL: fetch the page, locate the main article, and append one
# row for the page intro plus one row per recognised section to `data`.
for url in urls:
    # A timeout keeps one unresponsive page from hanging the whole run.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve {url}. Error: {exc}")
        continue

    if response.status_code != 200:
        # Report skipped pages instead of dropping them silently.
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')
    article = soup.select_one('article.usa-content.vads-u-padding-bottom--0')
    if article is None:
        print(f"No article content found for {url}")
        continue

    # Extract the title and intro; a page without <h1> gets an empty title
    # instead of raising AttributeError on None.
    heading = article.find('h1')
    page_title = heading.text.strip() if heading is not None else ''
    intro_div = article.find('div', class_='va-introtext')
    intro_text = ' '.join(intro_div.stripped_strings) if intro_div else ''

    url_context = extract_context_from_url(url)

    data.append({
        'source_url': url,
        'title': f"{page_title}. {url_context}",
        'content': intro_text,
        'index': len(data)
    })

    # Sections are the direct <div> children of the article; the value of
    # their data-template attribute selects how they are read.
    index = len(data)
    for section in article.find_all('div', recursive=False):
        template = section.get('data-template')

        if template == 'paragraphs/q_a_section':
            for q_a in section.find_all('div', {'data-template': 'paragraphs/q_a'}):
                question = q_a.get('data-analytics-faq-text', '').strip()

                # The answer is read from the first div in the section that
                # carries the same data-entity-id as the question block.
                entity_id = q_a.get('data-entity-id')
                entity_div = section.find('div', {'data-entity-id': entity_id}) if entity_id else None
                answer = extract_text(entity_div) if entity_div is not None else ''

                data.append({
                    'source_url': url,
                    'title': f"{question}. {url_context}",
                    'content': answer,
                    'index': index
                })
                index += 1

        elif template == 'paragraphs/wysiwyg':
            section_heading = section.find('h3')
            wysiwyg_section_title = section_heading.text.strip() if section_heading is not None else ''
            data.append({
                'source_url': url,
                'title': f"{wysiwyg_section_title}. {url_context}",
                'content': extract_text(section),
                'index': index
            })
            index += 1

        elif template == 'paragraphs/process':
            section_heading = section.find('h3')
            process_section_title = section_heading.text.strip() if section_heading is not None else ''

            # Each list item becomes "<step title>: <step text> ".
            process_content = ''
            for step in section.find_all('va-process-list-item'):
                step_heading = step.find('h3')
                step_title = step_heading.text.strip() if step_heading is not None else ''
                step_content = extract_text(step)
                process_content += f"{step_title}: {step_content} "

            data.append({
                'source_url': url,
                'title': f"{process_section_title}. {url_context}",
                'content': process_content.strip(),
                'index': index
            })
            index += 1


# Build the content table in one chain: drop rows whose content is missing or
# blank, then renumber the rows from zero.
df = (
    pd.DataFrame(data, columns=['source_url', 'title', 'content', 'index'])
    .dropna(subset=['content'])
    .loc[lambda frame: frame['content'].str.strip() != '']
    .reset_index(drop=True)
)

# The 'index' column mirrors the renumbered row position.
df['index'] = df.index

df.to_csv("./dataset/va_content.csv", index=False)


# Generate questions

In [3]:
import pandas as pd
import json
import openai
import time
import os 
import threading


In [26]:
class Cache:
    """JSON-file-backed memoisation of a loader called with keyword arguments.

    Results are kept in a dict keyed by the JSON encoding of the keyword
    arguments and written to ``persist_path`` after every new entry.

    Args:
        persist_path: Path of the JSON cache file; it is read on construction
            when it exists.
        cache_loading_fn: Callable invoked as ``cache_loading_fn(**kwargs)``
            on a cache miss. Its result must be JSON-serialisable.
    """

    def __init__(self, persist_path, cache_loading_fn):
        self._cache = self._get_or_create_cache_dict(persist_path)
        self._persist_path = persist_path
        self._cache_loading_fn = cache_loading_fn
        # Guards both the in-memory dict and the file rewrite.
        self._cache_lock = threading.Lock()

    @classmethod
    def _get_or_create_cache_dict(cls, persist_path):
        """Load the cache dict from ``persist_path`` or start with an empty one."""
        if os.path.exists(persist_path):
            with open(persist_path) as f:
                cache = json.load(f)
        else:
            cache = {}
        return cache

    @staticmethod
    def _make_keys(kwargs):
        """Return ``(canonical_key, legacy_key)`` for a set of keyword arguments.

        The canonical key sorts dict keys so the same arguments passed in a
        different order share one entry. The legacy key is the call-order
        encoding, kept so entries written by earlier runs are still found.
        """
        legacy_key = json.dumps(kwargs)
        try:
            canonical_key = json.dumps(kwargs, sort_keys=True)
        except TypeError:
            # Nested dicts with keys that cannot be ordered: keep call order.
            canonical_key = legacy_key
        return canonical_key, legacy_key

    def _save_to_file(self):
        """Write the whole cache to disk without exposing a partial file."""
        # Write to a sibling file first and swap it in, so an interrupted or
        # failed dump cannot leave the real cache file truncated.
        tmp_path = f"{self._persist_path}.tmp"
        with open(tmp_path, "w") as file:
            json.dump(self._cache, file)
        os.replace(tmp_path, self._persist_path)

    def _update_cache(self, key, value):
        """Store ``value`` under ``key`` and persist the cache."""
        with self._cache_lock:
            self._cache[key] = value
            self._save_to_file()

    def get_from_cache_or_load_cache(self, **kwargs):
        """Return the cached result for ``kwargs``, loading it on a miss.

        A stored value of ``None`` counts as a miss and triggers a reload.

        Returns:
            The cached value, or the fresh result of
            ``cache_loading_fn(**kwargs)``.
        """
        key, legacy_key = self._make_keys(kwargs)

        with self._cache_lock:
            value = self._cache.get(key, None)
            if value is None:
                value = self._cache.get(legacy_key, None)

        if value is None:
            # The loader runs outside the lock so slow calls do not block
            # other threads reading the cache.
            value = self._cache_loading_fn(**kwargs)
            self._update_cache(key, value)
        else:
            print("Loaded from cache")

        return value
    
    
    
# Cache setup (if needed)
# Module-level cache persisted to "cache_file.json". On a miss the lambda
# forwards the keyword arguments to openai.ChatCompletion.create.
cache = Cache("cache_file.json", lambda **kwargs: openai.ChatCompletion.create(**kwargs))

def cached_openai_ChatCompletion_create(**kwargs):
    """Answer a chat-completion request through the cache passed as ``cache``.

    The ``cache`` keyword is required (``KeyError`` when absent). Every other
    keyword argument is forwarded to its ``get_from_cache_or_load_cache``.
    """
    store = kwargs["cache"]
    request_args = {name: value for name, value in kwargs.items() if name != "cache"}
    return store.get_from_cache_or_load_cache(**request_args)

In [27]:
import pandas as pd
import openai
import time

# Read the key from the OPENAI_API_KEY environment variable so a real secret
# never has to be pasted into the notebook. The placeholder stays as the
# fallback so behaviour is unchanged when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "your_api_key")

def get_raw_response(content, model="gpt-3.5-turbo", max_tokens=150, temperature=0.2):
    """Ask the chat model for one question/answer pair about ``content``.

    Args:
        content: Paragraph text inserted into the prompt.
        model: Chat model name passed to the API.
        max_tokens: Upper bound on generated tokens. The prompt asks for an
            answer of more than three sentences, so raise this if replies
            come back cut off.
        temperature: Sampling temperature passed to the API.

    Returns:
        The object returned by ``openai.ChatCompletion.create``, unmodified.
    """
    prompt = f"""Please generate a question asking for the key information in the given paragraph.
    Also answer the questions using the information in the given paragraph.
    Please ask the specific question instead of the general question, like
    'What is the key information in the given paragraph?'.
    Please generate the answer using as much information as possible.
    If you are unable to answer it, please generate the answer as 'I don't know.'
    The answer should be informative and should be more than 3 sentences.

    Paragraph: {content}

    Question:"""

    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
    )

    return response

In [28]:
def generate_question_answer(content):
    """Generate a ``(question, answer)`` pair for one paragraph.

    Args:
        content: Paragraph text; ``None`` or an empty value skips the API call.

    Returns:
        A 2-tuple of strings. Empty input gives ``("", "N/A")``. When the
        reply cannot be read or contains no ``Answer:`` marker, the error
        message is returned in place of the question with ``"N/A"`` as answer.
    """
    if content is None or len(content) == 0:
        return "", "N/A"

    response = get_raw_response(content)
    try:
        text = response.choices[0].message["content"].strip()
        # Split on the first marker only: an answer that itself contains
        # "Answer:" would otherwise give three parts and be thrown away.
        question, answer = text.split('Answer:', 1)
        return question.strip(), answer.strip()
    except Exception as e:
        return str(e), "N/A"

In [29]:
def process_and_save_csv(input_csv, output_csv):
    """Add generated 'question' and 'answer' columns to a content CSV.

    Reads ``input_csv``, builds one paragraph per row from its 'title' and
    'content' columns, asks ``generate_question_answer`` for a pair, and
    writes the enlarged table to ``output_csv`` without the pandas index.
    A one-second pause follows every row.
    """
    frame = pd.read_csv(input_csv)

    qa_pairs = []
    for _, record in frame.iterrows():
        paragraph = f"{record['title']}\n{record['content']}"
        question, answer = generate_question_answer(paragraph)
        qa_pairs.append((question, answer))
        time.sleep(1)

    frame['question'] = [pair[0] for pair in qa_pairs]
    frame['answer'] = [pair[1] for pair in qa_pairs]

    frame.to_csv(output_csv, index=False)


In [31]:
# Input: the scraped content file written by the cleaning step. Output: the
# same rows plus the generated 'question' and 'answer' columns.
input_csv = './dataset/va_content.csv' 
output_csv = './dataset/processed_va_questions.csv'  

# Makes one generate_question_answer call per row, pausing one second between rows.
process_and_save_csv(input_csv, output_csv)

In [32]:
# Preview the generated question/answer file. It is read back from disk
# because the global `df` still holds the scraped content, which has no
# 'question'/'answer' columns (process_and_save_csv works on its own local frame).
qa_df = pd.read_csv(output_csv)
qa_df.head()

Unnamed: 0,source_url,title,content,index,question,answer
0,https://www.va.gov/disability/after-you-file-c...,The VA claim process after you file your claim...,Learn about what happens after you file your c...,0,What happens after you file a VA disability cl...,"After filing a VA disability claim, the claim ..."
1,https://www.va.gov/disability/after-you-file-c...,How long does it take VA to make a decision?. ...,147.2 days Average number of days to complete ...,1,What is the average number of days it takes fo...,The average number of days it takes for the VA...
2,https://www.va.gov/disability/after-you-file-c...,The time it takes to review your claim depends...,The time it takes to review your claim depends...,2,What factors determine the time it takes to re...,The time it takes to review a claim depends on...
3,https://www.va.gov/disability/after-you-file-c...,What should I do while I wait?. disability aft...,You don’t need to do anything unless we send y...,3,What actions should be taken while waiting for...,While waiting for a disability claim to be pro...
4,https://www.va.gov/disability/after-you-file-c...,What happens after I file a VA disability clai...,Claim received We’ll let you know when we rece...,4,What are the steps in the process that occurs ...,"After filing a VA disability claim, the first ..."
