In [4]:
import concurrent.futures
import json
import re
from urllib.parse import urlparse

import contractions
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [5]:
# JSON text is UTF-8 by specification; naming the codec keeps this load
# independent of the platform's default locale encoding.
with open('gptrc_urls.json', 'r', encoding='utf-8') as file:
    # Parse the whole file; later cells iterate over the result as a
    # sequence of per-question records.
    gptrc_urls = json.load(file)

print(len(gptrc_urls))

12451


In [6]:
gptrc_file = 'filtered_gptrc_data.xlsx'

# Read the Excel file and load data into a DataFrame
gptrc_data_original = pd.read_excel(gptrc_file)
# Work on an independent copy: a plain assignment would make both names refer
# to the same object, so any later edit to gptrc_data would also alter the
# frame that is meant to stay as the untouched original.
gptrc_data = gptrc_data_original.copy()
gptrc_data.head()

Unnamed: 0,question,keywords,unfiltered_answer,unfiltered_answer_length
0,How would I go about finding a burried treasure?,"finding, buried treasure",Finding buried treasure is often the stuff of ...,237
1,Do you consider AI a threat to human civilizat...,"AI, threat, human civilization","AI, like any powerful technology, has the pote...",182
2,Are you familiar with the concept of Yin and Y...,"concept, Yin and Yang","Yes, I am familiar with the concept of Yin and...",255
3,"What do you understand by the term, ""copyright""?",copyright,Copyright is a legal term that refers to the e...,119
4,Do ghosts exist?,"ghosts, exist",The question of whether ghosts exist is a matt...,124


In [7]:
# Index the answers once instead of rescanning the whole DataFrame column for
# every question. setdefault keeps the first row seen for a repeated question,
# which is the row that element [0] of the filtered column used to select.
answers_by_question = {}
for known_question, answer in zip(gptrc_data['question'], gptrc_data['unfiltered_answer']):
    answers_by_question.setdefault(known_question, answer)

for question in gptrc_urls:
    # Attach the matching unfiltered answer; None marks a question that does
    # not appear in the spreadsheet.
    question['llm_answer'] = answers_by_question.get(question['question'])

def write_results_to_file(results, output_file):
    """Serialize ``results`` as JSON into ``output_file`` and report the path.

    Args:
        results: JSON-serializable object to store.
        output_file: Destination path. It is opened in write mode, so an
            existing file at that path is overwritten.
    """
    with open(output_file, "w") as destination:
        json.dump(results, destination)
    print(f"Results have been saved to {output_file}")
    
# Persist the question records, now carrying their 'llm_answer' field, to a new JSON file
write_results_to_file(
    results=gptrc_urls,
    output_file="gptrc_urls_with_llm_answers.json",
)

Results have been saved to gptrc_urls_with_llm_answers.json


In [8]:

# Allow-list of domains treated as generally reliable for factual information.
# Each entry is compared against the end of a URL's host by
# filter_urls_by_domain, so subdomains of a listed domain are covered too.
reliable_domains = [
    # Academic institutions
    "harvard.edu",
    "stanford.edu",
    "gatech.edu",
    "uc.edu",
    "cam.ac.uk",
    # Government and intergovernmental bodies
    "nasa.gov",
    "noaa.gov",
    "fda.gov",
    "gov.uk",
    "europa.eu",
    # Medical institutions
    "mayoclinic.org",
    "hopkinsmedicine.org",
    "nih.gov",
    # Scientific journals
    "nature.com",
    "sciencemag.org",
    "nejm.org",
    # Reference works
    "britannica.com",
    "wikipedia.org",
    # News outlets
    "bbc.com",
    "nytimes.com",
    "washingtonpost.com",
]

# Function to filter URLs based on the domain reliability list
def filter_urls_by_domain(gptrc_urls, reliable_domains):
    """Keep, for every item, only the URLs hosted on a reliable domain.

    A URL qualifies when its host equals a listed domain or is a subdomain of
    one (e.g. "en.wikipedia.org" for "wikipedia.org"). The comparison is
    anchored on a dot boundary, so look-alike hosts such as "notharvard.edu"
    are rejected; a plain ``endswith`` on the raw netloc would accept them.

    Args:
        gptrc_urls: List of dicts, each holding a "urls" list of URL strings.
        reliable_domains: Iterable of trusted domain names.

    Returns:
        The same ``gptrc_urls`` list object. Each item's "urls" entry is
        replaced in place by its filtered list; items that end up with no
        URLs are kept (with an empty list).
    """
    # Normalise the allow-list once instead of once per URL.
    exact_hosts = {
        domain.lower().strip(".") for domain in reliable_domains if domain.strip(".")
    }
    subdomain_suffixes = tuple("." + domain for domain in exact_hosts)

    # Helper function to check if a URL's host is on a reliable domain
    def is_reliable(url):
        try:
            # .hostname is lower-cased and excludes any port or user-info,
            # both of which .netloc would carry into the comparison.
            host = urlparse(url).hostname
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): treat as untrusted.
            return False
        if not host:
            # Relative or scheme-less strings have no host to vouch for.
            return False
        host = host.rstrip(".")  # tolerate fully-qualified "example.com."
        # No "www." stripping is needed: "www.bbc.com" already matches the
        # ".bbc.com" subdomain suffix.
        return host in exact_hosts or host.endswith(subdomain_suffixes)

    # Filtering URLs
    for item in gptrc_urls:
        item["urls"] = [url for url in item["urls"] if is_reliable(url)]

    return gptrc_urls

# Keep only URLs on trusted domains. The function rewrites each item's "urls"
# list in place and returns the list it was given, so gptrc_reliable_urls and
# gptrc_urls refer to the same object afterwards.
gptrc_reliable_urls = filter_urls_by_domain(
    gptrc_urls=gptrc_urls,
    reliable_domains=reliable_domains,
)

In [9]:
# Snapshot the domain-filtered records; questions whose URL list became empty
# are still included at this stage.
write_results_to_file(
    results=gptrc_reliable_urls,
    output_file="gptrc_reliable_urls.json",
)

Results have been saved to gptrc_reliable_urls.json


In [10]:
def remove_empty_url_elements(gptrc_urls):
    """Return a new list holding only the items whose "urls" list is non-empty.

    The input list is left as is; the kept items are the same dict objects,
    in their original order.
    """
    kept_items = []
    for entry in gptrc_urls:
        if not entry["urls"]:
            # Nothing left to fetch for this question.
            continue
        kept_items.append(entry)
    return kept_items

# Drop the questions that were left without any URL after domain filtering
filtered_gptrc_urls = remove_empty_url_elements(gptrc_urls=gptrc_reliable_urls)
print(len(filtered_gptrc_urls))

3110


In [11]:
# Persist the final set of questions that still have at least one reliable URL
write_results_to_file(
    results=filtered_gptrc_urls,
    output_file="filtered_gptrc_urls.json",
)

Results have been saved to filtered_gptrc_urls.json


In [10]:
def preprocess_text(text):
    """Normalise one scraped paragraph into plain single-spaced ASCII text.

    Steps, in order: strip bracketed in-text citation markers, collapse
    whitespace, expand contractions via ``contractions.fix``, then drop every
    character that cannot be encoded as ASCII.

    Args:
        text: Raw paragraph text.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs applied one after another.
    substitutions = (
        (r'\[\w+\]', ''),  # in-text citations such as [1], [2], [a], [b]
        (r'\n+', ' '),     # newlines -> space
        (r'\t+', ' '),     # tabs -> space
        (r'\s+', ' '),     # any remaining whitespace run -> single space
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Expand contractions (e.g., "can't" to "cannot")
    expanded = contractions.fix(cleaned)

    # Encoding with 'ignore' silently discards non-ASCII characters.
    ascii_bytes = expanded.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')

def fetch_information(url):
    """Download a page and return the cleaned text of its paragraphs.

    Only ``<p>`` elements whose stripped text is longer than 30 characters are
    kept; each one is passed through ``preprocess_text`` and the results are
    joined with single spaces.

    Args:
        url: Address of the page to fetch (10-second request timeout).

    Returns:
        The joined paragraph text on success. Otherwise a message string:
        "Failed to fetch the webpage." for a non-200 status,
        "No relevant information found." when no paragraph qualifies, or
        "Request failed: ..." when ``requests`` raises.
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return "Failed to fetch the webpage."

        soup = BeautifulSoup(response.text, 'html.parser')
        stripped_texts = (tag.get_text().strip() for tag in soup.find_all('p'))
        # The length threshold also rules out empty paragraphs.
        kept_paragraphs = [
            preprocess_text(raw_text)
            for raw_text in stripped_texts
            if len(raw_text) > 30
        ]
        if not kept_paragraphs:
            return "No relevant information found." 
        return ' '.join(kept_paragraphs)
    except requests.exceptions.RequestException as e:
        return f"Request failed: {e}"    
    
def fetch_information_for_question(item):
    """Fetch the page text for every URL of one question, using up to 10 threads.

    Args:
        item: Dict with a "question" entry and a "urls" list.

    Returns:
        ``{"question": ..., "urls_info": [{"url": ..., "context": ...}, ...]}``.
        Entries are appended as downloads finish, so their order follows
        completion, not the order of ``item["urls"]``. If a worker raises,
        its "context" is an "Error fetching data: ..." message.
    """
    question_text = item["question"]
    collected = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        # Map each pending future back to the URL it is fetching.
        url_of = {}
        for url in item["urls"]:
            url_of[pool.submit(fetch_information, url)] = url

        for finished in concurrent.futures.as_completed(url_of):
            try:
                context = finished.result()
            except Exception as exc:
                context = f"Error fetching data: {exc}"
            collected.append({"url": url_of[finished], "context": context})
    return {"question": question_text, "urls_info": collected}

def fetch_information_from_urls_parallel(gptrc_urls):
    """Run ``fetch_information_for_question`` over all items with 5 worker threads.

    Args:
        gptrc_urls: Iterable of question items to process.

    Returns:
        List of per-question result dicts, in the order the questions finish
        (not necessarily the input order). An exception raised for any
        question propagates to the caller.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
        pending = [
            pool.submit(fetch_information_for_question, entry)
            for entry in gptrc_urls
        ]
        return [
            finished.result()
            for finished in concurrent.futures.as_completed(pending)
        ]

In [12]:
# Download and clean the page text for every remaining question (network-bound)
results = fetch_information_from_urls_parallel(gptrc_urls=filtered_gptrc_urls)

In [23]:
urls_file_path = "gptrc_urls_context.json"
# Reuse the notebook's shared JSON writer instead of repeating its body here;
# it dumps the object and prints the same "Results have been saved to ..." line.
write_results_to_file(results, urls_file_path)

Results have been saved to gptrc_urls_context.json
