In [None]:
import os
import time
import pandas as pd
import requests
from requests.exceptions import HTTPError
from bs4 import BeautifulSoup
from openai import OpenAI

In [None]:
def generate_summary_with_openai(text, client, prompt, model="gpt-3.5-turbo"):
    """
    Sends `text` to an OpenAI chat model under the given system prompt.

    Args:
    text (str): Content passed as the user message.
    client: Object exposing `chat.completions.create(model=..., messages=...)`
        (an `openai.OpenAI` instance in this notebook).
    prompt (str): Instruction passed as the system message.
    model (str): Chat model name. Defaults to "gpt-3.5-turbo".

    Returns:
    str: The content of the first returned choice. If the call raises, the
    exception is NOT propagated; the string "Error in generating summary: <exc>"
    is returned instead, so callers must inspect the text to detect failures.
    """
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": text},
    ]
    try:
        completion = client.chat.completions.create(model=model, messages=messages)
        return completion.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort: one failed row must not abort a long batch run.
        return f"Error in generating summary: {e}"


In [None]:
import requests
from bs4 import BeautifulSoup, NavigableString

def fetch_death_section_summary(page_title, death_keywords, client):
    """
    Fetches a Wikipedia page and uses OpenAI to summarize its first death-related section.

    The page HTML is requested through the MediaWiki `action=parse` API. The first
    section heading containing any of `death_keywords` (case-insensitive substring
    match) is selected, and the text of the `p`/`ul`/`ol` elements that follow it,
    up to the next `h2`/`h3`, is sent to OpenAI twice: once for a summary and once
    for a short cause of death.

    Args:
    page_title (str): Title of the Wikipedia page.
    death_keywords (list): List of keywords to search for in section titles.
    client: OpenAI client forwarded to `generate_summary_with_openai`.

    Returns:
    tuple: `(cause_of_death, detailed_summary)`. Both elements are
    "No relevant section found" when no matching, non-empty section exists, and
    both are "Error: <detail>" when the request fails or the API response does not
    contain parsed page text (e.g. the title does not exist).
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'parse',
        'page': page_title,
        'prop': 'text',
        'format': 'json'
    }

    try:
        # A timeout keeps one stalled connection from hanging the whole batch.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers non-JSON bodies on requests versions whose
        # JSON decode error is not a RequestException subclass.
        return f"Error: {e}", f"Error: {e}"

    try:
        html = payload['parse']['text']['*']
    except (KeyError, TypeError):
        # The API reports problems such as a missing title inside the JSON body
        # (an 'error' object) rather than through the HTTP status code.
        detail = "unexpected API response"
        if isinstance(payload, dict) and isinstance(payload.get('error'), dict):
            detail = payload['error'].get('info', detail)
        return f"Error: {detail}", f"Error: {detail}"

    soup = BeautifulSoup(html, 'html.parser')
    # NOTE(review): this relies on headings being wrapped in span.mw-headline;
    # newer MediaWiki output may use different heading markup - confirm against
    # a live response.
    headings = soup.find_all('span', class_='mw-headline')
    death_section_heading = None
    for heading in headings:
        heading_text = heading.text.lower()
        if any(keyword.lower() in heading_text for keyword in death_keywords):
            death_section_heading = heading
            break

    if death_section_heading:
        content = []
        for sibling in death_section_heading.find_parent().find_next_siblings():
            if sibling.name in ['h2', 'h3']:
                # Stop at the next h2/h3 heading, whatever the level of the matched one.
                break
            if sibling.name in ['p', 'ul', 'ol']:
                content.append(sibling.get_text().strip())

        full_text = ' '.join(content).strip()
        if full_text:
            detailed_summary = generate_summary_with_openai(full_text,
                                                            client,
                                                            "Summarize the following text:")
            cause_of_death = generate_summary_with_openai(full_text,
                                                          client,
                                                          "Return the cause of death in the shortest form possible:")
            return cause_of_death, detailed_summary

    return "No relevant section found", "No relevant section found"

In [None]:
def augment_dataframe_with_death_summaries(csv_file_path, death_keywords, client, chunk_size=25, sleep_time=1,
                                           output_path='wiki_died_output.csv'):
    """
    Fills the 'Cause_of_Death' and 'Death_Summary' columns of a CSV of Wikipedia pages.

    Rows where both columns are empty are looked up one by one with
    `fetch_death_section_summary`, using the row's 'WIKI_PAGE' value as the page title.
    Progress is written to `output_path` every `chunk_size` processed rows and once
    more at the end, so an interrupted run loses at most one chunk.

    Args:
    csv_file_path (str): Input CSV; must contain a 'WIKI_PAGE' column. The two
        result columns are created if they are absent.
    death_keywords (list): Keywords forwarded to `fetch_death_section_summary`.
    client: OpenAI client forwarded to `fetch_death_section_summary`.
    chunk_size (int): Number of processed rows between saves.
    sleep_time (int | float): Seconds to sleep after each save.
    output_path (str): CSV file the progress is written to.

    Returns:
    pandas.DataFrame: The full frame, including rows that were already filled in.
    """
    # Load the existing data
    df = pd.read_csv(csv_file_path)

    result_columns = ('Cause_of_Death', 'Death_Summary')
    for column in result_columns:
        if column not in df.columns:
            df[column] = None
        # An all-empty column is read as float64; writing strings into it is
        # deprecated in pandas, so hold the results in an object column.
        df[column] = df[column].astype(object)

    # Keep only records not yet processed (both result columns still empty)
    unprocessed_df = df[df['Cause_of_Death'].isna() & df['Death_Summary'].isna()]

    total_rows = len(unprocessed_df)
    processed_rows = 0
    unsaved_changes = False

    for index, row in unprocessed_df.iterrows():
        try:
            cause, summary = fetch_death_section_summary(row['WIKI_PAGE'], death_keywords, client)
        except HTTPError as http_err:
            # The row is skipped (left empty), not retried. This only fires if the
            # fetch helper lets an HTTPError escape instead of returning "Error: ...".
            print(f"HTTP error occurred: {http_err}. Pausing for 30 minutes...")
            time.sleep(1800)  # Wait for 30 minutes
            continue

        df.at[index, 'Cause_of_Death'] = cause
        df.at[index, 'Death_Summary'] = summary
        processed_rows += 1
        unsaved_changes = True

        # Save progress after processing each chunk
        if processed_rows % chunk_size == 0 or processed_rows == total_rows:
            df.to_csv(output_path, index=False)
            unsaved_changes = False
            print(f"Processed {processed_rows}/{total_rows} rows. Sleeping for {sleep_time} seconds...")
            time.sleep(sleep_time)

    # If any row was skipped, processed_rows never reaches total_rows, so the
    # trailing partial chunk would otherwise never be written.
    if unsaved_changes:
        df.to_csv(output_path, index=False)

    return df

In [None]:

# Section-title keywords. Only death_keywords is passed on below;
# health_keywords is defined but not used in this cell.
death_keywords = [
    "Death",
    "Assassination",
    "Demise",
    "Murder",
    "Passed Away",
]
health_keywords = [
    "Health",
    "Personal Life",
]

# The OpenAI API key is read from the OPEN_AI environment variable, not hard-coded.
OPEN_AI = os.getenv("OPEN_AI")
client = OpenAI(api_key=OPEN_AI)

# 'wiki_died_input.csv' is expected in the working directory and must contain
# a 'WIKI_PAGE' column holding Wikipedia page titles.
updated_df = augment_dataframe_with_death_summaries(
    csv_file_path='wiki_died_input.csv',
    death_keywords=death_keywords,
    client=client,
)