<a href="https://colab.research.google.com/github/bhavya-1204/AI-human-texts/blob/main/wikipedia_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Load the two working CSV files from the current directory.
# A missing file is only reported with a message; in that case the
# corresponding DataFrame variable is left undefined.

# 'data' is read from Dset.csv.
if not os.path.exists('Dset.csv'):
    print('Add Dset.csv file first!')
else:
    data = pd.read_csv('Dset.csv')
    print("Loaded 'Dset.csv' into 'data' DataFrame.")

# 'names' is read from name.csv.
if not os.path.exists('name.csv'):
    print('Add name.csv file first!')
else:
    names = pd.read_csv('name.csv')
    print("Loaded 'name.csv' into 'names' DataFrame.")


Loaded 'Dset.csv' into 'data' DataFrame.
Loaded 'name.csv' into 'names' DataFrame.


In [3]:
# HTTP headers passed to requests.get when fetching Wikipedia pages.
# The User-Agent is a desktop Chrome string (presumably so the request is not
# treated as coming from a script -- confirm whether this is still required).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/143.0.0.0 Safari/537.36'
    ),
}

In [7]:
# Scrape one Wikipedia article chosen by the user and append its paragraphs to
# the 'data' DataFrame / Dset.csv, recording the article name in the 'names'
# DataFrame / name.csv so the same article is not scraped twice.
#
# Reads:  names, data, headers (defined in earlier cells)
# Writes: names, data, Dset.csv, name.csv

# Seconds to wait for Wikipedia before giving up; without a timeout
# requests.get can block indefinitely.
REQUEST_TIMEOUT_SECONDS = 10

# Surrounding whitespace is never part of an article title, so drop it before
# the duplicate check and before building the URL.
new_name_to_scrape = input("Enter the name to append from Wikipedia: ").strip()

if not new_name_to_scrape:
    # An empty title would request the bare /wiki/ path instead of an article.
    print("No name entered. No new data will be added.")
elif new_name_to_scrape in names.iloc[:, 0].values:
    # The first column of 'names' is the list of already-scraped names.
    print(f"'{new_name_to_scrape}' already exists in the list of scraped names. No new data will be added.")
else:
    print(f"Attempting to scrape data for '{new_name_to_scrape}'...")
    try:
        # Percent-encode the title so characters such as '?' or '#' stay part
        # of the path instead of being read as a query string or fragment.
        response = requests.get(
            f'https://en.wikipedia.org/wiki/{quote(new_name_to_scrape)}',
            headers=headers,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        # Turn 4xx/5xx responses (e.g. a missing article) into an HTTPError,
        # which is a RequestException and is reported by the handler below,
        # rather than parsing an error page as if it were the article.
        response.raise_for_status()
        webpage = response.text
        soup = BeautifulSoup(webpage, 'lxml')

        # Locate the article body. A CSS selector matches a div carrying both
        # classes in any order, whereas find(class_='a b') only matches that
        # exact attribute string.
        main_content_div = soup.select_one('div.mw-content-ltr.mw-parser-output')

        if main_content_div:
            # Keep the stripped text of every paragraph that is not empty.
            new_paragraphs = [p.text.strip() for p in main_content_div.find_all('p') if p.text.strip()]

            if new_paragraphs:
                # Build both new frames before touching 'names' or 'data': if a
                # column mismatch raises here, neither DataFrame has been
                # modified and the name is not wrongly marked as scraped.
                new_name_df = pd.DataFrame([new_name_to_scrape], columns=names.columns)
                new_data_df = pd.DataFrame(new_paragraphs, columns=data.columns)

                updated_data = pd.concat([data, new_data_df], ignore_index=True)
                updated_names = pd.concat([names, new_name_df], ignore_index=True)

                # Persist both files; name.csv must be saved too, otherwise the
                # duplicate check above is lost when the kernel restarts.
                updated_data.to_csv('Dset.csv', index=False)
                updated_names.to_csv('name.csv', index=False)

                # Only now replace the in-memory frames.
                data = updated_data
                names = updated_names

                print(f"Successfully scraped and appended data for '{new_name_to_scrape}' to Dset.csv.")
            else:
                print(f"No significant paragraph data found on Wikipedia for '{new_name_to_scrape}'.")
        else:
            print(f"Could not find the main content div on Wikipedia for '{new_name_to_scrape}'.")

    except requests.exceptions.RequestException as e:
        # Network failures, timeouts and HTTP error statuses end up here.
        print(f"Error fetching Wikipedia page for '{new_name_to_scrape}': {e}")
    except Exception as e:
        # Anything else (parsing, DataFrame construction, file writing).
        print(f"An unexpected error occurred during scraping for '{new_name_to_scrape}': {e}")

Enter the name to append from Wikipedia:  f1


Attempting to scrape data for 'f1'...
Successfully scraped and appended data for 'f1' to Dset.csv.
