The task involved scraping data from multiple web sources to build a dataset of 200 files, each containing a paragraph about the US presidential elections. The Python package Beautiful Soup was used for its robust HTML parsing, which made it possible to extract the relevant text from HTML documents quickly. Beautiful Soup made it straightforward to retrieve paragraphs selectively while maintaining data integrity and enforcing the predetermined word-count requirements.

In [None]:
#import necessary libraries and packages
import requests
from bs4 import BeautifulSoup
import os
import random
import re
import time

In [None]:
# Scrape one usable paragraph from a URL, skipping paragraphs that were
# already used and keeping only those with 80 to 150 words.
def webScrape_Data(url, visited_par):
    """Return one random, not-yet-used paragraph scraped from ``url``.

    Args:
        url: Address of the page to download.
        visited_par: Container of paragraph texts that must not be returned
            again (only membership tests are performed on it).

    Returns:
        The text of a randomly chosen ``<p>`` element whose whitespace-split
        word count is between 80 and 150 inclusive and which is not in
        ``visited_par``; ``None`` if the page has no such paragraph or if
        anything goes wrong while downloading or parsing.
    """
    try:
        # Without a timeout a single unresponsive host can block forever.
        response = requests.get(url, timeout=10)
        # Reject 4xx/5xx responses so that text from error or "access denied"
        # pages never ends up in the dataset.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract each paragraph's text once, then apply the word-count and
        # "not already used" filters.
        unvisited_par = []
        for p in soup.find_all('p'):
            text = p.get_text()
            if 80 <= len(text.split()) <= 150 and text not in visited_par:
                unvisited_par.append(text)

        if unvisited_par:
            return random.choice(unvisited_par)
        return None
    except Exception as e:
        # Best-effort boundary: report the problem and let the caller move on
        # to the next URL instead of aborting the whole run.
        print("Error occurred during web scraping:", e)
        return None


In [None]:
# Persist a piece of text to disk.
def write_data(data, file_path):
    """Write the string ``data`` to ``file_path`` as UTF-8 text.

    The file is opened in write mode, so an existing file at that path is
    replaced.
    """
    with open(file_path, mode='w', encoding='utf-8') as out_handle:
        out_handle.write(data)

In [None]:
# Create the US_Presidential_Elections folder that stores the 200 files.
# exist_ok=True makes this safe to re-run and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs('US_Presidential_Elections', exist_ok=True)

In [None]:
# Candidate pages to scrape; they are tried in the order listed here.
urls = [
    "https://edition.cnn.com/election",
    "https://en.wikipedia.org/wiki/2024_United_States_presidential_election",
    "https://www.nytimes.com/interactive/2023/us/politics/presidential-candidates-2024.html",
    "https://en.wikipedia.org/wiki/United_States_presidential_election",
    "https://www.usa.gov/election",
    "https://www.reuters.com/world/us/who-are-candidates-running-2024-us-presidential-election-2023-09-19/",
    "https://th.usembassy.gov/summary-of-the-u-s-presidential-election-process/",
    "https://www.presidency.ucsb.edu/statistics/elections",
    "https://en.wikipedia.org/wiki/2020_United_States_presidential_election",
    "https://www.britannica.com/topic/United-States-Presidential-Election-Results-1788863",
    "https://projects.fivethirtyeight.com/polls/president-general/2024/national/",
    "https://www.france24.com/en/tag/us-presidential-election/",
    "https://www.history.com/news/most-contentious-u-s-presidential-elections",
    "https://www.loc.gov/classroom-materials/united-states-history-primary-source-timeline/post-war-united-states-1945-1968/presidential-election-1960/",
    "https://dk.usembassy.gov/usa-i-skolen/presidential-elections-and-the-american-political-system/",
    "https://www.archives.gov/news/topics/presidential-elections",
    # NOTE(review): this looks like a Google Ads click-tracking redirect rather
    # than a content page - confirm whether it should be replaced by its target.
    "https://www.googleadservices.com/pagead/aclk?sa=L&ai=DChcSEwizxq_z84iFAxU5nGgJHfTNBLoYABAAGgJ3Zg&ase=2&gclid=Cj0KCQjw2PSvBhDjARIsAKc2cgOt9iPkboVYiQwhlvu3Ar3mSCGktahpljYlaWfnnR56_8UtU1TnXyoaAp6VEALw_wcB&ohost=www.google.com&cid=CAESVeD26of5leZt2PBnOvY2-rdtcc9amxpUAkWGBg6z8ixQnr-gSDqBlY4DFTVf-fuBGwSdmG7v0xQonWsaxDt0xaLsE2zKbawhccGh4TuYzMAnfAVRpkE&sig=AOD64_20Xk0_6vsKor5OOiIuONMwaU5fiA&q&nis=4&adurl&ved=2ahUKEwiE_6bz84iFAxWzT6QEHdJuAuYQ0Qx6BAgNEAE",
    "https://www.usa.gov/presidential-general-election",
    "https://www.270towin.com/historical-presidential-elections/",
    "https://www.livemint.com/news/world/us-presidential-election-process-explained-electoral-college-caucuses-primaries-donald-trump-vivek-ramaswamy-joe-biden-11705420470068.html",
    "https://ballotpedia.org/Presidential_election,_2024",
    "https://www.270towin.com/states/",
    "https://www.statista.com/topics/6273/us-presidential-elections-1789-2016/",
    "https://www.census.gov/newsroom/press-releases/2021/2020-presidential-election-voting-and-registration-tables-now-available.html",
    "https://www.mountvernon.org/library/digitalhistory/digital-encyclopedia/article/presidential-election-of-1789/",
    "https://www.statista.com/statistics/1034688/share-electoral-popular-votes-each-president-since-1789/",
    "https://www.history.com/topics/us-presidents/presidential-election-facts",
    "https://guides.loc.gov/american-history",
    "https://www.voanews.com/a/a-13-2008-11-04-voa71/401374.html",
    "https://www.everycrsreport.com/reports/RL30527.html",
    "https://www.loc.gov/collections/century-of-lawmaking/articles-and-essays/century-presentations/presidential-elections/",
    "https://www.archives.gov/electoral-college/key-dates",
    "https://www.cnn.com/politics/us-primaries-explained-what-matters/index.html",
    "https://www.usbank.com/investing/financial-perspectives/market-news/how-presidential-elections-affect-the-stock-market.html",
    "https://www.brookings.edu/articles/reflections-on-the-2000-u-s-presidential-election/",
    "https://libguides.rowan.edu/c.php?g=557440",
]

In [None]:
# Paragraph texts that have already been written to a file. The scraper
# tests membership in this set so the same paragraph is not stored twice;
# it starts out empty.
visited_par = set()

In [None]:
# Create the dataset files, named file_<n>.txt with n running up to 200.
# NOTE(review): this run starts at 133 rather than 1, presumably because
# files 1-132 were produced by an earlier run of this cell - confirm before
# changing the start index.
FIRST_FILE_INDEX = 133
LAST_FILE_INDEX = 200
OUTPUT_DIR = 'US_Presidential_Elections'

# When resuming part-way through, visited_par does not yet contain the
# paragraphs stored by earlier runs, so they could be scraped and written
# again. Seed the set from the files that already exist on disk.
if os.path.isdir(OUTPUT_DIR):
    for existing_name in os.listdir(OUTPUT_DIR):
        existing_path = os.path.join(OUTPUT_DIR, existing_name)
        is_dataset_file = (
            existing_name.startswith('file_')
            and existing_name.endswith('.txt')
            and os.path.isfile(existing_path)
        )
        if not is_dataset_file:
            continue
        with open(existing_path, 'r', encoding='utf-8') as existing_file:
            visited_par.add(existing_file.read())

for i in range(FIRST_FILE_INDEX, LAST_FILE_INDEX + 1):
    # Try the URLs in order until one yields an unused paragraph.
    for url in urls:
        # Scrape data from the current URL.
        data = webScrape_Data(url, visited_par)
        # A truthy result is a paragraph that passed the scraper's
        # word-count and uniqueness filters.
        if data:
            file_name = f'{OUTPUT_DIR}/file_{i}.txt'
            write_data(data, file_name)
            visited_par.add(data)
            print(f"File {i} created.")
            # Sleep to avoid overwhelming the server.
            time.sleep(random.uniform(1, 3))
            # This file is done; move on to the next index.
            break
        else:
            print("Failed to scrape data from the current URL. Trying a different URL.")
    else:
        # for/else: reached only when no URL produced a paragraph (no break).
        print("Failed to scrape data from all URLs.")


In [None]:


# Verify uniqueness across all files and detect any duplications.
def check_uniqueness(directory):
    """Find files in ``directory`` whose contents are exactly identical.

    Only regular files directly inside ``directory`` are examined;
    sub-directories are skipped. Each file is read as UTF-8 text.

    Args:
        directory: Path of the folder to scan.

    Returns:
        A dict mapping each file content that occurs in more than one file
        to the list of file names sharing it, in sorted name order. The dict
        is empty when every file is unique.
    """
    file_content_mapping = {}  # file content -> names of files holding it

    # os.listdir() returns names in arbitrary order; sort them so the
    # reported groups are reproducible from run to run.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Group file names under the content they hold.
        file_content_mapping.setdefault(content, []).append(filename)

    # Keep only the contents shared by two or more files.
    duplicate_files = {
        content: filenames
        for content, filenames in file_content_mapping.items()
        if len(filenames) > 1
    }

    return duplicate_files

directory = 'US_Presidential_Elections'
duplicate_files = check_uniqueness(directory)

# Report every group of files that share identical content: the shared text
# first, then one file name per line, then a blank separator line.
print("\nDuplicate Files:")
for shared_text, group in duplicate_files.items():
    print(f"Content: of duplicates \n{shared_text}\nDuplicate Files:")
    for line in (*group, ""):
        print(line)
