| **Author**          | **Roll No**   | **Version** |
|---------------------|----------------|--------------|
| Abhyudaya Nair      | 24210005      | 1.0          |

### Link Crawling

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import os
from urllib.parse import urljoin, urlparse

# Define the base URL
base_url = "https://www.propublica.org/"

# Define the file to store article links
csv_file = 'Abhyudaya/Scripts and Data/propublicadotorg_article_links.csv'
progress_file = 'Abhyudaya/Scripts and Data/links_to_traverse.txt'  # File to store progress

# Store visited links to avoid reprocessing and duplicates
visited_links = set()
links_to_traverse = []  # Initialize as empty list
article_links = set()  # To store unique article links

# Write the article link to CSV
def save_article_link(link):
    with open(csv_file, 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([link])

# Function to fetch and parse the HTML content
def get_html_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return None

# Extracts all unique links from a given page
def extract_links(soup, base_url):
    links = set()
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        full_url = urljoin(base_url, href)  # Construct full URL

        # Ensure the link belongs to propublica.org and is a valid HTTP link
        if "propublica.org" in urlparse(full_url).netloc and full_url.startswith("http"):
            links.add(full_url)
    return links

# Process the current URL and extract new links
def process_link(url):
    soup = get_html_content(url)
    if soup is None:
        return

    # Extract all unique links from the current page
    links = extract_links(soup, url)

    # Filter and save article links
    for link in links:
        if "propublica.org/article" in link and link not in article_links:
            article_links.add(link)
            save_article_link(link)
            print(f"Saved article link: {link}")

    # Add unvisited links to the links_to_traverse list
    for link in links:
        if link not in visited_links:
            links_to_traverse.append(link)

# Load previously saved links from the progress file
def load_progress():
    if os.path.exists(progress_file):
        with open(progress_file, 'r') as file:
            return [line.strip() for line in file.readlines()]
    return []

# Save the current state of links_to_traverse to the progress file
def save_progress():
    with open(progress_file, 'w') as file:
        for link in links_to_traverse:
            file.write(link + '\n')

# Main crawling loop
links_to_traverse = load_progress() or [base_url]  # Load progress or start fresh

while links_to_traverse:
    current_link = links_to_traverse.pop(0)
    if current_link not in visited_links:
        print(f"Processing: {current_link}")
        visited_links.add(current_link)
        process_link(current_link)
        
        # Save progress after processing each link
        save_progress()

# Final save of progress when done
save_progress()
print("Crawling complete. Progress saved.")


Processing: https://www.propublica.org/
Saved article link: https://www.propublica.org/article/texas-medicaid-unwinding-consequences
Saved article link: https://www.propublica.org/article/gaza-palestine-israel-blocked-humanitarian-aid-blinken
Saved article link: https://www.propublica.org/article/wisconsin-lac-du-flambeau-tribe-lending-brian-coughlin-bankruptcy-lawsuit
Saved article link: https://www.propublica.org/article/abortion-pills-safety-questions-answered
Saved article link: https://www.propublica.org/article/telegram-terrorgram-collective-extremism-accelerationists-dallas-humber-matthew-allison
Saved article link: https://www.propublica.org/article/epa-scientists-faced-retaliation-after-finding-harm-from-chemicals
Saved article link: https://www.propublica.org/article/georgia-abortion-ban-amber-thurman-death
Saved article link: https://www.propublica.org/article/candi-miller-abortion-ban-death-georgia
Saved article link: https://www.propublica.org/article/supreme-court-chevron

  k = self.parse_starttag(i)


Processing: https://www.propublica.org/people/eli-hager
Saved article link: https://www.propublica.org/article/child-welfare-search-seizure-without-warrants
Saved article link: https://www.propublica.org/article/how-we-analyzed-child-welfare-investigation-data
Saved article link: https://www.propublica.org/article/arizona-matthew-stewart-katie-hobbs-dcs
Saved article link: https://www.propublica.org/article/arizona-school-vouchers-budget-meltdown
Saved article link: https://www.propublica.org/article/for-black-families-in-phoenix-child-welfare-investigations-are-constant-threat
Saved article link: https://www.propublica.org/article/some-constitutional-rights-dont-apply-in-child-welfare
Saved article link: https://www.propublica.org/article/more-states-allow-child-support-to-reach-children
Saved article link: https://www.propublica.org/article/new-mexico-lost-juveniles-in-prison
Saved article link: https://www.propublica.org/article/expert-in-foster-care-cases-admits-her-method-is-unsci

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2022-1st-interim-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2021-2nd-interim-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/reports/p2
Processing: https://assets-c3.propublica.org/pdf/reports/2021-Pro-Publica-Form-990.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/reports/#footer-links
Processing: https://assets-c3.propublica.org/pdf/reports/Propublica-Annual-Report-2023.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2021-1st-interim-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/2021AnnualReportFinal2.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/2020-Form-990-Pro-Publica-Inc-1.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/Propublica-2023-2nd-interim-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/ProPublica-2024-1st-Impact-Report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/2022-Financial-Statements-for-Pro-Publica-Inc.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/cdn-cgi/l/email-protection#87eae2e3eee6f5eee0eff3f4c7f7f5e8f7f2e5ebeee4e6a9e8f5e0
Processing: https://assets-c3.propublica.org/pdf/reports/2019-Form-990-Pro-Publica-Inc.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/reports/#main
Processing: https://assets-c3.propublica.org/pdf/reports/2021-Financial-Statements-for-Pro-Publica-Inc.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/reports/p4
Processing: https://assets-c3.propublica.org/pdf/reports/Pro-Publica-Financial-Statement-2023.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2020-annual-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/Propublica-2022-2nd-interim-report_2022-09-28-202019_jrtc.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/2023-ProPublica-Form-990.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/2020-Financial-Statements-for-Pro-Publica-Inc.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/Propublica-2023-1st-interim-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/cdn-cgi/l/email-protection#f894919b9d968b91969fb8888a97888d9a94919b99d6978a9f
Processing: https://assets-c3.propublica.org/pdf/reports/2022-Annual-Report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/reports/p3
Processing: https://www.propublica.org/series/zero-trust
Processing: https://www.propublica.org/cdn-cgi/l/email-protection#305c5953555e43595e577040425f4045525c5953511e5f4257
Processing: https://www.propublica.org/article/cyber-safety-board-never-investigated-solarwinds-breach-microsoft
Saved article link: https://www.propublica.org/article/cyber-safety-board-never-investigated-solarwinds-breach-microsoft#main
Processing: https://www.propublica.org/series/power-hungry
Processing: https://www.propublica.org/article/telegram-terrorgram-collective-extremism-accelerationists-dallas-humber-matthew-allison#main
Processing: https://www.propublica.org/people/jeff-kao
Saved article link: https://www.propublica.org/article/a-visionary-without-a-country-chinese
Saved article link: https://www.propublica.org/article/liu-tao-trump-meeting-china-investigation
Saved article link: https://www.propublica.org/article/a-visionary-without-a-country
Saved ar

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-d.propublica.org/v5/images/stillbirthmemorial_social-media-assets.zip


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/article/how-to-participate-in-propublicas-stillbirths-memorial#main
Processing: https://projects.propublica.org/stillbirth-memorial
Processing: https://www.propublica.org/article/st-lukes-in-houston-blistering-report-details-serious-safety-lapses
Saved article link: https://www.propublica.org/article/st-lukes-in-houston-blistering-report-details-serious-safety-lapses#footer-links
Saved article link: https://www.propublica.org/article/st-lukes-in-houston-blistering-report-details-serious-safety-lapses#modal-republish
Saved article link: https://www.propublica.org/article/st-lukes-in-houston-blistering-report-details-serious-safety-lapses#main
Processing: https://www.propublica.org/article/prominent-texas-surgeon-sues-propublica-and-the-houston-chronicle
Saved article link: https://www.propublica.org/article/prominent-texas-surgeon-sues-propublica-and-the-houston-chronicle#footer-links
Saved article link: https://www.propublica.org/article/prominent

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/article/join-us-for-an-event-about-our-investigation-into-arizonas-intellectual-and-developmental-disability-services
Saved article link: https://www.propublica.org/article/join-us-for-an-event-about-our-investigation-into-arizonas-intellectual-and-developmental-disability-services#modal-republish
Saved article link: https://www.propublica.org/article/join-us-for-an-event-about-our-investigation-into-arizonas-intellectual-and-developmental-disability-services#footer-links
Processing: https://www.propublica.org/article/arizona-developmental-disabilities-hacienda-healthcare-phoenix
Saved article link: https://www.propublica.org/article/arizona-developmental-disabilities-hacienda-healthcare-phoenix#footer-links
Saved article link: https://www.propublica.org/article/arizona-developmental-disabilities-hacienda-healthcare-phoenix#modal-republish
Processing: https://www.propublica.org/article/they-made-a-revolutionary-system-to-protect-people-with-develo

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/article/i-thought-arizona-was-rated-high-for-disability-services-but-that-is-wrong-plain-text
Saved article link: https://www.propublica.org/article/i-thought-arizona-was-rated-high-for-disability-services-but-that-is-wrong-plain-text#modal-republish
Saved article link: https://www.propublica.org/article/i-thought-arizona-was-rated-high-for-disability-services-but-that-is-wrong-plain-text#footer-links
Processing: https://assets-c3.propublica.org/20201112_-sm-az_mainbar_ppstar-copyfinal-1.mp3


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/article/she-needs-a-device-to-communicate-the-state-has-kept-it-from-her-for-18-months
Saved article link: https://www.propublica.org/article/she-needs-a-device-to-communicate-the-state-has-kept-it-from-her-for-18-months#footer-links
Saved article link: https://www.propublica.org/article/she-needs-a-device-to-communicate-the-state-has-kept-it-from-her-for-18-months#modal-republish
Processing: https://assets-c3.propublica.org/20201106-stumpfsidebar.mp3


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/article/he-has-a-developmental-disability-and-needs-a-caretaker-the-state-suggested-diapers-instead
Saved article link: https://www.propublica.org/article/he-has-a-developmental-disability-and-needs-a-caretaker-the-state-suggested-diapers-instead#modal-republish
Saved article link: https://www.propublica.org/article/he-has-a-developmental-disability-and-needs-a-caretaker-the-state-suggested-diapers-instead#footer-links
Processing: https://www.propublica.org/article/prometieron-ayuda-para-las-personas-con-discapacidades-de-desarrollo-en-lugar-de-recibirla-enfrentan-retrasos-y-rechazos
Saved article link: https://www.propublica.org/article/prometieron-ayuda-para-las-personas-con-discapacidades-de-desarrollo-en-lugar-de-recibirla-enfrentan-retrasos-y-rechazos#footer-links
Saved article link: https://www.propublica.org/article/prometieron-ayuda-para-las-personas-con-discapacidades-de-desarrollo-en-lugar-de-recibirla-enfrentan-retrasos-y-rechazos#modal

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/getinvolved/whats-your-experience-with-intellectual-and-developmental-disability-care-in-arizona
Processing: https://assets-c3.propublica.org/20201113_drewsidebar-plaintext.mp3


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/atpropublica/propublica-and-partners-nominated-for-seven-ona-awards
Processing: https://www.propublica.org/article/arizona-promised-to-help-people-with-developmental-disabilities-but-some-had-to-wait-a-long-time-some-did-not-get-help-at-all-plain-text
Saved article link: https://www.propublica.org/article/arizona-promised-to-help-people-with-developmental-disabilities-but-some-had-to-wait-a-long-time-some-did-not-get-help-at-all-plain-text#modal-republish
Saved article link: https://www.propublica.org/article/arizona-promised-to-help-people-with-developmental-disabilities-but-some-had-to-wait-a-long-time-some-did-not-get-help-at-all-plain-text#footer-links
Processing: https://www.propublica.org/article/plain-text-arizona-developmental-disabilities-hacienda-phoenix
Saved article link: https://www.propublica.org/article/plain-text-arizona-developmental-disabilities-hacienda-phoenix#footer-links
Saved article link: https://www.propublica.org/article/

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/atpropublica/propublica-local-reporting-network-partner-amy-silverman-named-arizona-journalist-of-the-year
Processing: https://www.propublica.org/atpropublica/two-propublica-local-reporting-network-projects-named-finalists-for-shadid-award-for-journalism-ethics
Processing: https://www.propublica.org/article/he-has-a-developmental-disability-he-needs-a-helper-arizona-said-he-could-wear-diapers-instead
Saved article link: https://www.propublica.org/article/he-has-a-developmental-disability-he-needs-a-helper-arizona-said-he-could-wear-diapers-instead#modal-republish
Saved article link: https://www.propublica.org/article/he-has-a-developmental-disability-he-needs-a-helper-arizona-said-he-could-wear-diapers-instead#footer-links
Processing: https://www.propublica.org/article/join-us-to-talk-about-the-stories-we-wrote-about-people-with-developmental-disabilities-in-arizona
Saved article link: https://www.propublica.org/article/join-us-to-talk-about-the-s

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/article/how-we-wrote-the-story
Saved article link: https://www.propublica.org/article/how-we-wrote-the-story#modal-republish
Saved article link: https://www.propublica.org/article/how-we-wrote-the-story#footer-links
Processing: https://www.propublica.org/article/illinois-crisis-institution-placement#main
Processing: https://www.propublica.org/article/illinois-choate-employee-camera-caught-beating-patient
Saved article link: https://www.propublica.org/article/illinois-choate-employee-camera-caught-beating-patient#main
Processing: https://www.propublica.org/article/muscular-dystrophy-patient-olympic-medalist-same-genetic-mutation
Saved article link: https://www.propublica.org/article/muscular-dystrophy-patient-olympic-medalist-same-genetic-mutation#disqus_thread
Saved article link: https://www.propublica.org/article/muscular-dystrophy-patient-olympic-medalist-same-genetic-mutation#update
Processing: https://www.propublica.org/article/surgery-risks-p

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2017-1st-interim-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/2018-Form-990.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-990-2017.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2020-1st-interim-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-illinois-report-for-2018.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/reports/p2#main
Processing: https://www.propublica.org/reports/p2#footer-links
Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2017-2nd-interim-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2019-annual-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/2017-Financial-Statements-for-Pro-Publica-Inc.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-FS-2016.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/2019-Financial-Statements-for-Pro-Publica-Inc.PDF


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2018-annual-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2018-2nd-interim-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2016-annual-report_170210_153050.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2018-1st-interim-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2017-annual-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica_990_2016.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2019-2nd-interim-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/2018-Financial-Statements-for-Pro-Publica-Inc.-PDF.PDF


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2019-1st-interim-report.pdf.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica_990_2011.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/ProPublica_FS_2011.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/Pro_Publica_FS09.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/PP_2010_annualrep_forWEB.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/Pro_Publica_FS_2008.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/990-for2009.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica_990_2010.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/reports/p4#main
Processing: https://assets-c3.propublica.org/pdf/reports/ProPublica-Reports-September-2010.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/reports/p4#footer-links
Processing: https://assets-c3.propublica.org/pdf/reports/PP_Report_to_Stakeholders_May-August_2012.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/Pro_Publica_FS_2010.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/990-ProPublica-2007.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/PP_2011_May-Aug_Interim_report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica_report_may2010.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/assets/about/2011-Annual-Report_final.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/PP_Report_to_Stakeholders_January-April_2012.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica_2012report_final_170210_164448.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/990-ProPublica-2008.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica_2013report_final_170210_155501.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2014-annual-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica_990_2014.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2016-1st-interim-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2015-1st-interim-report_170210_154303.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/reports/p3#main
Processing: https://www.propublica.org/reports/p3#footer-links
Processing: https://assets-c3.propublica.org/pdf/reports/ProPublica_FS_2013_170210_155021.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica_990_2015.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica_990_2012.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/ProPublica-990-for-2013.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2016-2nd-interim-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/ProPub_2013_1streport_final.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/assets/about/ProPub-1st-Interim-Rep-2014_r3.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2014-2nd-interim-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2015-annual-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/assets/about/ProPublica-Financial-Statements-2014.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/ProPublica-FS-2012.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/propublica-2015-2nd-interim-report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/pdf/reports/ProPublica_2013_2ndreport_final.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://assets-c3.propublica.org/assets/about/2015-Pro-Publica-Inc-Financial-Statements.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing: https://www.propublica.org/series/zero-trust#main
Processing: https://www.propublica.org/cdn-cgi/l/email-protection#f49991909d95869d939c8087b484869b848196989d9795da9b8693
Processing: https://www.propublica.org/cdn-cgi/l/email-protection#c685b4a7afa1e895afaab0a3b4aba7a886b6b4a9b6b3a4aaafa5a7e8a9b4a1
Processing: https://www.propublica.org/article/cyber-safety-board-never-investigated-solarwinds-breach-microsoft#main
Processing: https://www.propublica.org/series/power-hungry#main
Processing: https://www.propublica.org/people/jeff-kao/p2
Processing: https://www.propublica.org/article/a-visionary-without-a-country-chinese
Saved article link: https://www.propublica.org/article/a-visionary-without-a-country-chinese#main
Processing: https://www.propublica.org/article/liu-tao-trump-meeting-china-investigation
Saved article link: https://www.propublica.org/article/liu-tao-trump-meeting-china-investigation#main
Processing: https://www.propublica.org/people/jeff-kao#main
Processing: ht

### Text Extraction

In [None]:
import csv
import requests
import os
from bs4 import BeautifulSoup

# File paths
csv_file_path = 'Abhyudaya/Scripts and Data/propublicadotorg_article_links.csv'
visited_links_file = 'Abhyudaya/Scripts and Data/propublicadotorg_visited_links.txt'
txt_files_directory = 'Abhyudaya/Text Files/propublicadotorg'

# Ensure the directory exists
if not os.path.exists(txt_files_directory):
    os.makedirs(txt_files_directory)

# Read the links from the CSV file
def get_links(csv_file_path):
    with open(csv_file_path, newline='', encoding='utf-8') as file:
        return [row[0] for row in csv.reader(file) if row]

# Read the visited links from visited_links.txt
def get_visited_links(visited_links_file):
    if os.path.exists(visited_links_file):
        with open(visited_links_file, 'r', encoding='utf-8') as file:
            return set(line.strip() for line in file)
    return set()

# Write the visited link to visited_links.txt
def add_to_visited(link):
    with open(visited_links_file, 'a', encoding='utf-8') as file:
        file.write(link + '\n')

# Fetch content from the link and extract the relevant text using BeautifulSoup
def fetch_and_extract_text(link):
    try:
        response = requests.get(link)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the div with class="article-body"
        article_body = soup.find('div', class_='article-body')
        if not article_body:
            print(f"No valid article-body found in {link}, skipping.")
            return None

        # First, try to extract text from <p> tags with data-pp-blocktype="copy"
        paragraphs = [p.get_text() for p in article_body.find_all('p', {'data-pp-blocktype': 'copy'})]

        # If no paragraphs with 'data-pp-blocktype="copy"' are found, extract from all <p> tags
        if not paragraphs:
            paragraphs = [p.get_text() for p in article_body.find_all('p')]
            if not paragraphs:
                print(f"No valid text found in {link}, skipping.")
                return None

        return '\n'.join(paragraphs)  # Combine all paragraphs into a single string
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {link}: {e}")
        return None

# Save content to a txt file in the specified directory
def save_to_file(content, file_number):
    file_path = os.path.join(txt_files_directory, f'{file_number}.txt')
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)

# The main logic to process the links
links = get_links(csv_file_path)
visited_links = get_visited_links(visited_links_file)

for index, link in enumerate(links):
    if link in visited_links:
        print(f"Skipping {link}, already visited.")
        continue

    content = fetch_and_extract_text(link)
    if content:
        save_to_file(content, index + 1)  # Save as 1.txt, 2.txt, etc.
        add_to_visited(link)  # Mark link as visited
        print(f"Saved content from {link} to {txt_files_directory}/{index + 1}.txt")
