In [None]:
import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse
from docx import Document

# Create a directory for the documents (independent of the browser session)
output_folder = "scraped_articles"
os.makedirs(output_folder, exist_ok=True)

# Set up the Selenium WebDriver (with ChromeDriver)
options = Options()
options.add_argument("--headless")  # Run headless (no UI)

# URL of the archive page to scrape
url = "https://medium.com/tag/2023/archive/2023"

# Scrolling parameters
scroll_pause_time = 10  # Seconds to wait after each scroll for new content to load
max_scrolls = 50  # Upper bound on the number of scrolls
scroll_count = 0  # Counter to track the number of scrolls

# ChromeDriverManager().install() fetches a ChromeDriver binary and returns its path
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Everything that talks to the browser runs inside try/finally so the headless
# Chrome process is always shut down, even if the wait below times out or a
# script call raises.
try:
    driver.get(url)

    # Wait (up to 20 s) until at least one <article> element is present
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, 'article')))

    scroll_height = driver.execute_script("return document.body.scrollHeight")

    # Scroll to the bottom repeatedly to trigger loading of more content
    while scroll_count < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)

        # Calculate new scroll height after scrolling
        new_scroll_height = driver.execute_script("return document.body.scrollHeight")

        if new_scroll_height == scroll_height:
            break  # Page height did not change: no more content was loaded

        scroll_height = new_scroll_height
        scroll_count += 1

    # Snapshot the fully scrolled page; parsing does not need the browser
    html_content = driver.page_source
finally:
    # Close the browser
    driver.quit()

soup = BeautifulSoup(html_content, "lxml")

# Find all article links: every <div> carrying a 'data-href' attribute.
# dict.fromkeys drops repeated URLs while keeping first-seen order, so the same
# article is not downloaded (and its file overwritten) more than once; empty
# attribute values are skipped because they cannot be requested.
links = list(dict.fromkeys(
    div['data-href']
    for div in soup.find_all('div', {'data-href': True})
    if div['data-href']
))

print(f"Found {len(links)} articles.")

# Loop through each article link, extract its paragraphs and save them as a
# .docx file in output_folder. Failures are reported per link and do not stop
# the remaining downloads.

# Seconds to wait on connect/read for one article; without a timeout a single
# stalled connection would block the whole run indefinitely.
request_timeout = 30

for link in links:
    try:
        # Send request to the article link
        response = requests.get(link, timeout=request_timeout)
        response.raise_for_status()  # Ensure we got a successful response
        article_soup = BeautifulSoup(response.text, "lxml")

        # Find the article content
        article = article_soup.find('article')
        if article is None:
            # Report the skip instead of silently producing nothing
            print(f"Skipped {link}: no <article> element found.")
            continue

        doc = Document()  # Create a new Document for each article page

        # Copy every non-empty <p> inside the article into the document
        for para in article.find_all('p'):
            # get_text(strip=True) strips each text fragment and joins them
            # with "", which glues words together around inline tags such as
            # <a>/<em>/<strong>. Take the raw text and normalise whitespace.
            para_text = " ".join(para.get_text().split())

            if para_text:  # Only add non-empty paragraphs
                doc.add_paragraph(para_text)

        # Derive the file name from the last path segment of the URL.
        # Trailing slashes are removed first, otherwise basename() is empty
        # and the file would be saved as just ".docx".
        parsed_url = urlparse(link)
        file_name = os.path.basename(parsed_url.path.rstrip("/")).replace("-", "_")
        if not file_name:
            # URL has no path segment; fall back to the host name
            file_name = parsed_url.netloc.replace(".", "_") or "article"
        word_file_name = os.path.join(output_folder, f"{file_name}.docx")

        # Save the document
        doc.save(word_file_name)
        print(f"Saved: {word_file_name}")

    except Exception as e:
        # Per-article boundary: log the failure and continue with the next link
        print(f"Failed to process {link}. Error: {e}")


Found 510 articles.
Saved: scraped_articles\2023_um_ano_de_resilência_e_o_traço_da_eternidade_2023_a_year_of_resilience_and_the_trace_of_86e13e106ef5.docx
Saved: scraped_articles\2023_a_pivotal_year_in_the_evolution_of_ai_ea4f019e386d.docx
Saved: scraped_articles\goodbye_2023_dc426383908c.docx
Saved: scraped_articles\little_more_some_more_and_it_spilled_cb149a397218.docx
Saved: scraped_articles\my_2023_verklempt_divided_5eb02bd0336e.docx
Saved: scraped_articles\observations_from_studying_my_successful_and_unsuccessful_articles_of_2023_3c799190b6f6.docx
Saved: scraped_articles\goodbye_2023_hello_2024_3a3e1f3d62c3.docx
Saved: scraped_articles\a_final_fuck_you_719a6c4bccbd.docx
Saved: scraped_articles\2023_mph_10da6904c629.docx
Saved: scraped_articles\setting_up_for_leap_ba9bf6315c40.docx
Saved: scraped_articles\whatever_has_a_beginning_surely_has_an_end_d8ec5995ff99.docx
Saved: scraped_articles\embers_bce6f5f3b068.docx
Saved: scraped_articles\5_lessons_im_taking_out_of_2023_27df53d61728.

Saved: scraped_articles\2023_uncaging_the_birds_9afe88ccff9a.docx
Saved: scraped_articles\reflecting_on_2023_a_year_of_growth_and_resilience_c98f8e1f7a1c.docx
Saved: scraped_articles\2023_is_incredible_06a9ccd3e6ff.docx
Saved: scraped_articles\2023_회고_9e36db8aaecf.docx
Saved: scraped_articles\how_they_made_us_feel_the_best_songs_of_2023_702573355661.docx
Saved: scraped_articles\2023_a_year_of_words_wins_and_spiritual_resonance_b75958ef730f.docx
Saved: scraped_articles\seven_epiphanies_lessons_2023_in_review_e8e3e89a7b48.docx
Saved: scraped_articles\writing_in_2023_806c978a75e5.docx
Saved: scraped_articles\doei_doei_2023_bd110ea83d8d.docx
Saved: scraped_articles\my_app_discoveries_in_2023_d9279a078c1f.docx
Saved: scraped_articles\on_the_cusp_0d5265eeb9d9.docx
Saved: scraped_articles\2023_its_time_to_get_off_the_dance_floor_2f0af1141c6f.docx
Saved: scraped_articles\goodbye_to_all_breakups_in_2023_dbc2e293432a.docx
Saved: scraped_articles\2023_개발자_회고_b73cf93d443d.docx
Saved: scraped_artic

Saved: scraped_articles\5_best_films_i_saw_in_2023_ca8aa8bbc271.docx
Saved: scraped_articles\a_wrap_up_for_2023_0305e3ebe6a2.docx
Saved: scraped_articles\2023_年度總結_自由和控制值得一切_6726c0b5b177.docx
Saved: scraped_articles\2023_a_full_year_bdece7c38deb.docx
Saved: scraped_articles\개발자_2023_기술회고를_하다_66028d5359bd.docx
Saved: scraped_articles\2023_the_privilege_called_hindsight_2f613d3125cf.docx
Saved: scraped_articles\2023_a_review_of_the_year_i_explored_77166e0cd7d7.docx
Saved: scraped_articles\navigating_the_ai_and_ml_labyrinth_my_2023_journey_a7024212ed78.docx
Saved: scraped_articles\2023_a_year_to_remember_6d60f7f75439.docx
Saved: scraped_articles\post_it_note_wisdom_d7b0c4ce88c7.docx
Saved: scraped_articles\most_popular_business_trends_in_2023_956397e99203.docx
Saved: scraped_articles\on_the_brink_of_the_1_000_followers_can_i_achieve_this_before_the_year_2024_begins_efea86497a0e.docx
Saved: scraped_articles\my_2023_recap_embracing_growth_and_transformation_14141cfc398e.docx
Saved: scraped_

Saved: scraped_articles\jcs_top_albums_of_2023_804533383f06.docx
Saved: scraped_articles\2023_my_year_in_unconventional_categories_e27a8cf349e9.docx
Saved: scraped_articles\let_me_close_the_last_chapter_of_2023_ef534c38f166.docx
Saved: scraped_articles\prime_protocols_2023_year_in_review_623966f17507.docx
Saved: scraped_articles\yin_review_2023_affa4145853b.docx
Saved: scraped_articles\the_best_books_i_read_in_2023_fc532fa825f5.docx
Saved: scraped_articles\5_things_i_learned_in_2023_e7d4d8fb21ff.docx
Saved: scraped_articles\the_best_albums_and_eps_of_2023_c6f204557524.docx
Saved: scraped_articles\2023_expanding_our_evolution_014ca8abab4c.docx
Saved: scraped_articles\the_23_of_23_659d4735d1b8.docx
Saved: scraped_articles\my_top_5_articles_of_2023_d29ac218d69a.docx
Saved: scraped_articles\no_one_knows_anything_my_surname_is_no_one_and_i_want_dangerously_peppered_pork_for_the_new_year_e93f9914ee18.docx
Saved: scraped_articles\a_very_good_year_131afb1db443.docx
Saved: scraped_articles\out_

Saved: scraped_articles\farewell_2023_my_journey_to_publishing_a_childrens_book_for_my_nephew_cdedfb5e54fb.docx
Saved: scraped_articles\2023_journey_to_serenity_dance_with_serendipity_quest_for_clarity_dcd420571976.docx
Saved: scraped_articles\how_im_spending_the_end_of_2023_a46647458082.docx
Saved: scraped_articles\2023_year_in_review_five_years_of_ongoing_momentum_in_these_challenging_times_0700272e8730.docx
Saved: scraped_articles\my_2024_resolutions_wishlist_for_my_data_analytics_journey_bd7e0b5162e8.docx
Saved: scraped_articles\everybody_knows_this_is_nowhere_4a22b9cefed8.docx
Saved: scraped_articles\adblocks_2023_accomplishments_83f755eaf8e6.docx
Saved: scraped_articles\the_3_most_life_changing_lessons_i_learned_in_2023_e1f6e7bd8cc3.docx
Saved: scraped_articles\reflections_and_going_forward_a2e57cf10a3d.docx
Saved: scraped_articles\the_hottest_open_source_projects_of_2023_876ef479ff55.docx
Saved: scraped_articles\brú_finance_2023_in_review_93911e43f930.docx
Saved: scraped_article