In [19]:
import datetime

def get_search_links():
    base_url = "https://www.jhunewsletter.com/search?a=1&o=date"
    subpages = []

    # Get current date
    current_date = datetime.date.today()

    # Start from the year 2009
    year = 2009

    while year <= current_date.year:
        # Set the start and end dates for each search link
        start_date = datetime.date(year, 1, 1)
        end_date = datetime.date(year + 1, 1, 1) if year < current_date.year else current_date

        # Generate the search link using the start and end dates
        search_link = f"{base_url}&ts_month=01&ts_day=1&ts_year={year}&te_month=01&te_day=1&te_year={year+1}"

        # Append the search link to the list
        subpages.append(search_link)

        # Check if there are more than 50 subpages for the current year
        if start_date.year != end_date.year and (end_date - start_date).days > 50:
            # Generate additional links to cover the remaining subpages
            remaining_subpages = (end_date - start_date).days // 50

            for i in range(1, remaining_subpages + 1):
                # Generate the additional search links
                additional_link = f"{search_link}&p={i+1}"
                subpages.append(additional_link)

        year += 1

    return subpages

# Usage
subpages_list = get_search_links()
for link in subpages_list:
    print(link)

import datetime
import json
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

def parse_date(date_text):
    try:
        # Try parsing with "%m/%d/%y %I:%M%p" format
        date = datetime.datetime.strptime(date_text, "%m/%d/%y %I:%M%p")
    except ValueError:
        # Try parsing with "%m/%d/%y" format
        date = datetime.datetime.strptime(date_text, "%m/%d/%y")

    return date.strftime("%Y_%m_%d")

def get_subpage_links(subpages):
    subpage_links = []

    for subpage in subpages:
        response = requests.get(subpage)
        soup = BeautifulSoup(response.content, "html.parser")

        articles = soup.find_all("article", class_="clearfix")
        for article in articles:
            subpage_link = article.find("div", class_="smaller text-break").a["href"]
            subpage_links.append(subpage_link)

    return subpage_links

def get_search_links():
    base_url = "https://www.jhunewsletter.com/search?a=1&o=date"
    subpages = []

    # Get current date
    current_date = datetime.date.today()

    # Start from the year 2009
    year = 2009

    while year <= current_date.year:
        # Set the start and end dates for each search link
        start_date = datetime.date(year, 1, 1)
        end_date = datetime.date(year + 1, 1, 1) if year < current_date.year else current_date

        # Generate the search link using the start and end dates
        search_link = f"{base_url}&ts_month=01&ts_day=1&ts_year={year}&te_month=01&te_day=1&te_year={year+1}"

        # Append the search link to the list
        subpages.append(search_link)

        # Check if there are more than 50 subpages for the current year
        if start_date.year != end_date.year and (end_date - start_date).days > 50:
            # Generate additional links to cover the remaining subpages
            remaining_subpages = (end_date - start_date).days // 50

            for i in range(1, remaining_subpages + 1):
                # Generate the additional search links
                additional_link = f"{search_link}&p={i+1}"
                subpages.append(additional_link)

        year += 1

    return subpages

def get_article_links_and_dates(subpage_links):
    article_links_and_dates = {}

    with tqdm(total=len(subpage_links)) as pbar:
        for subpage_link in subpage_links:
            try:
                response = requests.get(subpage_link)
                response.raise_for_status()  # Raise an exception for invalid response status codes
                soup = BeautifulSoup(response.content, "html.parser")

                articles = soup.find_all("article", class_="clearfix")
                for article in articles:
                    title = article.find("h4").text.strip()
                    link = article.find("a")["href"]
                    date_text = article.find("div", class_="smaller").text.strip().strip("()")
                    date = parse_date(date_text)

                    if date >= "2009_01_01":
                        if date not in article_links_and_dates:
                            article_links_and_dates[date] = []

                        if link not in article_links_and_dates[date]:
                            article_links_and_dates[date].append(link)
                            print(f"Added article with date {date}: {link}")
            except (requests.exceptions.RequestException, requests.exceptions.HTTPError) as e:
                print(f"Skipped invalid URL: {subpage_link}. Error: {str(e)}")

            pbar.update(1)

    return article_links_and_dates

# Usage
subpages_list = get_search_links()
subpage_links = get_subpage_links(subpages_list)
article_links_and_dates_dict = get_article_links_and_dates(subpage_links)

# Save the article links and dates into a JSON file
output_file = "jh_article_pages.json"
with open(output_file, "w") as file:
    json.dump(article_links_and_dates_dict, file, indent=4)

print(f"Article links and dates saved to {output_file}.")

https://www.jhunewsletter.com/search?a=1&o=date&ts_month=01&ts_day=1&ts_year=2009&te_month=01&te_day=1&te_year=2010
https://www.jhunewsletter.com/search?a=1&o=date&ts_month=01&ts_day=1&ts_year=2009&te_month=01&te_day=1&te_year=2010&p=2
https://www.jhunewsletter.com/search?a=1&o=date&ts_month=01&ts_day=1&ts_year=2009&te_month=01&te_day=1&te_year=2010&p=3
https://www.jhunewsletter.com/search?a=1&o=date&ts_month=01&ts_day=1&ts_year=2009&te_month=01&te_day=1&te_year=2010&p=4
https://www.jhunewsletter.com/search?a=1&o=date&ts_month=01&ts_day=1&ts_year=2009&te_month=01&te_day=1&te_year=2010&p=5
https://www.jhunewsletter.com/search?a=1&o=date&ts_month=01&ts_day=1&ts_year=2009&te_month=01&te_day=1&te_year=2010&p=6
https://www.jhunewsletter.com/search?a=1&o=date&ts_month=01&ts_day=1&ts_year=2009&te_month=01&te_day=1&te_year=2010&p=7
https://www.jhunewsletter.com/search?a=1&o=date&ts_month=01&ts_day=1&ts_year=2009&te_month=01&te_day=1&te_year=2010&p=8
https://www.jhunewsletter.com/search?a=1&o=d

 92%|█████████▏| 2083/2260 [07:15<01:05,  2.69it/s]

Skipped invalid URL: . Error: Invalid URL '': No scheme supplied. Perhaps you meant https://?


 93%|█████████▎| 2105/2260 [07:30<01:22,  1.87it/s]

Skipped invalid URL: . Error: Invalid URL '': No scheme supplied. Perhaps you meant https://?


 94%|█████████▍| 2125/2260 [07:33<00:17,  7.74it/s]

Skipped invalid URL: . Error: Invalid URL '': No scheme supplied. Perhaps you meant https://?


 95%|█████████▍| 2145/2260 [07:36<00:13,  8.29it/s]

Skipped invalid URL: . Error: Invalid URL '': No scheme supplied. Perhaps you meant https://?


 96%|█████████▌| 2165/2260 [07:39<00:12,  7.44it/s]

Skipped invalid URL: . Error: Invalid URL '': No scheme supplied. Perhaps you meant https://?


 97%|█████████▋| 2185/2260 [07:42<00:10,  7.17it/s]

Skipped invalid URL: . Error: Invalid URL '': No scheme supplied. Perhaps you meant https://?


 98%|█████████▊| 2205/2260 [07:46<00:07,  7.36it/s]

Skipped invalid URL: . Error: Invalid URL '': No scheme supplied. Perhaps you meant https://?


 98%|█████████▊| 2225/2260 [07:49<00:04,  8.28it/s]

Skipped invalid URL: . Error: Invalid URL '': No scheme supplied. Perhaps you meant https://?


100%|██████████| 2260/2260 [08:14<00:00,  4.57it/s]

Article links and dates saved to jh_article_pages.json.



