In [1]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
from io import BytesIO
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
import os
import random
import re
import pandas as pd
import hashlib
from datetime import datetime
import urllib.parse
from json import dumps, loads
from shutil import copy2

In [54]:
# Listing page used to explore the markup of Newsweek article cards.
url = 'https://www.newsweek.com/sports/nfl'
# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of silently parsing
# an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'lxml')

In [55]:
# Print the link found in the <h2> heading of every article card on the page.
for article_card in soup.find_all('article'):
    heading = article_card.find('h2')
    # A card may have no <h2>, or an <h2> without a link; skip those instead
    # of failing on a missing element.
    anchor = heading.find('a') if heading is not None else None
    if anchor is not None and anchor.get('href'):
        print(anchor['href'])

In [51]:
# Display the fifth-from-last <article> element to inspect its markup.
soup.find_all('article')[-5]

<article class="l3"><div class="image">
<picture class="mapping-small lazysize" height="311" width="466">
<source data-srcset="https://d.newsweek.com/en/full/2471474/jd-vance.webp?w=466&amp;h=311&amp;f=924697145e3a586e8381b73daa8abafd" type="image/webp"><img alt="" class="mapping-small lazysize lazyload" data-src="https://d.newsweek.com/en/full/2471474/jd-vance.jpg?w=466&amp;h=311&amp;f=924697145e3a586e8381b73daa8abafd" height="311" width="466"/>
</source></picture>
<a class="zero" href="/jd-vance-hijack-maga-theocracy-post-liberalists-sofia-nelson-1953866">JD Vance working to "hijack" MAGA to push theocracy, ex-friend warns</a></div><div class="inner"><div class="category">
<a href="/us">U.S.</a></div><h2 class="h3"><a href="/jd-vance-hijack-maga-theocracy-post-liberalists-sofia-nelson-1953866">JD Vance working to "hijack" MAGA to push theocracy, ex-friend warns</a></h2><div class="summary">Sofia Nelson wrote in an op-ed that post-liberalists like Vance want to position themselves wit

In [40]:
def create_directories(base_url, categories, label='images'):
    """Create the output directory tree for one website and return its root.

    The layout is ``output/<label>/<host of base_url>/<category>``, with one
    sub-directory per entry in ``categories``.

    Args:
        base_url: URL of the site; only its network location (host) is used.
        categories: Iterable of ``(category_name, category_url)`` pairs. Only
            the name is used here.
        label: Name of the directory placed directly under ``output``.

    Returns:
        The relative path of the per-site base directory.
    """
    base_dir = os.path.join('output', label, urlparse(base_url).netloc)
    # exist_ok=True makes the call idempotent and avoids the race between
    # checking os.path.exists() and creating the directory.
    os.makedirs(base_dir, exist_ok=True)

    for category, _ in categories:
        os.makedirs(os.path.join(base_dir, category), exist_ok=True)

    return base_dir

In [73]:
def get_articles_links(url):
    """Collect the unique article links found on a Newsweek listing page.

    Every ``<article>`` card on the page is inspected; the first ``<a>``
    inside its ``<h2>`` and inside its ``<h3>`` heading (when present) each
    contribute one link.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        List of article URLs in first-seen order, without duplicates.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the request times out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'lxml')

    # dict keys preserve insertion order and give O(1) duplicate detection.
    links = {}
    for article_card in soup.find_all('article'):
        for heading_tag in ('h2', 'h3'):
            heading = article_card.find(heading_tag)
            anchor = heading.find('a') if heading is not None else None
            href = anchor.get('href') if anchor is not None else None
            if not href:
                # Heading missing, or without a usable link: skip it rather
                # than crash the whole listing.
                continue
            # urljoin resolves site-relative hrefs against the site root and
            # leaves already-absolute hrefs untouched (plain concatenation
            # produced a broken URL for those).
            links.setdefault(urljoin('https://newsweek.com', href), None)

    return list(links)

def article_scrapper(url):
    """Scrape headline, publication time and images for each linked article.

    Args:
        url: Listing page whose article links are collected with
            ``get_articles_links``.

    Returns:
        List of ``(headline, published, images)`` tuples, one per article
        that could be fetched and parsed. ``published`` is a naive
        ``datetime`` (the timezone abbreviation is dropped, not converted)
        and ``images`` is a list of unique ``(alt, src)`` pairs taken from
        the article body, SVG files excluded. Articles that fail to download
        or parse are skipped.
    """
    articles_links = get_articles_links(url)
    data = []

    def helper_scrapper(article_url):
        """Return ``(headline, published, images)`` for one article, or None on failure."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        try:
            # The request lives inside the try so that one unreachable or
            # slow article is skipped instead of aborting the whole scrape.
            response = requests.get(article_url, headers=headers, timeout=30)
            soup = BeautifulSoup(response.content, 'lxml')

            headline = soup.find('h1').text

            # Strip the "Published " prefix and the timezone abbreviation so
            # that strptime can parse the rest. EST is handled alongside EDT
            # on the assumption that the site switches abbreviation outside
            # daylight-saving time.
            datetime_str = soup.find('time', class_='article-time').text.replace('Published ', '')
            datetime_str = re.sub(r'\b(?:EDT|EST)\b', '', datetime_str).strip()
            published = datetime.strptime(datetime_str, '%b %d, %Y at %I:%M %p')

            images = []
            for div in soup.find_all('div', class_='content'):
                article = div.find('article')
                if article is None:
                    continue
                for img in article.find_all('img'):
                    img_alt = img.get('alt', '')
                    img_src = img.get('src', '')
                    # Ignore any query string or fragment when testing the
                    # extension, so "icon.svg?v=2" is recognised as an SVG too.
                    src_path = img_src.split('?', 1)[0].split('#', 1)[0].lower()
                    if src_path.endswith('svg'):
                        continue
                    if (img_alt, img_src) not in images:
                        images.append((img_alt, img_src))
                # Only the first content block that holds an <article> is used.
                break

            return headline, published, images
        except Exception:
            # Best effort: a page that cannot be fetched or does not match
            # the expected layout is dropped. KeyboardInterrupt is not an
            # Exception subclass, so the run can still be interrupted.
            return None

    for article_url in articles_links:
        result = helper_scrapper(article_url)
        if result is not None:
            data.append(result)

    return data

In [39]:
def get_latest_articles(data, n=10):
    """Return the ``n`` most recent records, keeping one record per headline.

    Args:
        data: Iterable of records whose first item is the headline and whose
            second item is a sortable publication time.
        n: Maximum number of records to return.

    Returns:
        A list of at most ``n`` records ordered from newest to oldest. When a
        headline occurs more than once, only its first occurrence is kept.
    """
    # setdefault keeps the first record seen for each headline; dicts retain
    # insertion order, so the relative order of survivors is unchanged.
    first_by_headline = {}
    for entry in data:
        first_by_headline.setdefault(entry[0], entry)

    ranked = list(first_by_headline.values())
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked[:n]

def download_image(img_url, save_dir, img_name, min_size=100):
    """Download one image and save it when it is large enough.

    The downloaded bytes are written unchanged; the image is decoded only to
    read its dimensions.

    Args:
        img_url: URL of the image. ``data:`` URIs are ignored.
        save_dir: Directory the file is written into.
        img_name: File name to use inside ``save_dir``.
        min_size: Minimum width and minimum height in pixels; anything
            smaller is not saved.

    Returns:
        None. Failures (network errors, undecodable data, write errors) are
        reported with a printed message and never raised.
    """
    try:
        if img_url.startswith('data:'):
            # Inline data URIs are not fetched.
            return

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # The timeout stops a stalled download from blocking a worker forever.
        response = requests.get(img_url, headers=headers, timeout=30)
        response.raise_for_status()
        img_data = response.content

        # The context manager releases the decoder's resources promptly.
        with Image.open(BytesIO(img_data)) as img:
            width, height = img.size

        if width >= min_size and height >= min_size:
            with open(os.path.join(save_dir, img_name), 'wb') as img_file:
                img_file.write(img_data)
    except Exception as exc:
        # Still best effort, but no longer silent, and Ctrl-C
        # (KeyboardInterrupt) is no longer swallowed.
        print(f'Skipping image {img_url}: {exc!r}')

def download_images(category_url, save_dir, data):
    """Write ``labels.csv`` for a category and download its images in parallel.

    Args:
        category_url: URL of the category page; relative image URLs are
            resolved against it.
        save_dir: Existing directory that receives ``labels.csv`` and the
            image files.
        data: Iterable of ``(headline, time, images)`` tuples where
            ``images`` is a list of ``(alt_text, image_url)`` pairs.

    Returns:
        None.

    Notes:
        Images are named ``image_<article>_<image>.jpg`` using 1-based
        positions in ``data`` and in each image list. Entries with an empty
        or ``data:`` URL are skipped, so numbers may have gaps.
        ``labels.csv`` lists every image submitted for download, including
        any that ``download_image`` ends up not saving.
    """
    import csv  # local import: the notebook's import cell does not provide csv

    rows = []
    jobs = []
    for article_no, (headline, _, images_list) in enumerate(data, start=1):
        for image_no, (alt_txt, img_url) in enumerate(images_list, start=1):
            if not img_url or img_url.startswith('data:'):
                continue
            img_name = f'image_{article_no}_{image_no}.jpg'
            rows.append((img_name, alt_txt, headline))
            jobs.append((urljoin(category_url, img_url), img_name))

    # csv.writer quotes fields containing commas, quotes or newlines, so the
    # text is stored intact instead of having its commas stripped.
    # newline='' is what the csv module expects of the file object.
    with open(os.path.join(save_dir, 'labels.csv'), 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(('image number', 'alt', 'article_heading'))
        writer.writerows(rows)

    # Downloads run on a small thread pool.
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(download_image, img_url, save_dir, img_name)
            for img_url, img_name in jobs
        ]
        for future in as_completed(futures):
            # Re-raises anything the worker let escape.
            future.result()

In [60]:
# Single article page used to work out how to extract headline, time and images.
url = 'https://www.newsweek.com/childhood-poverty-increase-rising-wages-1952044'
# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of silently parsing
# an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'lxml')

In [32]:
headline = soup.find('h1').text
# Strip the "Published " prefix and the timezone abbreviation so strptime can
# parse the remainder. EST is handled alongside EDT on the assumption that the
# site switches abbreviation outside daylight-saving time.
datetime_str = soup.find('time', class_='article-time').text.replace('Published ', '')
datetime_str = re.sub(r'\b(?:EDT|EST)\b', '', datetime_str).strip()
# Naive datetime: the timezone is dropped, not converted.
time = datetime.strptime(datetime_str, '%b %d, %Y at %I:%M %p')

In [67]:
# Show every <img> inside each "content" div that wraps an <article> element.
for div in soup.find_all('div', class_='content'):
    if div.find('article') is None:
        # Content blocks without an article body are not of interest here.
        continue
    print(div.find_all('img'))

[<img alt="Child" class="mapping-embed imgPhoto" height="800" id="i2469040" loading="lazy" src="https://d.newsweek.com/en/full/2469040/child.jpg?w=1200&amp;f=15fc08e00564ef4a5f86e238eb78d4f8" width="1200"/>, <img alt="" src="https://g.newsweek.com/www/images/NW_ICON_CommonGround.svg"/>]


In [41]:
# (category name, listing URL) pairs to scrape. Only the uncommented entries
# are processed; the commented ones are kept as a catalogue of other sections.
categories = [
    ('U.S. Politics', 'https://www.newsweek.com/topic/u.s.-politics'),
    ('Immigration', 'https://www.newsweek.com/topic/immigration'),
    # ('Crime', 'https://www.newsweek.com/topic/crime'),
    # ('2024 Election', 'https://www.newsweek.com/topic/2024-election'),
    # ('Russia-Ukraine War', 'https://www.newsweek.com/topic/russia-ukraine-war'),
    # ('Israeli-Palestinian Conflict', 'https://www.newsweek.com/topic/israeli-palestinian-conflict'),
    # ('China', 'https://www.newsweek.com/topic/china'),
    # ('North Korea', 'https://www.newsweek.com/topic/north-korea'),
    # ('Animals', 'https://www.newsweek.com/topic/animals'),
    # ('Space', 'https://www.newsweek.com/topic/space'),
    # ('Archaeology', 'https://www.newsweek.com/topic/archaeology'),
    # ('Lakes', 'https://www.newsweek.com/topic/lakes'),
    # ('Mpox', 'https://www.newsweek.com/topic/mpox'),
    # ('Medicare', 'https://www.newsweek.com/topic/medicare'),
    # ('Coronavirus', 'https://www.newsweek.com/topic/coronavirus'),
    # ('Family', 'https://www.newsweek.com/topic/family'),
    # ('Animals', 'https://www.newsweek.com/topic/animals'),
    # ('Parenting', 'https://www.newsweek.com/topic/parenting'),
    # ('Relationship', 'https://www.newsweek.com/topic/relationship'),
    # ('Air Travel', 'https://www.newsweek.com/topic/air-travel'),
    # ('Movies', 'https://www.newsweek.com/topic/movies'),
    # ('TV', 'https://www.newsweek.com/topic/tv'),
    # ('Royals', 'https://www.newsweek.com/topic/royal-family'),
    # ('Music', 'https://www.newsweek.com/topic/music'),
    # ('College', 'https://www.newsweek.com/topic/college'),
    # ('NFL', 'https://www.newsweek.com/sports/nfl'),
    # ('NBA', 'https://www.newsweek.com/sports/nba'),
    # ('MLB', 'https://www.newsweek.com/sports/mlb'),
    # ('Racing', 'https://www.newsweek.com/sports/racing'),
    # ('Golf', 'https://www.newsweek.com/sports/golf'),
    # ('Basketball', 'https://www.newsweek.com/topic/basketball'),
    # ('WWE', 'https://www.newsweek.com/topic/wwe'),
    # ('SUV', 'https://www.newsweek.com/topic/suv'),
    # ('Gender', 'https://www.newsweek.com/topic/gender'),
    # ('Autism', 'https://www.newsweek.com/topic/autism'),
    # ('Schools', 'https://www.newsweek.com/topic/schools')
]

# Site root; create_directories uses its host name for the output folder.
base_url = 'https://www.newsweek.com/'

# Creates output/images/<host>/<category> for every active category and
# returns the per-site base directory.
base_dir = create_directories(base_url, categories)

In [74]:
# Scrape each category, then download the images of its latest articles into
# that category's folder.
for category, category_url in tqdm(categories, desc='Downloading images for every category'):
    try:
        category_dir = os.path.join(base_dir, category)
        data = article_scrapper(category_url)
        print(data)
        download_images(category_url, category_dir, get_latest_articles(data))
    except Exception as exc:
        # Report which category failed and why, then move on to the next one.
        # Catching Exception rather than everything lets Ctrl-C stop the run.
        print(f'{category}: {exc!r}')

Downloading images for every category:  50%|█████     | 1/2 [00:53<00:53, 53.29s/it]



Downloading images for every category: 100%|██████████| 2/2 [01:48<00:00, 54.37s/it]

[("Donald Trump Vows To Terminate Harris' 'Phone App for Smuggling' Migrants", datetime.datetime(2024, 9, 15, 11, 17), [('Trump immigration', 'https://d.newsweek.com/en/full/2471640/trump-immigration.jpg?w=1200&f=7f74a71532cd8343ef2fefcda5000ade')]), ('Greg Abbott Defies Joe Biden With Pledge To Triple Razor Wire Defenses', datetime.datetime(2024, 9, 15, 10, 17), [(' Gov. Greg Abbott', 'https://d.newsweek.com/en/full/2471618/gov-greg-abbott.jpg?w=1200&f=f56024b463ca384309b8bc925ef32938')]), ("Woman Behind Springfield Haitian Immigrants 'Eating Pets' Rumor Speaks Out", datetime.datetime(2024, 9, 14, 11, 20), [('Springfield Police bomb threat', 'https://d.newsweek.com/en/full/2471409/springfield-police-bomb-threat.jpg?w=1200&f=5349060223f5eb9648416b929d454462')]), ('JD Vance Says Springfield Row Reveals Why He Flipped Script On Trump', datetime.datetime(2024, 9, 14, 10, 7), [('Senator J.D. Vance', 'https://d.newsweek.com/en/full/2471403/senator-jd-vance.jpg?w=1200&f=f310ad5478b7188195ffe


