In [1]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
from io import BytesIO
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
import os
import random
import re
import pandas as pd
import hashlib
from datetime import datetime
from json import dumps, loads
from shutil import copy2

In [78]:
# Section page used to explore the listing markup interactively.
url = 'https://nypost.com/food-and-drink/'

# Download the page and parse the raw response bytes with the lxml parser.
response = requests.get(url)
soup = BeautifulSoup(response.content, 'lxml')

In [79]:
# The article listing is looked up inside <div class="section__content">.
main_content = soup.find('div', class_='section__content')

# Probe for headline containers by their full class string; the recorded run
# of this cell displayed an empty list for this selector.
main_content.find_all('div', class_='story__headline headline headline--archive')

[]

In [80]:
# Walk every <div> in the listing and print the first link found in each div
# whose class list is exactly ['story__text'] or ['story__inner'].
for tag in main_content.find_all('div'):
    class_name = tag.get('class', [])
    # The comparison is list equality, so a div carrying any extra class is skipped.
    if class_name in [['story__text'], ['story__inner']] and tag.find('a'):
        link = tag.find('a')['href']
        print(link)

https://nypost.com/2024/06/12/lifestyle/ny-post-debuts-big-apple-schmear-cream-cheese-with-popup-bagel/
https://nypost.com/2024/09/06/lifestyle/from-flamin-hot-tacos-to-fritos-sundaes-the-weirdest-foods-nfl-stadiums-are-offering-this-season/
https://nypost.com/2024/09/06/lifestyle/mampms-bringing-back-discontinued-flavor-9-years-after-being-stripped-from-shelves-finally/
https://nypost.com/2024/09/06/lifestyle/doctor-who-was-overweight-reveals-5-weight-loss-tips-on-tiktok/
https://nypost.com/2024/09/05/business/red-lobster-to-exit-chapter-11-soon-after-judge-approves-sale-to-new-owner/
https://nypost.com/2024/09/05/lifestyle/certain-alcoholic-drinks-can-worsen-hangxiety-experts/
https://nypost.com/2024/09/05/lifestyle/is-gen-z-killing-the-wine-industry/
https://nypost.com/2024/09/05/lifestyle/chipotle-drops-bizarre-halloween-costumes-for-burritos-lovers/
https://nypost.com/2024/09/05/lifestyle/theres-actually-a-reason-hot-dogs-and-buns-arent-sold-in-the-same-number/
https://nypost.com/

In [73]:
def create_directories(base_url, categories, label='images'):
    """Create the output folder tree for one website and return its root.

    The layout is ``output/<label>/<netloc of base_url>/<category name>``.

    Args:
        base_url: URL of the site; only its network location (host part) is
            used, as the name of the site directory.
        categories: Iterable of ``(category_name, category_url)`` pairs; one
            sub-directory is created per category name.
        label: Name of the folder placed directly under ``output``.

    Returns:
        The relative path of the site directory, ``output/<label>/<netloc>``.
    """
    base_dir = os.path.join('output', label, urlparse(base_url).netloc)
    # exist_ok=True keeps repeated runs idempotent without a separate
    # exists() check, which could race with another process or thread.
    os.makedirs(base_dir, exist_ok=True)

    for category, _ in categories:
        os.makedirs(os.path.join(base_dir, category), exist_ok=True)

    return base_dir

In [89]:
def get_articles_links(url, timeout=30):
    """Collect the unique article links listed on a section page.

    Args:
        url: URL of the section (category) page to fetch.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``href`` values, in page order and without duplicates, taken
        from the first ``<a>`` of every ``<div>`` whose class list is exactly
        ``['story__text']`` or ``['story__inner']``. The list is empty when the
        page has no ``<div class="section__content">`` container.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'lxml')
    main_content = soup.find('div', class_='section__content')
    if main_content is None:
        # The expected container is missing (layout change or error page):
        # report "no articles" instead of failing with AttributeError.
        return []

    story_classes = (['story__text'], ['story__inner'])
    articles_links = []
    seen = set()  # constant-time duplicate check; the list keeps page order

    for tag in main_content.find_all('div'):
        if tag.get('class', []) not in story_classes:
            continue
        anchor = tag.find('a')
        # An <a> without href is skipped rather than raising KeyError.
        link = anchor.get('href') if anchor is not None else None
        if link and link not in seen:
            seen.add(link)
            articles_links.append(link)

    return articles_links

def _parse_article_time(datetime_str):
    """Parse a timestamp such as ``'Sep. 6, 2024, 10:08 am'`` into a datetime."""
    # %I (12-hour clock) is required for %p to have any effect; with %H the
    # am/pm marker is consumed but ignored, so "3:08 pm" would parse as 03:08.
    formats = (
        "%b. %d, %Y, %I:%M %p",   # abbreviated month with a period
        "%b %d, %Y, %I:%M %p",    # abbreviated month without a period
        "%B %d, %Y, %I:%M %p",    # full month name
    )
    cleaned = datetime_str.strip()
    for fmt in formats:
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
    raise ValueError(f'unrecognised article timestamp: {datetime_str!r}')


def _scrape_article(url):
    """Return ``(headline, time, images)`` for one article page, or ``None`` on failure."""
    try:
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'lxml')

        headline = soup.find('h1').text.replace('\n', '').replace('\t', '')
        datetime_str = (
            soup.find('div', class_='date--updated__item')
            .find_all('span')[-1]
            .text.replace('a.m. ET', 'am')
            .replace('p.m. ET', 'pm')
        )
        published = _parse_article_time(datetime_str)

        featured_image = soup.find('figure').find('img')
        # Default to '' so a missing alt/src never yields None downstream.
        images = [(featured_image.get('alt', ''), featured_image.get('src', ''))]
        related = set()

        content = soup.find('div', class_='single__content entry-content m-bottom')
        for div in content.find_all('div'):
            # Images inside an "inline-module__inner" div are excluded from the result.
            is_related = div.get('class') in [['inline-module__inner']]
            for img in div.find_all('img'):
                entry = (img.get('alt', ''), img.get('src', ''))
                if entry not in images:
                    images.append(entry)
                if is_related:
                    related.add(entry)

        # Filter instead of using a set difference so the page order (featured
        # image first) is preserved and file numbering is reproducible.
        return headline, published, [entry for entry in images if entry not in related]
    except Exception:
        # Best effort: a page that cannot be fetched or does not match the
        # expected layout is skipped. Unlike a bare ``except``, this still
        # lets KeyboardInterrupt / SystemExit stop the run.
        return None


def article_scrapper(url):
    """Scrape every article linked from a section page.

    Args:
        url: URL of the section (category) page.

    Returns:
        A list of ``(headline, time, images)`` tuples, one per article that
        could be parsed. ``headline`` is the ``<h1>`` text with newlines and
        tabs removed, ``time`` is a ``datetime`` parsed from the last
        ``<span>`` of the ``date--updated__item`` div, and ``images`` is a list
        of ``(alt, src)`` pairs in page order, starting with the featured
        image and excluding images found in ``inline-module__inner`` divs.
        Articles that fail to download or parse are left out.
    """
    data = []

    for article_url in get_articles_links(url):
        result = _scrape_article(article_url)
        if result is not None:
            data.append(result)

    return data

In [95]:
def get_latest_articles(data, n=10):
    """Return the ``n`` most recent records, keeping one record per headline.

    Args:
        data: Iterable of records whose first item is the headline and whose
            second item is a sortable timestamp.
        n: Maximum number of records to return.

    Returns:
        A list of records sorted by timestamp, newest first. When a headline
        occurs more than once, only its first occurrence in ``data`` is kept.
    """
    first_by_headline = {}
    for record in data:
        # setdefault stores the first record seen for a headline and ignores repeats.
        first_by_headline.setdefault(record[0], record)

    newest_first = list(first_by_headline.values())
    newest_first.sort(key=lambda rec: rec[1], reverse=True)
    return newest_first[:n]

def download_image(img_url, save_dir, img_name, min_size=100, timeout=30):
    """Download one image and save it when it is large enough.

    Args:
        img_url: Absolute URL of the image. Empty values and ``data:`` URIs
            are skipped.
        save_dir: Directory the file is written to.
        img_name: File name to use inside ``save_dir``.
        min_size: Minimum width and height, in pixels, for the image to be kept.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        ``True`` when the image was written to disk, ``False`` when it was
        skipped (no usable URL, too small) or the download/decoding failed.
        Failures are reported on stdout and never raised, so one bad image
        cannot abort a batch.
    """
    if not img_url or img_url.startswith('data:'):
        return False

    try:
        response = requests.get(img_url, timeout=timeout)
        response.raise_for_status()
        img_data = response.content

        # Open the bytes only to read the dimensions; the original bytes are
        # what gets written, so the file is not re-encoded.
        with Image.open(BytesIO(img_data)) as img:
            width, height = img.size

        if width < min_size or height < min_size:
            return False

        with open(os.path.join(save_dir, img_name), 'wb') as img_file:
            img_file.write(img_data)
        return True
    except Exception as exc:
        # Still best effort, but visible, and no longer catching
        # KeyboardInterrupt / SystemExit like the bare ``except`` did.
        print(f'Skipping image {img_url}: {exc}')
        return False

def download_images(category_url, save_dir, data):
    """Download the images of several articles and write ``labels.csv``.

    Args:
        category_url: URL of the section page; relative image URLs are
            resolved against it.
        save_dir: Directory receiving the images and ``labels.csv``.
        data: Iterable of ``(headline, time, images)`` tuples where ``images``
            is a list of ``(alt, url)`` pairs.

    Side effects:
        Images are saved as ``image_<article>_<index>.jpg`` (both 1-based; the
        separator keeps e.g. article 1 / image 11 distinct from article 11 /
        image 1). ``labels.csv`` is rewritten with the header
        ``image number,alt,article_heading`` and one row per image that exists
        on disk after the downloads finish, so images that failed to download
        or were rejected as too small are not listed. Commas are stripped from
        the alt text and headline, and line breaks are replaced by spaces, so
        every row keeps exactly three fields.
    """
    def _csv_field(text):
        # None-safe: an image without alt text becomes an empty field.
        return (text or '').replace(',', '').replace('\r', ' ').replace('\n', ' ')

    jobs = []  # (future, csv row, image path)

    # Run the downloads in parallel to make them faster.
    with ThreadPoolExecutor(max_workers=10) as executor:
        for x, (headline, _, images_list) in enumerate(data):
            for i, (alt_txt, img_url) in enumerate(images_list):
                if not img_url or img_url.startswith('data:'):
                    continue

                img_url = urljoin(category_url, img_url)
                img_name = f'image_{x+1}_{i+1}.jpg'
                img_path = os.path.join(save_dir, img_name)

                # labels.csv is rebuilt from scratch, so a file left under this
                # name by an earlier run must not be mistaken for this image.
                if os.path.isfile(img_path):
                    os.remove(img_path)

                row = f'{img_name},{_csv_field(alt_txt)},{_csv_field(headline)}\n'
                future = executor.submit(download_image, img_url, save_dir, img_name)
                jobs.append((future, row, img_path))

        # Wait for every download; result() re-raises anything a worker raised.
        for future in as_completed([job[0] for job in jobs]):
            future.result()

    # utf-8 matches the default encoding pandas uses when reading the file back.
    with open(os.path.join(save_dir, 'labels.csv'), 'w', encoding='utf-8') as f:
        f.write('image number,alt,article_heading\n')
        f.writelines(row for _, row, img_path in jobs if os.path.isfile(img_path))

In [47]:

# Scratch version of the image collection used in article_scrapper.
# NOTE(review): `soup` is not assigned in this cell; it presumably holds a
# parsed article page left in the kernel by another cell — confirm before re-running.
featured_image = soup.find('figure').find('img')
# The first <img> inside the first <figure> is recorded first.
images = [(featured_image.get('alt'), featured_image.get('src'))]
related = []

# Visit every <div> inside the article body and every <img> inside each of them.
for div in soup.find('div', class_='single__content entry-content m-bottom').find_all('div'):
    for img in div.find_all('img'):
        img_alt = img.get('alt', '')  # Safely get the 'alt' attribute
        img_src = img.get('src', '')  # Safely get the 'src' attribute
        # Record each (alt, src) pair only once.
        if  (img_alt, img_src) not in images:
            images.append((img_alt, img_src))
        # Also remember images whose enclosing div has exactly the class
        # 'inline-module__inner', so they can be subtracted afterwards.
        if div.get('class') in [['inline-module__inner']]:
            related.append((img_alt, img_src))

In [48]:
# Display every collected (alt, src) pair, including those also stored in `related`.
images

[('How finance bros scored a luxury penthouse on a starter salary',
  'https://nypost.com/wp-content/uploads/sites/2/2024/09/gimme-ph-109-88199944.jpg?quality=75&strip=all&w=744'),
 ('The luxury penthouse comes with lots of common space',
  'https://nypost.com/wp-content/uploads/sites/2/2024/09/gimme-ph-109-88199943.jpg?w=1024'),
 ('Stairs lead to a main bedroom suite and a private 900 square foot rooftop terrace',
  'https://nypost.com/wp-content/uploads/sites/2/2024/09/gimme-ph-109-88199947.jpg?w=1024'),
 ('Southwest Ranches',
  'https://nypost.com/wp-content/uploads/sites/2/2024/09/88869312-1.jpg?quality=75&strip=all&w=171&h=114&crop=1'),
 ('Flower bandit Lewis Miller lists colorful West Palm Beach home for $2.9M',
  'https://nypost.com/wp-content/uploads/sites/2/2024/09/88869467.jpg?quality=75&strip=all&w=171&h=114&crop=1'),
 ('',
  'https://nypost.com/wp-content/uploads/sites/2/2024/09/newspress-collage-2456zqsns-1725488954145.jpg?quality=75&strip=all&1725474609&w=171&h=114&crop=1

In [49]:
# Pairs in `images` that are not in `related`; converting to sets discards the list order.
set(images) - set(related)

{('How finance bros scored a luxury penthouse on a starter salary',
  'https://nypost.com/wp-content/uploads/sites/2/2024/09/gimme-ph-109-88199944.jpg?quality=75&strip=all&w=744'),
 ('One of three bedrooms in the penthouse',
  'https://nypost.com/wp-content/uploads/sites/2/2024/09/gimme-ph-109-88199942.jpg?w=1024'),
 ('Stairs lead to a main bedroom suite and a private 900 square foot rooftop terrace',
  'https://nypost.com/wp-content/uploads/sites/2/2024/09/gimme-ph-109-88199947.jpg?w=1024'),
 ('The luxury penthouse comes with lots of common space',
  'https://nypost.com/wp-content/uploads/sites/2/2024/09/gimme-ph-109-88199943.jpg?w=1024'),
 ("The open chef's kitchen",
  'https://nypost.com/wp-content/uploads/sites/2/2024/09/gimme-ph-109-88199946.jpg?w=1024')}

In [69]:
# NOTE(review): `soup` is not assigned in this cell; it presumably holds a
# parsed article page left in the kernel by another cell — confirm before re-running.
# Headline: text of the first <h1>, with newlines and tabs removed.
headline = soup.find('h1').text.replace('\n', '').replace('\t', '')
# Timestamp: last <span> of the "date--updated__item" div, with the
# "a.m. ET" / "p.m. ET" suffix rewritten to the am/pm form strptime understands.
datetime_str = soup.find('div', class_='date--updated__item').find_all('span')[-1].text.replace('a.m. ET', 'am').replace('p.m. ET', 'pm')
# %I (12-hour clock) is required for %p to take effect; with %H the am/pm
# marker is parsed but ignored, so afternoon times would come out 12 hours early.
time = datetime.strptime(datetime_str, "%b. %d, %Y, %I:%M %p")
print(time)

2024-09-06 10:08:00


In [63]:
# Check the timestamp format on a literal example.
# %I (12-hour clock) is required for %p to take effect; with %H the am/pm
# marker is parsed but ignored, so afternoon times would come out 12 hours early.
datetime_str = "Sep. 6, 2024, 10:08 am"
datetime_obj = datetime.strptime(datetime_str, "%b. %d, %Y, %I:%M %p")

print(datetime_obj)

2024-09-06 10:08:00


In [96]:
# (display name, section URL) pairs. create_directories uses the display name
# as the folder name; the URL is what gets scraped.
categories = [
    ("Food & Drink", "https://nypost.com/food-and-drink/"),
    ("Tech", "https://nypost.com/tech/"),
    ("Movies", "https://nypost.com/movies/"),
    ("Real Estate", "https://nypost.com/real-estate/"),
    ("MLB", "https://nypost.com/mlb/")
]

# Only the host part of this URL is used by create_directories, as the name of
# the site folder. Note it carries a "www." prefix that the section URLs above do not.
base_url = 'https://www.nypost.com/'

# Creates output/images/<host>/<category> and keeps the site folder path.
base_dir = create_directories(base_url, categories)

In [97]:
# For each category: scrape its section page, keep the latest articles
# (get_latest_articles defaults to 10) and download their images into the
# category folder. tqdm shows one progress step per category.
for category, category_url in tqdm(categories, desc='Downloading images for every category'):
    category_dir = os.path.join(base_dir, category)
    data = article_scrapper(category_url)
    download_images(category_url, category_dir, get_latest_articles(data))

Downloading images for every category: 100%|██████████| 5/5 [02:18<00:00, 27.63s/it]


In [None]:
def create_file(base_dir, raw_url_root='https://raw.githubusercontent.com/ayainfida/news-scrapper/main/output/images'):
    """Write one ``pairs_<category>.csv`` per category listing every image pair.

    For each sub-directory of ``base_dir`` that contains a ``labels.csv``
    (columns: image name, alt text, article heading), every unordered pair of
    rows is written to ``pairs_<category>.csv`` in the current working
    directory as three lines::

        <pair number>,<heading 1>,<heading 2>
        ,<image URL 1>,<image URL 2>
        ,<alt 1>,<alt 2>

    Args:
        base_dir: Site directory holding the category folders, with or
            without a trailing separator. Its last path component is used as
            the site segment of the image URLs.
        raw_url_root: URL prefix under which the site folders are published.

    Entries of ``base_dir`` without a ``labels.csv`` are ignored.
    """
    # Local import: only urljoin/urlparse are imported at file level.
    from urllib.parse import quote

    # Derive the site segment from base_dir instead of hard-coding one site,
    # so the URLs point at the folder that was actually scraped.
    site = os.path.basename(os.path.normpath(base_dir))

    for category in os.listdir(base_dir):
        # os.path.join supplies the separator that plain string concatenation
        # dropped when base_dir has no trailing slash.
        labels_path = os.path.join(base_dir, category, 'labels.csv')
        if not os.path.isfile(labels_path):
            continue

        # Read everything as text and keep empty fields empty, so a missing
        # alt text is written back as '' rather than 'nan'.
        df = pd.read_csv(labels_path, dtype=str, keep_default_na=False)
        rows = df.values.tolist()

        # Category names may contain spaces or '&', which must be
        # percent-encoded inside a URL.
        url_prefix = f'{raw_url_root}/{site}/{quote(category)}'

        with open(f'pairs_{category}.csv', 'w', encoding='utf-8') as f:
            pair_number = 0
            for i, (img1, alt1, headline1) in enumerate(rows):
                for img2, alt2, headline2 in rows[i + 1:]:
                    pair_number += 1
                    f.write(f'{pair_number},{headline1},{headline2}\n')
                    f.write(f',{url_prefix}/{img1},{url_prefix}/{img2}\n')
                    f.write(f',{alt1},{alt2}\n')