In [46]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
from io import BytesIO
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
import os
import random
import re
import pandas as pd
import hashlib
from datetime import datetime
from json import dumps, loads
from shutil import copy2

In [6]:
def create_directories(base_url, categories, label='images'):
    """Create the output folder tree for one site and return the site folder.

    The layout is ``output/<label>/<host of base_url>/<category name>``.

    Args:
        base_url: URL of the site; only its network location (host) is used.
        categories: iterable of ``(category name, category url)`` pairs; only
            the name is used here.
        label: name of the folder placed directly under ``output``.

    Returns:
        The path of the site folder (``output/<label>/<host>``).
    """
    base_dir = os.path.join('output', label, urlparse(base_url).netloc)

    # exist_ok=True makes the call idempotent and removes the gap between an
    # "exists?" check and the creation that a separate test would leave open.
    os.makedirs(base_dir, exist_ok=True)

    for category, _ in categories:
        os.makedirs(os.path.join(base_dir, category), exist_ok=True)

    return base_dir

In [7]:
def get_articles_links(url):
    """Return the article URLs linked from a category listing page.

    The page is expected to contain ``<main class="main-content">`` with one
    ``<header class="info-header">`` per article, each holding an element of
    class ``title`` that wraps the article's ``<a href=...>``.

    Args:
        url: address of the listing page.

    Returns:
        A list of absolute article URLs in page order. The list is empty when
        the page has no ``main-content`` element.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status.
    """
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    main_tag = soup.find('main', class_='main-content')
    if main_tag is None:
        return []

    articles_links = []
    for header in main_tag.find_all('header', class_='info-header'):
        title = header.find(class_='title')
        anchor = title.find('a') if title is not None else None
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # Header without a usable link: skip it instead of failing the page.
            continue
        # urljoin keeps absolute links untouched and resolves relative,
        # protocol-relative and plain-http links correctly, which a simple
        # "starts with https" prefix test does not.
        articles_links.append(urljoin(url, href))

    return articles_links

def article_scrapper(url):
    """Scrape every article linked from a category listing page.

    Args:
        url: address of the category listing page.

    Returns:
        A list of ``(headline, published, images)`` tuples, one per article
        that could be parsed. ``published`` is a ``datetime`` and ``images``
        is a list of ``(alt text, src)`` pairs taken from the ``<img>`` tags
        inside the article body. Articles that cannot be fetched or parsed
        are left out.
    """
    articles_links = get_articles_links(url)
    data = []

    def parse_published(text):
        # The original code cut the text at "EDT" only, so every article was
        # rejected while the site printed "EST". Splitting on either marker
        # keeps the EDT behaviour and accepts EST as well.
        parts = re.split(r'EDT|EST', text)
        if len(parts) < 2:
            raise ValueError(f'no EDT/EST marker in timestamp: {text!r}')
        return datetime.strptime(parts[-2].strip(), "%B %d, %Y %I:%M%p")

    def helper_scrapper(article_url):
        try:
            # Fetching inside the try means one unreachable article is skipped
            # instead of aborting the whole category.
            response = requests.get(article_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'lxml')

            published = parse_published(soup.find('time').text)
            headline = soup.find('h1').text
            content = soup.find('div', class_='article-body')
            # A missing alt attribute becomes an empty string; images without
            # a src are useless downstream and are dropped here.
            images = [
                (img.get('alt') or '', img['src'])
                for img in content.find_all('img')
                if img.get('src')
            ]
            return headline, published, images
        except Exception:
            # Best effort: pages with an unexpected layout are skipped.
            # (Exception, not a bare except, so Ctrl-C still interrupts.)
            return None

    for article_url in articles_links:
        result = helper_scrapper(article_url)
        if result is not None:
            data.append(result)

    return data

In [85]:
def get_latest_articles(data, n=10):
    """Return the ``n`` most recent records with distinct headlines.

    Args:
        data: iterable of records whose element 0 is the headline and whose
            element 1 is the sortable publication time.
        n: maximum number of records to return.

    Returns:
        A list of records ordered newest first. When a headline occurs more
        than once, only its first occurrence in ``data`` is kept.
    """
    # Dicts keep insertion order, and setdefault never overwrites, so this
    # retains the first record seen for every headline.
    first_by_headline = {}
    for entry in data:
        first_by_headline.setdefault(entry[0], entry)

    newest_first = list(first_by_headline.values())
    newest_first.sort(key=lambda entry: entry[1], reverse=True)
    return newest_first[:n]

def download_image(img_url, save_dir, img_name):
    """Download one image and store it if it is at least 100x100 pixels.

    The downloaded bytes are written unchanged to ``save_dir/img_name``.
    ``data:`` URLs are ignored. This is a best-effort operation: any failure
    (network error, HTTP error status, undecodable image, write error) is
    reported through the return value instead of an exception.

    Args:
        img_url: absolute URL of the image.
        save_dir: existing directory to write into.
        img_name: file name to use inside ``save_dir``.

    Returns:
        True if the file was written, False otherwise.
    """
    try:
        if img_url.startswith('data:'):
            return False

        # The timeout stops a stalled download from blocking a worker forever.
        response = requests.get(img_url, timeout=30)
        response.raise_for_status()
        img_data = response.content

        # NOTE: `Image` must be PIL's Image module here. If another cell
        # rebinds that name, this call fails and every download is rejected.
        with Image.open(BytesIO(img_data)) as img:
            width, height = img.size

        # Only save images larger than 100x100 pixels
        if width < 100 or height < 100:
            return False

        with open(os.path.join(save_dir, img_name), 'wb') as img_file:
            img_file.write(img_data)
        return True
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return False

def download_images(category_url, save_dir, data):
    """Download the images of a set of articles and write ``labels.csv``.

    Images whose alt text starts with ``Fox News``, images without a URL and
    ``data:`` URLs are skipped. The downloads run on a pool of 10 threads.
    ``save_dir/labels.csv`` is rewritten on each call with the header
    ``image number,alt,article_heading`` followed by one row per image file
    that is present in ``save_dir`` after the downloads finish.

    Args:
        category_url: base URL used to resolve relative image URLs.
        save_dir: existing directory receiving the images and ``labels.csv``.
        data: iterable of ``(headline, published, images)`` tuples where
            ``images`` is a list of ``(alt text, image url)`` pairs.
    """
    def csv_safe(text):
        # labels.csv is plain comma-separated text without quoting, so commas
        # are removed (as before) and line breaks are flattened so a value
        # cannot spill into another column or row.
        return text.replace(',', '').replace('\r', ' ').replace('\n', ' ')

    pending = []  # (image file name, csv row, future) in submission order

    # parallelising the downloads to make it faster
    with ThreadPoolExecutor(max_workers=10) as executor:
        for article_no, (headline, _, images_list) in enumerate(data, start=1):
            for image_no, (alt_txt, img_url) in enumerate(images_list, start=1):
                if alt_txt.startswith('Fox News'):
                    continue
                if not img_url or img_url.startswith('data:'):
                    continue

                img_url = urljoin(category_url, img_url)
                # The separator keeps names unambiguous: without it article 1
                # image 12 and article 11 image 2 would both be "image_112".
                img_name = f'image_{article_no}_{image_no}.jpg'
                row = f'{img_name},{csv_safe(alt_txt)},{csv_safe(headline)}\n'
                future = executor.submit(download_image, img_url, save_dir, img_name)
                pending.append((img_name, row, future))

    # Leaving the with-block has waited for every download. Only images that
    # are on disk get a label row, so labels.csv never points to a file that
    # was rejected (too small) or failed to download.
    records = []
    for img_name, row, future in pending:
        future.result()
        if os.path.exists(os.path.join(save_dir, img_name)):
            records.append(row)

    # Explicit UTF-8: the file is later read back as UTF-8.
    with open(os.path.join(save_dir, 'labels.csv'), 'w', encoding='utf-8') as f:
        f.write('image number,alt,article_heading\n')
        f.writelines(records)


In [97]:
# Root of the site being scraped; its host name also names the output folder.
base_url = 'https://www.foxnews.com/'

# (display name, listing URL) for every section to process. The URLs are
# built from paths relative to base_url.
categories = [
    (name, urljoin(base_url, path))
    for name, path in (
        ("Business", "category/newsedge/business"),
        ("Sports", "sports"),
        ("Entertainment", "entertainment"),
        ("Science", "science"),
        ("World", "world"),
    )
]

# Create output/images/<host>/<category>/ before any download starts.
base_dir = create_directories(base_url, categories)

In [98]:
# Scrape each configured category and download the images of its latest articles.
for category, category_url in tqdm(categories, desc='Downloading images for every category'):
    category_dir = os.path.join(base_dir, category)
    try:
        # Use the listing URL configured for the category. Building it from
        # base_url and the display name ignores the configured path, e.g.
        # "category/newsedge/business" for Business.
        data = article_scrapper(category_url)
        download_images(category_url, category_dir, get_latest_articles(data))
    except Exception as exc:
        # Keep going with the remaining categories, but say what was skipped
        # and why. tqdm.write prints without breaking the progress bar.
        tqdm.write(f'Skipping {category}: {exc!r}')

Downloading images for every category: 100%|██████████| 5/5 [01:42<00:00, 20.40s/it]


In [45]:
import csv
import os
from openpyxl import Workbook
# Imported under an alias so this cell does not rebind the notebook-wide name
# `Image`, which download_image() relies on being PIL's Image module.
from openpyxl.drawing.image import Image as XLImage
from PIL import Image as PILImage

# Create a new Excel workbook
wb = Workbook()
ws = wb.active

# Open the CSV file
csv_file = 'labels.csv'  # Replace with your CSV file path
image_folder = 'output/images/www.foxnews.com/business/'  # Replace with the folder where images are stored

# The resized copies are written to temp/, so make sure it exists.
os.makedirs('temp', exist_ok=True)

# Read CSV data and write to Excel
with open(csv_file, newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    header = next(reader, None)  # None when the CSV is completely empty
    if header is not None:
        ws.append(header)  # Write header to Excel

    for row in reader:
        if not row:
            # Blank line in the CSV: there is no file name to look up.
            continue

        ws.append(row)  # Write the CSV row to Excel

        # The image file name is in the first column
        image_path = os.path.join(image_folder, row[0])

        # Check if the image exists
        if os.path.exists(image_path):
            resized_image_path = f'temp/resized_{row[0]}'

            # The with-block closes the source file once the copy is saved.
            with PILImage.open(image_path) as img:
                img.thumbnail((372, 238))  # Shrink in place, keeping the aspect ratio

                # The output format follows the file extension. JPEG cannot
                # store palette or alpha modes, so convert those first.
                if resized_image_path.lower().endswith(('.jpg', '.jpeg')) and img.mode not in ('RGB', 'L', 'CMYK'):
                    img = img.convert('RGB')

                img.save(resized_image_path)

            # Insert the image into the Excel sheet
            img_to_insert = XLImage(resized_image_path)
            ws.add_image(img_to_insert, f'A{ws.max_row}')  # Insert at the current row (A column)
        else:
            print(f"Image not found: {image_path}")

# Save the Excel file
wb.save('output_with_images.xlsx')


In [64]:
# Load the label table and list every unordered pair of its rows, each row
# converted to a plain list of its column values.
df = pd.read_csv('labels.csv')
n = df.shape[0]

pairs = [
    (list(df.iloc[first]), list(df.iloc[second]))
    for first in range(n)
    for second in range(first + 1, n)
]

In [93]:
def create_file(base_dir, image_url_root='https://raw.githubusercontent.com/ayainfida/news-scrapper/main/output/images/www.foxnews.com'):
    """Write one image-pair CSV per category folder found in ``base_dir``.

    For every sub-folder holding a ``labels.csv`` and at least one other
    file, every unordered pair of label rows is written to
    ``www.foxnews.com_pairs_<category>.csv`` in the current working
    directory. Each pair takes three lines:

        <pair number>,<headline 1>,<headline 2>
        ,<image url 1>,<image url 2>
        ,<alt text 1>,<alt text 2>

    Args:
        base_dir: folder containing one sub-folder per category; a trailing
            path separator is optional.
        image_url_root: URL prefix under which ``<category>/<image name>``
            is published.
    """
    from itertools import combinations

    for category in os.listdir(base_dir):
        category_dir = os.path.join(base_dir, category)
        # Skip stray files, and folders holding at most labels.csv.
        if not os.path.isdir(category_dir) or len(os.listdir(category_dir)) <= 1:
            continue
        try:
            # dtype=str with keep_default_na=False keeps every cell as the
            # text that was written; an empty alt text would otherwise be
            # read as NaN and end up in the output as "nan".
            labels = pd.read_csv(
                os.path.join(category_dir, 'labels.csv'),
                dtype=str,
                keep_default_na=False,
            )
            rows = list(labels.itertuples(index=False, name=None))

            with open(f'www.foxnews.com_pairs_{category}.csv', 'w', encoding='utf-8') as f:
                # combinations() yields (0,1), (0,2), ..., (1,2), ... which is
                # the same order as two nested index loops.
                for pair_no, (first, second) in enumerate(combinations(rows, 2), start=1):
                    img1, alt1, headline1 = first
                    img2, alt2, headline2 = second
                    f.write(f'{pair_no},{headline1},{headline2}\n')
                    f.write(f',{image_url_root}/{category}/{img1},{image_url_root}/{category}/{img2}\n')
                    f.write(f',{alt1},{alt2}\n')
        except Exception as exc:
            # One unreadable or malformed category must not stop the others,
            # but it should not vanish silently either.
            print(f'Skipping {category}: {exc!r}')
            continue

In [99]:
# Generate the pair files for every scraped Fox News category; each one is
# written to the current working directory as www.foxnews.com_pairs_<category>.csv.
create_file('output/images/www.foxnews.com/')

In [100]:
def count_pairs(path):
    """Return the number of image pairs stored in a pairs CSV.

    The file is expected to hold three lines per pair, of which only the
    first starts with the running pair number (the layout written by
    ``create_file``).

    Raises:
        OSError: the file cannot be read.
        ValueError: the file is empty, malformed, or lacks a pair number.
        IndexError: the file has fewer than three lines.
    """
    # header=None: the file has no header line. Letting pandas consume the
    # first line as one makes a single-pair file look one row too short.
    table = pd.read_csv(path, header=None)
    # The third line from the end opens the last pair, so its first column
    # is the total number of pairs.
    return int(table.iloc[-3, 0])


num_pairs = []

for file in os.listdir():
    # Only look at the pair files; anything else in the working directory
    # (notebooks, labels.csv, workbooks, ...) is not a pairs table.
    if not (file.startswith('www.foxnews.com_pairs_') and file.endswith('.csv')):
        continue
    try:
        num_pairs.append((count_pairs(file), file))
    except (OSError, ValueError, IndexError) as exc:
        print(f'Could not count pairs in {file}: {exc!r}')

In [101]:
# Rank the pair files from fewest to most pairs and display the ranking.
sorted_array = list(num_pairs)
sorted_array.sort(key=lambda entry: entry[0])
sorted_array

[(276, 'www.foxnews.com_pairs_Executive.csv'),
 (351, 'www.foxnews.com_pairs_Crime.csv'),
 (406, 'www.foxnews.com_pairs_Immigration.csv'),
 (496, 'www.foxnews.com_pairs_Golf.csv'),
 (703, 'www.foxnews.com_pairs_Faith.csv'),
 (703, 'www.foxnews.com_pairs_Education.csv'),
 (703, 'www.foxnews.com_pairs_Disasters.csv'),
 (820, 'www.foxnews.com_pairs_World.csv'),
 (861, 'www.foxnews.com_pairs_Environment.csv'),
 (946, 'www.foxnews.com_pairs_Economy.csv'),
 (990, 'www.foxnews.com_pairs_House.csv'),
 (1378, 'www.foxnews.com_pairs_Business.csv'),
 (2926, 'www.foxnews.com_pairs_Entertainment.csv')]

In [106]:
import shutil

# Remove the scraped category folders that have no pairs file in the working
# directory.
images_root = 'output/images/www.foxnews.com'
pairs_prefix, pairs_suffix = 'www.foxnews.com_pairs_', '.csv'

# Category name = file name without the fixed prefix and the extension.
# Splitting on "_" instead would truncate category names that contain one.
cat = [
    file[len(pairs_prefix):-len(pairs_suffix)]
    for file in os.listdir()
    if file.startswith(pairs_prefix) and file.endswith(pairs_suffix)
]

if not cat:
    # Without any pairs file every folder would count as "unused" and be
    # deleted, e.g. when the cell is run from the wrong working directory.
    print('No pairs files found in the working directory; leaving all category folders in place.')
else:
    for entry in os.listdir(images_root):
        entry_path = os.path.join(images_root, entry)
        # rmtree only works on directories; stray files are left alone.
        if entry not in cat and os.path.isdir(entry_path):
            shutil.rmtree(entry_path)