In [1]:
!pip install deep_translator

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep_translator
Successfully installed deep_translator-1.11.4


In [2]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
from io import BytesIO
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
import os
import random
import re
import pandas as pd
import hashlib
from deep_translator import GoogleTranslator
from datetime import datetime
import urllib.parse
from json import dumps, loads
from shutil import copy2

In [3]:
# Colab-only setup: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Work from the project's Scraping folder so that the relative paths used
# further down ('output/...', '../links/...') resolve against it.
os.chdir('/content/drive/MyDrive/research-similarity/Scraping')

Mounted at /content/drive


In [8]:
def translate(text):
    """Translate Spanish ``text`` to English, removing commas and newlines.

    Commas and newlines are stripped from the translated text (presumably so
    it can be embedded in the hand-written comma-separated label files).

    Args:
        text: The Spanish text to translate.

    Returns:
        The cleaned English translation, or ``text`` unchanged when the
        translation fails for any reason (network error, invalid input,
        empty result, ...).
    """
    try:
        translated = GoogleTranslator(source='es', target='en').translate(text=text)
        return translated.replace(',', '').replace('\n', '')
    except Exception:
        # Best effort: fall back to the untranslated text. Only ``Exception``
        # is caught so Ctrl-C / kernel interrupts are not swallowed.
        return text

def create_directories(base_url, categories, label='images'):
    """Create the output folder tree ``output/<label>/<site host>/<category>``.

    Args:
        base_url: URL of the website; only its network location (host) is
            used, as the name of the site folder.
        categories: Iterable of ``(category_name, category_url)`` pairs; one
            sub-folder is created per ``category_name``.
        label: Name of the folder placed between ``output`` and the site
            folder.

    Returns:
        The path of the site folder (``output/<label>/<site host>``).
    """
    base_dir = os.path.join('output', label, urlparse(base_url).netloc)

    # exist_ok=True makes the call idempotent without a separate
    # "does it exist?" check that could race with another process.
    os.makedirs(base_dir, exist_ok=True)
    for category, _ in categories:
        os.makedirs(os.path.join(base_dir, category), exist_ok=True)

    return base_dir

In [67]:
def get_articles_links(category):
    """Load the stored article links of ``category``.

    Reads ``../links/<category>.json`` (relative to the current working
    directory) and returns its decoded JSON content.
    """
    links_path = f'../links/{category}.json'
    with open(links_path, 'r') as links_file:
        raw_json = links_file.read()
    return loads(raw_json)

def article_scrapper(category, timeout=30):
    """Scrape headline, publication time and body images of a category's articles.

    Args:
        category: Category name; the article URLs are obtained from
            ``get_articles_links(category)``.
        timeout: Seconds to wait for each article request before giving up.

    Returns:
        A list with one ``(headline, published_at, images)`` tuple per article
        that could be fetched and parsed. ``headline`` is the ``<h1>`` text
        passed through ``translate``, ``published_at`` is a ``datetime`` and
        ``images`` is a list of unique ``(alt_text, src)`` tuples in page
        order. Articles that fail to download or parse are skipped.
    """
    def helper_scrapper(url):
        # Fetch and parse a single article; returns None when the request
        # fails or the page does not have the expected structure.
        try:
            # The request lives inside the try block so that one unreachable
            # article does not abort the whole category, and it carries a
            # timeout so a stalled connection cannot hang the run.
            response = requests.get(url, timeout=timeout)
            soup = BeautifulSoup(response.content, 'lxml')

            headline = translate(soup.find('h1').text)

            # Drop the last four characters (presumably a time-zone suffix --
            # TODO confirm against the site) and every dot (e.g. "a.m." ->
            # "am") so the text matches the strptime format below.
            datetime_str = soup.find('span', class_='sharebar-article-date').text[:-4].replace(".", "")
            published_at = datetime.strptime(datetime_str, "%d %b, %Y %I:%M %p")

            images = []
            for img in soup.find('div', class_='body-article').find_all('img'):
                img_alt = translate(img.get('alt', ''))
                # Keep only the part of the URL before '&smart'. split()
                # leaves the URL intact when the marker is absent, whereas
                # slicing with str.find() (which returns -1 in that case)
                # would cut off the URL's last character.
                img_src = img.get('src', '').split('&smart', 1)[0]

                if (img_alt, img_src) not in images:
                    images.append((img_alt, img_src))

            return headline, published_at, images
        except Exception:
            # Best effort: skip this article. Only ``Exception`` is caught so
            # a keyboard/kernel interrupt still stops the scrape.
            return None

    data = []
    for url in get_articles_links(category):
        result = helper_scrapper(url)
        if result is not None:
            data.append(result)

    return data

In [33]:
def get_latest_articles(data, n=10):
    """Return the ``n`` most recent records, one per distinct headline.

    ``data`` is a sequence of records whose first item is the headline and
    whose second item is the sort key (the publication time). When a headline
    occurs more than once only its first record is kept. The result is
    ordered newest first.
    """
    first_by_headline = {}
    for entry in data:
        # setdefault keeps the record already stored for a repeated headline.
        first_by_headline.setdefault(entry[0], entry)

    newest_first = list(first_by_headline.values())
    newest_first.sort(key=lambda entry: entry[1], reverse=True)
    return newest_first[:n]

def download_image(img_url, save_dir, img_name, min_size=100, timeout=30):
    """Download one image and store it if it is large enough.

    The raw response bytes are written unchanged to ``save_dir/img_name``
    when both the width and the height of the decoded image are at least
    ``min_size`` pixels. ``data:`` URLs are ignored. This is best effort: any
    download, decoding or write error simply results in no file.

    Args:
        img_url: Absolute URL of the image.
        save_dir: Existing directory to write the file into.
        img_name: File name to use inside ``save_dir``.
        min_size: Minimum width and height, in pixels, for the image to be kept.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None.
    """
    # Browser-like User-Agent sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        if img_url.startswith('data:'):
            return

        # A timeout prevents one stalled connection from blocking a worker forever.
        response = requests.get(img_url, headers=headers, timeout=timeout)
        img_data = response.content

        # Decode only to learn the dimensions; the original bytes are what gets saved.
        width, height = Image.open(BytesIO(img_data)).size

        # Only save images of at least min_size x min_size pixels
        if width >= min_size and height >= min_size:
            with open(os.path.join(save_dir, img_name), 'wb') as img_file:
                img_file.write(img_data)
    except Exception:
        # Deliberately best effort, but only ``Exception`` is caught so that
        # Ctrl-C / kernel interrupts are not silently discarded.
        return

def download_images(category_url, save_dir, data):
    """Write ``labels.csv`` for a category and download its images in parallel.

    Each image is named ``image_<article number>_<image number>.jpg`` (both
    numbers are 1-based positions in ``data`` / in the article's image list;
    the ``.jpg`` extension is used regardless of the actual image format).
    ``labels.csv`` gets one ``image name,alt,headline`` row per image that is
    submitted for download; commas are removed from the alt text and the
    headline so they cannot break the row. Images with an empty or ``data:``
    URL are skipped. A row is written even if the download later fails or
    the image is rejected by ``download_image``.

    Args:
        category_url: URL of the category page; relative image URLs are
            resolved against it.
        save_dir: Existing directory that receives ``labels.csv`` and the images.
        data: Iterable of ``(headline, published_at, images)`` tuples where
            ``images`` is a list of ``(alt_text, img_url)`` tuples.
    """
    records = []
    jobs = []
    for article_no, (headline, _, images_list) in enumerate(data, start=1):
        for image_no, (alt_txt, img_url) in enumerate(images_list, start=1):
            if not img_url or img_url.startswith('data:'):
                continue
            img_name = f'image_{article_no}_{image_no}.jpg'
            records.append(f'{img_name},{alt_txt.replace(",", "")},{headline.replace(",", "")}\n')
            jobs.append((urljoin(category_url, img_url), img_name))

    # Header and rows are written in a single pass; UTF-8 is explicit because
    # the alt texts / headlines may contain non-ASCII characters.
    with open(os.path.join(save_dir, 'labels.csv'), 'w', encoding='utf-8') as f:
        f.write('image number,alt,article_heading\n')
        f.writelines(records)

    # parallelising the downloads to make it faster
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(download_image, img_url, save_dir, img_name)
            for img_url, img_name in jobs
        ]
        for future in as_completed(futures):
            future.result()

In [58]:
# (category name, category page URL) pairs to scrape.
# The name is used as the category's folder name and as the stem of its
# '../links/<name>.json' file; the URL is the base against which relative
# image URLs are resolved when downloading.
categories = [
    ('Fintech World', 'https://www.infobae.com/tag/mundo-fintech/'),
    ('Techno Cars and Mobility', 'https://www.infobae.com/tag/tecno-autos-y-movilidad/'),
    ('Entertainment', 'https://www.infobae.com/entretenimiento/'),
    ('Health', 'https://www.infobae.com/salud/'),
    ('Russia Ukraine War', 'https://www.infobae.com/tag/guerra-rusia-ucrania/'),
    ('Crime and Justice', 'https://www.infobae.com/sociedad/policiales/'),
    ('Society', 'https://www.infobae.com/sociedad/'),
    ('Policy', 'https://www.infobae.com/politica/'),
    ('Music', 'https://www.infobae.com/tag/musica/'),
    ('Arts', 'https://www.infobae.com/tag/arte/'),
    ('Cinema', 'https://www.infobae.com/tag/cine/'),
    ('Series', 'https://www.infobae.com/tag/series/'),
    ('Education', 'https://www.infobae.com/educacion/'),
    ('Tourism', 'https://www.infobae.com/turismo/'),
]

# Site root; only its host name is used, to name the site's output folder.
base_url = 'https://www.infobae.com/'

# Creates output/images/<host>/<category>/ for every category and returns
# the path of the site folder.
base_dir = create_directories(base_url, categories)

In [68]:
# Scrape every category and download the images of its latest articles into
# the category's folder.
for category, category_url in tqdm(categories, desc='Downloading images for every category'):
    try:
        category_dir = os.path.join(base_dir, category)
        data = article_scrapper(category)
        download_images(category_url, category_dir, get_latest_articles(data))
    except Exception as exc:
        # Carry on with the remaining categories, but report *why* this one
        # failed instead of printing only its name. Catching ``Exception``
        # (not a bare except) keeps Ctrl-C / kernel interrupts working.
        print(f'{category}: {exc!r}')

Downloading images for every category: 100%|██████████| 14/14 [13:26<00:00, 57.62s/it]


In [78]:
def create_file(base_dir):
    """Write one ``<site>_pairs_<category>.csv`` file per category folder.

    For every sub-folder of ``base_dir`` that contains a ``labels.csv``, each
    pair of labelled images belonging to *different* articles is written to
    ``<site>_pairs_<category>.csv`` in the current working directory, three
    lines per pair::

        <pair number>,<headline 1>,<headline 2>
        ,<image 1 URL>,<image 2 URL>
        ,<alt 1>,<alt 2>

    ``<site>`` is the last path component of ``base_dir``. The article an
    image belongs to is taken from the first number in its file name
    (``image_<article>_<n>.jpg``). Entries without a ``labels.csv`` are
    skipped; a category that cannot be processed is reported and skipped.

    Args:
        base_dir: Folder holding one sub-folder per category, with or without
            a trailing slash.
    """
    # Last path component, e.g. 'www.infobae.com'; normpath drops a trailing slash.
    site = os.path.basename(os.path.normpath(base_dir))
    url_root = 'https://raw.githubusercontent.com/ayainfida/news-scrapper/main/output/images/www.infobae.com'

    for category in os.listdir(base_dir):
        labels_path = os.path.join(base_dir, category, 'labels.csv')
        if not os.path.isfile(labels_path):
            # Stray file or a folder that was never scraped.
            continue

        try:
            df = pd.read_csv(labels_path)

            # Materialise the rows and their article numbers once instead of
            # re-reading the frame and re-running the regex for every pair.
            rows = [list(df.iloc[i]) for i in range(len(df))]
            article_ids = [int(re.search(r'\d+(?=_|$)', row[0]).group()) for row in rows]
            category_url = f'{url_root}/{urllib.parse.quote(category)}'

            pair_no = 0
            with open(f'{site}_pairs_{category}.csv', 'w', encoding='utf-8') as f:
                for i in range(len(rows)):
                    for j in range(i + 1, len(rows)):
                        # Only pair images that come from different articles.
                        if article_ids[i] == article_ids[j]:
                            continue
                        pair_no += 1
                        img1, alt1, headline1 = rows[i]
                        img2, alt2, headline2 = rows[j]
                        f.write(f'{pair_no},{headline1},{headline2}\n')
                        f.write(f',{category_url}/{img1},{category_url}/{img2}\n')
                        f.write(f',{alt1},{alt2}\n')
        except Exception as exc:
            # Best effort per category, but make the failure visible.
            print(f'Skipping {category}: {exc!r}')

In [77]:
# Sanity check: report every category whose folder does not hold exactly one
# file per labels.csv row plus labels.csv itself.
images_root = 'output/images/www.infobae.com/'
for category in os.listdir(images_root):
    category_path = f'{images_root}{category}'
    df = pd.read_csv(f'{category_path}/labels.csv')
    n = df.shape[0]

    expected_files = n + 1
    actual_files = len(os.listdir(category_path))
    if expected_files != actual_files:
        print(category, expected_files, actual_files)

In [82]:
# Generate the pairs CSV files (written to the current working directory)
# for every category folder under the infobae image directory.
create_file('output/images/www.infobae.com/')

In [85]:
# Collect (number of pairs, file name) for every pairs file in the working
# directory.
num_pairs = []

for file in os.listdir():
    if file.startswith('www.infobae.com_pairs'):
        try:
            # The pairs files have no header row. With header=None the first
            # pair is read as data, so a file holding a single pair (three
            # lines) is counted instead of failing on iloc[-3].
            df = pd.read_csv(file, header=None)
            # Each pair takes three lines; the third line from the end is the
            # "<pair number>,<headline>,<headline>" line of the last pair.
            num_pairs.append((int(df.iloc[-3, 0]), file))
        except Exception as exc:
            # Empty or malformed file: leave it out of the ranking, but say so.
            print(f'Skipping {file}: {exc!r}')

In [86]:
# Rank the pairs files by their number of pairs, fewest first.
sorted_array = list(num_pairs)
sorted_array.sort(key=lambda entry: entry[0])
sorted_array

[(85, 'www.infobae.com_pairs_Cinema.csv'),
 (210, 'www.infobae.com_pairs_Crime and Justice.csv'),
 (258, 'www.infobae.com_pairs_Fintech World.csv'),
 (279, 'www.infobae.com_pairs_Policy.csv'),
 (280, 'www.infobae.com_pairs_Education.csv'),
 (375, 'www.infobae.com_pairs_Techno Cars and Mobility.csv'),
 (376, 'www.infobae.com_pairs_Russia Ukraine War.csv'),
 (397, 'www.infobae.com_pairs_Society.csv'),
 (577, 'www.infobae.com_pairs_Series.csv'),
 (582, 'www.infobae.com_pairs_Arts.csv'),
 (639, 'www.infobae.com_pairs_Health.csv'),
 (993, 'www.infobae.com_pairs_Entertainment.csv'),
 (2463, 'www.infobae.com_pairs_Tourism.csv')]