In [2]:
!pip install deep_translator

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep_translator
Successfully installed deep_translator-1.11.4


In [3]:
# Mount Google Drive at /content/drive so the project folder on Drive is reachable.
# google.colab is only importable when running inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
from io import BytesIO
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
import os
import random
import re
import pandas as pd
import hashlib
from deep_translator import GoogleTranslator
from datetime import datetime
import urllib.parse
from json import dumps, loads
from shutil import copy2

In [5]:
# Work from the project folder on the mounted Drive: later cells use paths relative to
# the current directory (the 'output/...' tree and the generated pairs CSV files).
os.chdir('/content/drive/MyDrive/research-similarity/Scraping')

In [9]:
def translate(text):
    """Translate Korean text to English and strip characters that break the CSV files.

    Args:
        text: Korean source string. ``None`` and empty/whitespace-only strings
            yield ``''`` without calling the translation service (image alt
            text is passed in as ``''`` when an ``<img>`` has no ``alt``).

    Returns:
        The English text with every comma and newline removed, so it can be
        written as one unquoted field of a comma-separated line.
    """
    if text is None or not text.strip():
        # Nothing to translate: skip the network round-trip entirely.
        return ''

    translated = GoogleTranslator(source='ko', target='en').translate(text=text)
    # Defensive: fall back to '' rather than failing on ``None.replace``.
    return (translated or '').replace(',', '').replace('\n', '')

def create_directories(base_url, categories, label='images'):
    """Create the ``output/<label>/<host>/<category>`` folder tree.

    Args:
        base_url: URL of the scraped site; only its network location (host)
            is used as a folder name.
        categories: Iterable of 2-item pairs whose first item is the category
            name used as the sub-folder name; the second item is ignored.
        label: Name of the folder placed directly under ``output``.

    Returns:
        The relative path of the site folder, ``output/<label>/<host>``.
    """
    base_dir = os.path.join('output', label, urlparse(base_url).netloc)

    # exist_ok=True makes re-running the cell a no-op and avoids the
    # check-then-create race of testing os.path.exists() first.
    os.makedirs(base_dir, exist_ok=True)
    for category, _ in categories:
        os.makedirs(os.path.join(base_dir, category), exist_ok=True)

    return base_dir

In [18]:
def get_articles_links(url, timeout=30):
    """Collect the unique article links listed on a section page.

    Args:
        url: Address of the section page to fetch.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests`` can block indefinitely.

    Returns:
        List of ``href`` values, in page order, taken from the first ``<a>``
        inside each ``<div class="sa_text">`` found within
        ``<div id="newsct">``. Duplicates are dropped.

    Raises:
        requests.RequestException: The request failed, timed out or returned
            an HTTP error status.
        ValueError: The page has no ``<div id="newsct">`` container.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    main_content = soup.find('div', id='newsct')
    if main_content is None:
        # Fail with a readable message instead of "'NoneType' has no attribute 'find_all'".
        raise ValueError(f"no <div id='newsct'> container found at {url}")

    seen_links = set()  # O(1) membership test; the list keeps page order
    articles_links = []
    for div in main_content.find_all('div', class_='sa_text'):
        a_tag = div.find('a')
        link = a_tag.get('href') if a_tag is not None else None
        # Skip blocks without a usable anchor rather than aborting the whole page.
        if link and link not in seen_links:
            seen_links.add(link)
            articles_links.append(link)

    return articles_links

def article_scrapper(url):
    """Scrape headline, publication time and images for each article of a section.

    Args:
        url: Address of the section page whose article links are followed.

    Returns:
        List of ``(headline, published_at, images)`` tuples, one per article
        that could be parsed and has at least one image. ``headline`` is the
        translated ``<h2>`` text, ``published_at`` a ``datetime`` and
        ``images`` a list of unique ``(translated alt text, data-src URL)``
        pairs.
    """

    def helper_scrapper(article_url):
        """Parse one article page; return ``None`` if anything about it fails."""
        try:
            # The request sits inside the try block so that one unreachable
            # article is skipped instead of aborting the whole section.
            response = requests.get(article_url, timeout=30)
            soup = BeautifulSoup(response.content, 'lxml')

            datetime_str = soup.find(
                'span', class_='media_end_head_info_datestamp_time _ARTICLE_DATE_TIME'
            ).get('data-date-time')
            published_at = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
            headline = translate(soup.find('h2').text)
            article_content = soup.find('div', class_='newsct')

            images = []
            for img in article_content.find_all('img'):
                # Only <img> tags that carry an id attribute are kept.
                if img.get('id'):
                    img_alt = translate(img.get('alt', ''))  # '' when there is no alt text
                    img_src = img.get('data-src', '')  # '' when there is no data-src

                    if (img_alt, img_src) not in images:
                        images.append((img_alt, img_src))

            return headline, published_at, images
        except Exception:
            # Best effort: a page with unexpected markup (missing date, headline
            # or body) is skipped. Catching Exception rather than using a bare
            # except keeps KeyboardInterrupt able to stop a long scrape.
            return None

    data = []
    for article_url in get_articles_links(url):
        result = helper_scrapper(article_url)
        # Articles without any image are of no use for the image pairs.
        if result is not None and len(result[2]):
            data.append(result)

    return data

In [19]:
def get_latest_articles(data, n=10):
    """Return at most ``n`` records, newest first, with one record per headline.

    Args:
        data: Iterable of records indexed as ``record[0]`` = headline and
            ``record[1]`` = sortable timestamp.
        n: Maximum number of records to return.

    Returns:
        A new list sorted by ``record[1]`` in descending order. When several
        records share a headline, only the first one encountered is kept.
    """
    first_by_headline = {}
    for entry in data:
        # setdefault leaves an already-stored headline untouched, so the
        # earliest record wins; dicts preserve insertion order.
        first_by_headline.setdefault(entry[0], entry)

    newest_first = list(first_by_headline.values())
    newest_first.sort(key=lambda entry: entry[1], reverse=True)
    return newest_first[:n]

def download_image(img_url, save_dir, img_name):
    """Download one image and save it if it is at least 100x100 pixels.

    The function is best effort: any failure (network error, HTTP error
    status, data that is not a readable image, write error) leaves no new
    file behind and is not reported to the caller.

    Args:
        img_url: Address of the image. Empty values and inline ``data:`` URIs
            are ignored.
        save_dir: Folder the file is written to.
        img_name: File name to use inside ``save_dir``. The downloaded bytes
            are written unchanged, whatever the actual image format is.

    Returns:
        None.
    """
    if not img_url or img_url.startswith('data:'):
        return

    try:
        # A timeout keeps a stalled server from blocking a worker thread forever.
        response = requests.get(img_url, timeout=30)
        response.raise_for_status()
        img_data = response.content

        # Open the bytes only to read the dimensions; close the image right after.
        with Image.open(BytesIO(img_data)) as img:
            width, height = img.size

        # Skip icons and other small graphics.
        if width >= 100 and height >= 100:
            with open(os.path.join(save_dir, img_name), 'wb') as img_file:
                img_file.write(img_data)
    except Exception:
        # Deliberately ignored (best effort). Exception rather than a bare
        # except so that KeyboardInterrupt/SystemExit are not swallowed.
        return

def download_images(category_url, save_dir, data):
    """Download the images of the given articles and write ``labels.csv``.

    Images are saved as ``image_<article number>_<image number>.jpg`` (both
    numbers start at 1). ``labels.csv`` in ``save_dir`` gets the header
    ``image number,alt,article_heading`` followed by one line per image that
    is actually present in ``save_dir`` once all downloads have finished, so
    images that failed to download or were rejected are not listed.

    Args:
        category_url: Page address used to resolve relative image URLs.
        save_dir: Existing folder that receives the images and ``labels.csv``.
        data: Iterable of ``(headline, timestamp, images)`` records, where
            ``images`` is a list of ``(alt text, image URL)`` pairs. The
            timestamp is not used here.

    Returns:
        None.
    """
    labels_path = os.path.join(save_dir, 'labels.csv')

    with open(labels_path, 'w', encoding='utf-8') as labels_file:
        labels_file.write('image number,alt,article_heading\n')

        submitted = []  # (target path, CSV line, future) in submission order

        # Downloads run in parallel; leaving the executor block waits for all of them.
        with ThreadPoolExecutor(max_workers=10) as executor:
            for article_no, (headline, _, images_list) in enumerate(data, start=1):
                for image_no, (alt_txt, img_url) in enumerate(images_list, start=1):
                    if not img_url or img_url.startswith('data:'):
                        continue

                    img_name = f'image_{article_no}_{image_no}.jpg'
                    target = os.path.join(save_dir, img_name)
                    # Remove a file left by an earlier run so a failed download
                    # cannot leave an old picture under the new label.
                    if os.path.exists(target):
                        os.remove(target)

                    # Commas are stripped because the line is written unquoted.
                    row = f'{img_name},{alt_txt.replace(",", "")},{headline.replace(",", "")}\n'
                    future = executor.submit(
                        download_image, urljoin(category_url, img_url), save_dir, img_name
                    )
                    submitted.append((target, row, future))

        for target, row, future in submitted:
            future.result()  # re-raise anything unexpected from the worker
            if os.path.exists(target):
                labels_file.write(row)


In [21]:
# Site root; every section page lives under "<root>section/<id>".
base_url = 'https://news.naver.com/'

# (category name, section page URL) pairs, in the order they are scraped.
categories = [
    (name, f"{base_url}section/{section_id}")
    for name, section_id in (
        ("Politics", 100),
        ("Economy", 101),
        ("Society", 102),
        ("Life & Culture", 103),
        ("IT & Science", 105),
        ("World", 104),
    )
]

# Make sure one output folder per category exists and remember the site folder.
base_dir = create_directories(base_url, categories)

In [22]:
# Scrape every category in turn; a failure in one category must not stop the others.
for category, category_url in tqdm(categories, desc='Downloading images for every category'):
    category_dir = os.path.join(base_dir, category)
    try:
        data = article_scrapper(category_url)
        download_images(category_url, category_dir, get_latest_articles(data))
    except Exception as exc:
        # Report which category failed and why. Exception (not a bare except)
        # keeps the kernel interrupt button working during the long scrape.
        print(f'{category}: {type(exc).__name__}: {exc}')

Downloading images for every category: 100%|██████████| 6/6 [04:42<00:00, 47.13s/it]


In [23]:
def create_file(base_dir, image_url_root='https://raw.githubusercontent.com/ayainfida/news-scrapper/main/output/images/news.naver.com'):
    """Write one ``<site>_pairs_<category>.csv`` file per category folder.

    For each sub-folder of ``base_dir`` that contains a ``labels.csv``, every
    pair of images that belong to *different* articles is written to a file in
    the current working directory. Each pair takes three lines::

        <pair number>,<headline 1>,<headline 2>
        ,<image URL 1>,<image URL 2>
        ,<alt text 1>,<alt text 2>

    The article an image belongs to is the first number in its file name
    (``image_<article>_<index>.jpg``).

    Args:
        base_dir: Folder holding the category sub-folders; its last path
            component is used as ``<site>`` in the output file names. A
            trailing slash is optional.
        image_url_root: URL prefix under which the category folders are
            published; image URLs are
            ``<image_url_root>/<url-quoted category>/<image file>``.

    Returns:
        None. Categories that cannot be processed are reported and skipped.
    """
    site_name = os.path.basename(os.path.normpath(base_dir))
    url_root = image_url_root.rstrip('/')

    for category in sorted(os.listdir(base_dir)):
        labels_path = os.path.join(base_dir, category, 'labels.csv')
        if not os.path.isfile(labels_path):
            # Stray files or folders without labels are not categories.
            continue

        try:
            # Read everything as text and keep empty fields as '' so a missing
            # alt text is not written out as the literal string "nan".
            labels = pd.read_csv(labels_path, dtype=str, keep_default_na=False)
            rows = list(labels.itertuples(index=False, name=None))

            # Extract each article number once instead of inside the pair loop.
            article_ids = [
                int(re.search(r'\d+(?=_|$)', img).group()) for img, _, _ in rows
            ]

            pairs = [
                (rows[i], rows[j])
                for i in range(len(rows))
                for j in range(i + 1, len(rows))
                if article_ids[i] != article_ids[j]
            ]

            category_url = f'{url_root}/{urllib.parse.quote(category)}'
            with open(f'{site_name}_pairs_{category}.csv', 'w', encoding='utf-8') as f:
                for number, (first, second) in enumerate(pairs, start=1):
                    img1, alt1, headline1 = first
                    img2, alt2, headline2 = second
                    f.write(f'{number},{headline1},{headline2}\n')
                    f.write(f',{category_url}/{img1},{category_url}/{img2}\n')
                    f.write(f',{alt1},{alt2}\n')
        except Exception as exc:
            # Best effort per category, but say what was skipped and why.
            print(f'Skipping {category}: {type(exc).__name__}: {exc}')
            continue

In [24]:
# Generate a "<site>_pairs_<category>.csv" file in the working directory for the
# category folders found under the scraped-site image directory.
create_file('output/images/news.naver.com/')

In [25]:
# Collect (number of pairs, file name) for every generated pairs file.
num_pairs = []

for file in os.listdir():
    if file.startswith('news.naver.com_pairs'):
        try:
            # The pairs files have no header row. header=None keeps the first
            # line as data, so a file holding a single pair (three lines) is
            # counted instead of being dropped by an out-of-range index.
            df = pd.read_csv(file, header=None)
            # Each pair spans three lines and only the first carries the pair
            # number, so the third line from the end numbers the last pair.
            num_pairs.append((int(df.iloc[-3, 0]), file))
        except Exception:
            # Files that cannot be parsed (e.g. with no pairs at all) are left out.
            continue

In [26]:
# Order the (pair count, file name) tuples from fewest to most pairs and display them.
sorted_array = list(num_pairs)
sorted_array.sort(key=lambda entry: entry[0])
sorted_array

[(126, 'news.naver.com_pairs_IT & Science.csv'),
 (158, 'news.naver.com_pairs_Economy.csv'),
 (170, 'news.naver.com_pairs_Life & Culture.csv'),
 (187, 'news.naver.com_pairs_World.csv'),
 (271, 'news.naver.com_pairs_Society.csv'),
 (424, 'news.naver.com_pairs_Politics.csv')]