In [1]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
from io import BytesIO
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
import os
import random
import re
import pandas as pd
import hashlib
from deep_translator import GoogleTranslator
from datetime import datetime
import urllib.parse
from json import dumps, loads
from shutil import copy2

In [2]:
def translate(text):
    """Translate Hindi ``text`` to English and strip commas and newlines.

    Args:
        text: The Hindi string handed to ``GoogleTranslator``.

    Returns:
        The translated string with every ``,`` and ``\\n`` removed.
    """
    hindi_to_english = GoogleTranslator(source='hi', target='en')
    translated = hindi_to_english.translate(text=text)
    # Remove the characters one after another, commas first, then newlines.
    for unwanted in (',', '\n'):
        translated = translated.replace(unwanted, '')
    return translated

def create_directories(base_url, categories, label='images'):
    """Create the ``output/<label>/<host>/<category>`` directory tree.

    Args:
        base_url: Site URL; only its network location (host) is used, as the
            name of the per-site directory.
        categories: Iterable of ``(category_name, category_url)`` pairs. Only
            the name is used here; duplicates are harmless.
        label: Sub-directory of ``output`` under which the site directory is
            created.

    Returns:
        The path of the per-site directory, ``output/<label>/<host>``.

    Raises:
        FileExistsError: If one of the target paths exists but is not a
            directory.
    """
    base_dir = os.path.join('output', label, urlparse(base_url).netloc)

    # exist_ok=True makes the call idempotent and avoids the window between an
    # "exists?" check and the creation in which another process could create
    # the directory first.
    os.makedirs(base_dir, exist_ok=True)

    for category, _ in categories:
        os.makedirs(os.path.join(base_dir, category), exist_ok=True)

    return base_dir

In [27]:
# Fetch the international-news listing page and parse it into `soup`.
# NOTE(review): `url`, `headers`, `response` and `soup` are module-level names
# that a later cell assigns again, so their values depend on execution order.
url = 'https://www.livehindustan.com/international/news'
# Browser-like User-Agent header sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

response = requests.get(url, headers=headers)
# Parse the raw response bytes with the lxml parser.
soup = BeautifulSoup(response.content, 'lxml')

In [108]:
def get_articles_links(url, timeout=30):
    """Return the unique article links found on a category listing page.

    The page is searched for a ``<div id="listing">`` container first; when
    that container is missing or holds no links, the
    ``<section class="main-wdgt listing article">`` container is used instead.

    Args:
        url: URL of the category listing page.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a stalled connection cannot hang the notebook forever.

    Returns:
        A list of absolute article URLs in page order, without duplicates.
        Empty when neither container is present.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'lxml')

    # `find` returns None for a missing container and `find_all` returns an
    # empty list (never None), so both conditions have to be tested explicitly
    # for the fallback layout to be reachable.
    main_content = soup.find('div', id='listing')
    if main_content is None or not main_content.find_all('a', href=True):
        main_content = soup.find('section', class_='main-wdgt listing article')
    if main_content is None:
        return []

    articles_links = []
    seen_links = set()  # O(1) duplicate check; the list keeps page order

    # href=True skips anchors without an href, which would raise KeyError.
    for a_tag in main_content.find_all('a', href=True):
        href = a_tag['href']
        link = href if href.startswith('https') else f"https://www.livehindustan.com{href}"
        if link not in seen_links:
            seen_links.add(link)
            articles_links.append(link)

    return articles_links

def article_scrapper(url, timeout=30):
    """Scrape headline, publication time and lead image of a category's articles.

    Every link returned by ``get_articles_links(url)`` is fetched and its page
    source is scanned line by line for the ``"datePublished"`` entry and for
    the ``"url"`` / ``"caption"`` entries located between the ``"image":`` and
    ``"author":`` lines.

    Args:
        url: URL of the category listing page.
        timeout: Seconds to wait for each article request.

    Returns:
        A list of ``(headline, published, [(alt, image_url)])`` tuples where
        ``headline`` and ``alt`` are translated strings and ``published`` is a
        ``datetime``. Articles that cannot be fetched or parsed are skipped.
    """
    articles_links = get_articles_links(url)
    data = []

    def helper_scrapper(article_url):
        """Return ``(headline, published, [(alt, image_url)])`` or None."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        try:
            # The request is inside the try block so that one unreachable
            # article is skipped instead of aborting the whole category.
            response = requests.get(article_url, headers=headers, timeout=timeout)
            soup = BeautifulSoup(response.content, 'lxml')
            headline = translate(soup.find('h1').text)

            lines = str(soup).split('\n')

            published = start = end = None
            for index, line in enumerate(lines):
                stripped = line.strip()
                if '"datePublished"' in line:
                    # Drop the key and the last 8 characters, leaving a
                    # "%Y-%m-%dT%H:%M:%S" timestamp.
                    datetime_str = line.replace('"datePublished": "', '').strip()[:-8]
                    published = datetime.strptime(datetime_str, "%Y-%m-%dT%H:%M:%S")
                if stripped.startswith('"image":'):
                    start = index
                if stripped.startswith('"author":'):
                    end = index

            if published is None or start is None or end is None:
                return None

            # Dedicated names: reusing the article URL variable here would make
            # a page without an image "url" line report itself as the image.
            image_url = alt = None
            for line in lines[start + 1:end]:
                stripped = line.strip()
                if stripped.startswith('"url"'):
                    image_url = line.replace('"url":', '').replace('"', '').replace(',', '').strip()
                if stripped.startswith('"caption"'):
                    alt = translate(line.replace('"caption":', '').replace('"', '').replace(',', '').strip())

            if image_url is None or alt is None:
                return None

            return headline, published, [(alt, image_url)]
        except Exception:
            # Best effort: skip articles that cannot be fetched or parsed.
            # `Exception` (not a bare except) lets KeyboardInterrupt through so
            # the scrape can still be interrupted from the notebook.
            return None

    for article_link in articles_links:
        result = helper_scrapper(article_link)
        if result is not None and len(result[2]):
            data.append(result)

    return data

In [109]:
def get_latest_articles(data, n=10):
    """Return the ``n`` newest records, keeping one record per headline.

    Args:
        data: Iterable of records whose item 0 is the headline and item 1 is
            the value to sort by.
        n: Maximum number of records to return.

    Returns:
        A list of at most ``n`` records ordered by item 1, largest first. When
        a headline occurs more than once only its first record is kept.
    """
    first_by_headline = {}
    for entry in data:
        # setdefault stores the first record of a headline and ignores repeats.
        first_by_headline.setdefault(entry[0], entry)

    newest_first = list(first_by_headline.values())
    newest_first.sort(key=lambda entry: entry[1], reverse=True)
    return newest_first[:n]

def download_image(img_url, save_dir, img_name, min_size=100, timeout=30):
    """Download one image and save it when it is large enough.

    Args:
        img_url: Absolute image URL. ``data:`` URLs are ignored.
        save_dir: Directory the file is written to.
        img_name: File name used inside ``save_dir``.
        min_size: Minimum width and height in pixels; smaller images are not
            saved.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Failures are reported with a printed message rather than raised,
        so one bad image does not stop the other downloads.
    """
    try:
        if img_url.startswith('data:'):
            return

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(img_url, headers=headers, timeout=timeout)
        img_data = response.content

        # Open the image only to read its dimensions; the context manager
        # releases the decoder's resources straight away.
        with Image.open(BytesIO(img_data)) as img:
            width, height = img.size

        # Only save images that are at least min_size x min_size pixels.
        if width >= min_size and height >= min_size:
            with open(os.path.join(save_dir, img_name), 'wb') as img_file:
                img_file.write(img_data)
    except Exception as exc:
        # Still best effort, but no longer silent, and KeyboardInterrupt is
        # not swallowed the way it is by a bare `except:`.
        print(f'Could not download {img_url}: {exc!r}')

def download_images(category_url, save_dir, data, max_workers=10):
    """Write ``labels.csv`` for a category and download its images in parallel.

    Args:
        category_url: Listing URL of the category; relative image URLs are
            resolved against it.
        save_dir: Directory receiving ``labels.csv`` and the image files.
        data: Iterable of ``(headline, published, images_list)`` records where
            ``images_list`` holds ``(alt_text, image_url)`` pairs.
        max_workers: Number of download threads.

    Notes:
        Images are named ``image_<article>_<image>.jpg`` with 1-based indices.
        A row is written to ``labels.csv`` for every submitted download, even
        if ``download_image`` later decides not to save that image.
    """
    labels_path = os.path.join(save_dir, 'labels.csv')

    # Explicit UTF-8 so translated text containing non-ASCII characters does
    # not fail on platforms whose default encoding is narrower.
    with open(labels_path, 'w', encoding='utf-8') as f:
        f.write('image number,alt,article_heading\n')

    records = []

    # Parallelise the downloads to make them faster.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for article_no, (headline, _, images_list) in enumerate(data, start=1):
            for image_no, (alt_txt, img_url) in enumerate(images_list, start=1):
                if not img_url or img_url.startswith('data:'):
                    continue
                img_url = urljoin(category_url, img_url)
                img_name = f'image_{article_no}_{image_no}.jpg'
                # Commas are removed because the rows are plain comma-joined text.
                records.append(f'{img_name},{alt_txt.replace(",", "")},{headline.replace(",", "")}\n')
                futures.append(executor.submit(download_image, img_url, save_dir, img_name))

        with open(labels_path, 'a', encoding='utf-8') as f:
            f.writelines(records)

        # Surface any exception raised inside a worker.
        for future in as_completed(futures):
            future.result()

In [None]:
# (category name, listing URL) pairs. The name is used as the directory name
# for the category, so each name should appear only once. 'Share Market' used
# to be listed twice, which made the scraping loop process it twice.
# NOTE: 'Spirtuality' is misspelled but deliberately left unchanged here: the
# name is used as an on-disk directory name and inside generated image URLs.
categories = [
    ('Share Market', 'https://www.livehindustan.com/business/share-market/news'),
    ('Cricket', 'https://www.livehindustan.com/cricket/news'),
    ('Bollywood', 'https://www.livehindustan.com/entertainment/bollywood/news'),
    ('TV', 'https://www.livehindustan.com/entertainment/tv/news'),
    ('Web Series', 'https://www.livehindustan.com/entertainment/web-series/news'),
    ('Movie Review', 'https://www.livehindustan.com/entertainment/film-review/news'),
    ('Business', 'https://www.livehindustan.com/business/news'),
    ('Personal Investment', 'https://www.livehindustan.com/business/personal-investments/news'),
    ('International', 'https://www.livehindustan.com/international/news'),
    ('Spirtuality', 'https://www.livehindustan.com/astrology/spiritual/news'),
    ('Discourse', 'https://www.livehindustan.com/astrology/discourse/news'),
    ('Health', 'https://www.livehindustan.com/lifestyle/health/news'),
    ('Lifestyle', 'https://www.livehindustan.com/lifestyle/news'),
    ('Fitness', 'https://www.livehindustan.com/lifestyle/fitness/news'),
    ('Beauty', 'https://www.livehindustan.com/lifestyle/beauty/news'),
    ('Food', 'https://www.livehindustan.com/lifestyle/food/news'),
    ('Fashion', 'https://www.livehindustan.com/lifestyle/fashion/news'),
    ('Travel', 'https://www.livehindustan.com/lifestyle/travel/news'),
    ('Relationship', 'https://www.livehindustan.com/lifestyle/relationship/news'),
    ('Bike', 'https://www.livehindustan.com/auto/bikes/news'),
    ('Car', 'https://www.livehindustan.com/auto/cars/news'),
    ('Gadgets', 'https://www.livehindustan.com/gadgets/news'),
    ('Apps', 'https://www.livehindustan.com/gadgets/apps/news')
]

base_url = 'https://www.livehindustan.com/'

# Create output/images/<host>/<category> for every category listed above.
base_dir = create_directories(base_url, categories)

In [104]:
# Fetch a single article page and parse it into `soup` for the parsing
# experiments in the following cells.
# NOTE(review): this reassigns the module-level `url`, `headers`, `response`
# and `soup` that an earlier cell also sets.
url = 'https://www.livehindustan.com/viral-news/tanker-sunk-into-road-pit-in-pune-seen-diving-201726848863356.html'
# Browser-like User-Agent header sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

response = requests.get(url, headers=headers)
# Parse the raw response bytes with the lxml parser.
soup = BeautifulSoup(response.content, 'lxml')

In [45]:
# Translate the text of the first <h1> of the page currently held in `soup`.
# NOTE(review): this cell's execution count is lower than that of the fetch
# cell above it, so the cells were run out of order — re-run top to bottom to
# confirm the result.
headline = translate(soup.find('h1').text)

In [48]:
# Show the translated headline as the cell output.
headline

'A tanker got stuck in a pothole in Pune and drowned in the drain water; residents said- what about our safety'

In [105]:
# Stand-alone run of the same line-scanning logic used by helper_scrapper in
# article_scrapper, applied to the page currently held in `soup`.
content = str(soup)

# Split the page source into lines so it can be scanned line by line.
lines = content.split('\n')

for x, line in enumerate(lines):
    if '"datePublished"' in line:  # Line carrying the publication timestamp
        # Remove the key and the last 8 characters of the line (presumably the
        # UTC offset and closing punctuation — TODO confirm against the page).
        datetime_str = line.replace('"datePublished": "', '').strip()[:-8]
        time = datetime.strptime(datetime_str, "%Y-%m-%dT%H:%M:%S")
    # Remember where the "image" entry starts and where "author" starts; the
    # lines in between are scanned below.
    if line.strip().startswith('"image":'):
        start = x
    if line.strip().startswith('"author":'):
        end = x

# NOTE(review): `start` / `end` stay undefined (or keep a stale value from an
# earlier run) when the page has no matching line.
for i in range(start+1, end):
    if lines[i].strip().startswith('"url"'):
        # Overwrites the module-level `url` (the article URL) with the image URL.
        url = lines[i].replace('"url":', '').replace('"', '').replace(',', '').strip()
    if lines[i].strip().startswith('"caption"'):
        # The caption is translated and stored as the image's alt text.
        alt = translate(lines[i].replace('"caption":', '').replace('"', '').replace(',', '').strip())

2024-09-20 21:48:58


In [None]:
# Scrape every category and download the images of its latest articles.
for category, category_url in tqdm(categories, desc='Downloading images for every category'):
    category_dir = os.path.join(base_dir, category)
    try:
        data = article_scrapper(category_url)
        download_images(category_url, category_dir, get_latest_articles(data))
    except Exception as exc:
        # Report the failing category together with the reason and move on.
        # Catching Exception (not a bare except) keeps the loop interruptible:
        # KeyboardInterrupt now stops the run instead of skipping one category.
        print(f'{category}: {exc!r}')

In [None]:
def create_file(base_dir, raw_url_root='https://raw.githubusercontent.com/ayainfida/news-scrapper/main/output/images/www.livehindustan.com'):
    """Write one ``<site>_pairs_<category>.csv`` file per category.

    For every sub-directory of ``base_dir`` that contains a ``labels.csv``,
    all pairs of images belonging to *different* articles are written to a
    file in the current working directory. Each pair takes three lines::

        <pair number>,<headline 1>,<headline 2>
        ,<image URL 1>,<image URL 2>
        ,<alt text 1>,<alt text 2>

    Args:
        base_dir: Per-site image directory, e.g.
            ``output/images/www.livehindustan.com``; a trailing slash is
            optional. Its last path component is used as ``<site>``.
        raw_url_root: URL prefix under which the category directories are
            published; ``/<quoted category>/<image name>`` is appended to it.

    Returns:
        None. Categories that cannot be processed are reported and skipped.
    """
    # Last path component, with or without a trailing separator.
    site = os.path.basename(os.path.normpath(base_dir))

    for category in os.listdir(base_dir):
        labels_path = os.path.join(base_dir, category, 'labels.csv')
        if not os.path.isfile(labels_path):
            # Stray files or directories without labels are not categories.
            continue

        try:
            rows = pd.read_csv(labels_path).values.tolist()

            # The article number is the first digit run followed by "_" (or
            # the end) in the image name, e.g. 3 for "image_3_1.jpg". It is
            # parsed once per row instead of once per pair.
            article_ids = [int(re.search(r'\d+(?=_|$)', row[0]).group()) for row in rows]

            pairs = [
                (rows[i], rows[j])
                for i in range(len(rows))
                for j in range(i + 1, len(rows))
                if article_ids[i] != article_ids[j]
            ]

            category_root = f'{raw_url_root}/{urllib.parse.quote(category)}'

            with open(f'{site}_pairs_{category}.csv', 'w', encoding='utf-8') as f:
                for number, (first, second) in enumerate(pairs, start=1):
                    img1, alt1, headline1 = first
                    img2, alt2, headline2 = second
                    f.write(f'{number},{headline1},{headline2}\n')
                    f.write(f',{category_root}/{img1},{category_root}/{img2}\n')
                    f.write(f',{alt1},{alt2}\n')

        except Exception as exc:
            # Best effort per category, but say which one failed and why.
            print(f'Skipping {category}: {exc!r}')

In [None]:
# Build the pairs CSV files for every category directory under this path; the
# files are written to the current working directory.
create_file('output/images/www.livehindustan.com/')

In [None]:
# Collect (number of pairs, file name) for every pairs file in the current
# working directory.
num_pairs = []

for file in os.listdir():
    if file.startswith('www.livehindustan.com_pairs'):
        try:
            # The pairs files have no header row. With the default header
            # handling the first pair's line is consumed as column names, so
            # a file holding a single pair has only two rows left and
            # `iloc[-3]` fails. header=None keeps every line as data.
            df = pd.read_csv(file, header=None)
            # Each pair spans three lines, so the third line from the end
            # carries the number of the last pair.
            num_pairs.append((int(df.iloc[-3, 0]), file))
        except Exception as exc:
            # Empty or malformed files are skipped, but no longer silently.
            print(f'Skipping {file}: {exc!r}')

In [None]:
# Order the (pair count, file name) tuples from fewest to most pairs and show
# the result as the cell output.
sorted_array = list(num_pairs)
sorted_array.sort(key=lambda entry: entry[0])
sorted_array