In [None]:
# deep_translator provides the GoogleTranslator class imported in the next cell.
!pip install deep_translator

In [None]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
from io import BytesIO
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
import os
import random
import re
import pandas as pd
import hashlib
from deep_translator import GoogleTranslator
from datetime import datetime
import urllib.parse
from json import dumps, loads
from shutil import copy2

In [None]:
# Mount Google Drive (Colab-only API) and switch the working directory to the
# project folder, so that every relative path used later in this notebook
# (e.g. 'output/...') resolves inside Drive.
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/research-similarity/Scraping')

Mounted at /content/drive


In [10]:
def translate(text):
    """Translate Hindi text to English, removing commas and newlines from the result.

    Args:
        text: Hindi source string.

    Returns:
        The English translation with every ',' and newline removed. ``text``
        is returned unchanged when it is empty / not a string, when the
        translator yields no result, or when the translation call fails.
    """
    # Nothing to translate: skip the network round-trip entirely.
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        translated = GoogleTranslator(source='hi', target='en').translate(text=text)
    except Exception:
        # Best-effort fallback to the untranslated text. Catching ``Exception``
        # rather than using a bare ``except`` lets Ctrl-C / SystemExit propagate.
        return text

    if translated is None:
        return text
    return translated.replace(',', '').replace('\n', '')

def create_directories(base_url, categories, label='images'):
    """Create the ``output/<label>/<site host>/<category>/`` directory tree.

    Args:
        base_url: Site URL; only its network location (host) is used, as the
            name of the site-level directory.
        categories: Iterable of ``(category_name, category_url)`` pairs. Only
            the name is used here, as the sub-directory name.
        label: Name of the directory created directly under ``output``.

    Returns:
        The relative path of the site-level directory,
        ``output/<label>/<host>``.
    """
    base_dir = os.path.join('output', label, urlparse(base_url).netloc)
    # exist_ok=True replaces the exists()-then-makedirs() pair: no race between
    # the check and the creation, and re-running the cell is a no-op.
    os.makedirs(base_dir, exist_ok=True)

    for category, _ in categories:
        os.makedirs(os.path.join(base_dir, category), exist_ok=True)

    return base_dir

In [18]:
def get_articles_links(url, timeout=30):
    """Collect the unique article links found on a category listing page.

    Args:
        url: Category listing page to fetch.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        List of absolute ``https`` links, in page order and without
        duplicates, excluding any link that has ``video`` or ``photo`` as one
        of its ``/``-separated segments.

    Raises:
        requests.RequestException: If the page cannot be fetched or the server
            answers with an error status.
        ValueError: If the page has no ``div.section-listing-LHS`` container.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    main_content = soup.find('div', class_='section-listing-LHS')
    if main_content is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f"listing container 'section-listing-LHS' not found at {url}")

    articles_links = []
    seen = set()  # O(1) duplicate check; the list keeps the page order

    for a_tag in main_content.find_all('a'):
        link = a_tag.get('href')  # anchors without an href are skipped, not fatal
        if not link or not link.startswith('https'):
            continue

        segments = link.split('/')
        if 'video' in segments or 'photo' in segments or link in seen:
            continue

        seen.add(link)
        articles_links.append(link)

    return articles_links

def article_scrapper(url, timeout=30):
    """Scrape headline, timestamp and featured image of every article on a category page.

    Args:
        url: Category listing URL; its article links come from
            ``get_articles_links``.
        timeout: Seconds to wait for each article request.

    Returns:
        List of ``(headline, published_at, images)`` tuples where ``headline``
        is the translated ``<h1>`` text, ``published_at`` is a ``datetime`` and
        ``images`` is a one-element list ``[(alt_text, image_url)]``. Articles
        that cannot be fetched or parsed are left out.
    """
    def helper_scrapper(article_url):
        """Return ``(headline, published_at, images)`` for one article, or None on any failure."""
        try:
            # The request lives inside the try so a single unreachable article
            # is skipped instead of aborting the whole category.
            response = requests.get(article_url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'lxml')

            headline = translate(soup.find('h1').text)

            featured_img = soup.find('div', class_='main-img').find('img')
            # The image URL is kept verbatim: sending it through translate()
            # costs a network call and removes any comma it contains.
            # NOTE(review): the alt text is stored untranslated, as before —
            # confirm whether it was meant to be translated like the headline.
            images = [(featured_img.get('alt') or '', featured_img.get('data-src'))]

            # The last <li> of the byline block is parsed as the update stamp.
            updated_text = soup.find('div', class_="brand-detial-main").find_all('li')[-1].text
            datetime_str = translate(updated_text).replace('(Updated ', '').replace(' IST)', '')
            published_at = datetime.strptime(datetime_str, "%B %d %Y %I:%M %p")
        except Exception:
            # Best-effort scraping; ``Exception`` (not a bare except) so that
            # Ctrl-C still interrupts a long run.
            return None

        return headline, published_at, images

    data = []
    for article_url in get_articles_links(url):
        result = helper_scrapper(article_url)
        if result is not None:
            data.append(result)

    return data

In [None]:
def get_latest_articles(data, n=10):
    """Deduplicate records by headline and return the top ``n`` by their second field.

    Args:
        data: Iterable of records whose first element is the headline and
            whose second element is the sort key.
        n: Maximum number of records to return.

    Returns:
        A new list holding, for every distinct headline, the first record that
        carried it, ordered by the second element in descending order and cut
        to at most ``n`` entries.
    """
    first_by_headline = {}
    for entry in data:
        # setdefault only stores the first record seen for a given headline.
        first_by_headline.setdefault(entry[0], entry)

    newest_first = list(first_by_headline.values())
    newest_first.sort(key=lambda entry: entry[1], reverse=True)
    return newest_first[:n]

def download_image(img_url, save_dir, img_name, min_size=100, timeout=30):
    """Download one image and store it if it is large enough.

    Args:
        img_url: Absolute URL of the image. ``data:`` URIs and empty values
            are ignored.
        save_dir: Directory the file is written to.
        img_name: File name to use inside ``save_dir``.
        min_size: Minimum width and height, in pixels, an image must have to
            be kept.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        True if the image was written to disk, False if it was skipped or the
        download/decoding failed. The function never raises for a failed
        download, so one bad image cannot abort a batch.
    """
    if not img_url or img_url.startswith('data:'):
        return False

    try:
        response = requests.get(img_url, timeout=timeout)
        response.raise_for_status()
        img_data = response.content

        # Decoding the bytes both validates them as an image and yields the size.
        width, height = Image.open(BytesIO(img_data)).size
        if width < min_size or height < min_size:
            return False

        with open(os.path.join(save_dir, img_name), 'wb') as img_file:
            img_file.write(img_data)
        return True
    except Exception:
        # Best-effort download; ``Exception`` instead of a bare except so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return False

def download_images(category_url, save_dir, data):
    """Write ``labels.csv`` for a category and download its images in parallel.

    Args:
        category_url: Category page URL; relative image URLs are resolved
            against it.
        save_dir: Directory receiving ``labels.csv`` and the image files.
        data: Iterable of ``(headline, timestamp, images)`` records, where
            ``images`` is a list of ``(alt_text, image_url)`` pairs.

    Side effects:
        (Re)creates ``<save_dir>/labels.csv`` with the header
        ``image number,alt,article_heading`` and one row per image that is
        scheduled for download, then calls ``download_image`` for each of them
        on a pool of 10 threads. A row is written whether or not the download
        later succeeds.
    """
    records = ['image number,alt,article_heading\n']
    jobs = []  # (absolute image URL, target file name)

    for article_idx, (headline, _, images_list) in enumerate(data, start=1):
        for img_idx, (alt_txt, img_url) in enumerate(images_list, start=1):
            if not img_url or img_url.startswith('data:'):
                continue

            img_name = f'image_{article_idx}_{img_idx}.jpg'
            # Commas are dropped because the file is plain comma-joined text;
            # a missing alt attribute becomes an empty field instead of a crash.
            alt_field = (alt_txt or '').replace(',', '')
            headline_field = headline.replace(',', '')
            records.append(f'{img_name},{alt_field},{headline_field}\n')
            jobs.append((urljoin(category_url, img_url), img_name))

    # Single write, explicit encoding: the alt text may be non-ASCII.
    with open(os.path.join(save_dir, 'labels.csv'), 'w', encoding='utf-8') as f:
        f.writelines(records)

    # parallelising the downloads to make it faster
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(download_image, img_url, save_dir, img_name)
            for img_url, img_name in jobs
        ]
        for future in as_completed(futures):
            future.result()

In [19]:
# (category name, listing URL) pairs to scrape. The name doubles as the
# on-disk directory name created by create_directories(), so renaming an entry
# (including fixing a spelling) changes where its files are stored.
categories = [
    ('Bollywood', 'https://www.aajtak.in/entertainment/bollywood-news'),
    ('Relationship', 'https://www.aajtak.in/lifestyle/relationship'),
    ('Lifestyle News', 'https://www.aajtak.in/lifestyle/news'),
    ('Fashion', 'https://www.aajtak.in/lifestyle/fashion'),
    ('Tour and Tourism', 'https://www.aajtak.in/lifestyle/tourism'),
    ('Food', 'https://www.aajtak.in/lifestyle/food'),
    ('Movie Reviews', 'https://www.aajtak.in/entertainment/film-review'),
    ('Hollywood', 'https://www.aajtak.in/entertainment/hollywood'),
    ('Festivals', 'https://www.aajtak.in/religion/festivals'),
    ('Spirtuality', 'https://www.aajtak.in/religion/spirituality'),
    ('Religion', 'https://www.aajtak.in/religion/news'),
    ('Tech News', 'https://www.aajtak.in/technology/tech-news'),
    ('Science', 'https://www.aajtak.in/science'),
    ('Crime', 'https://www.aajtak.in/crime/news'),
    ('Police and Intelligence', 'https://www.aajtak.in/crime/police-and-intelligence'),
    ('Cyber Crime', 'https://www.aajtak.in/crime/cyber-crime'),
    ('Cricket', 'https://www.aajtak.in/sports/cricket'),
    ('Football', 'https://www.aajtak.in/sports/football'),
    ('Tennis', 'https://www.aajtak.in/sports/tennis')
]

# Only the host part of this URL is used, as the site-level directory name.
base_url = 'https://www.aajtak.in/'

# Creates output/images/www.aajtak.in/<category>/ for every entry above and
# returns the site-level directory path.
base_dir = create_directories(base_url, categories)

In [20]:
# Scrape every category and download the featured images of its latest
# articles (get_latest_articles defaults to the 10 most recent). A failing
# category is reported and skipped so the remaining ones still run.
for category, category_url in tqdm(categories, desc='Downloading images for every category'):
    category_dir = os.path.join(base_dir, category)
    try:
        data = article_scrapper(category_url)
        download_images(category_url, category_dir, get_latest_articles(data))
    except Exception as exc:
        # ``Exception`` rather than a bare except so the run can still be
        # interrupted; the reason is printed next to the category name.
        print(f'{category}: {exc!r}')

Downloading images for every category: 100%|██████████| 19/19 [05:24<00:00, 17.10s/it]


In [28]:
def create_file(base_dir, repo_raw_url='https://raw.githubusercontent.com/ayainfida/news-scrapper/main/output/images'):
    """Write one ``<site>_pairs_<category>.csv`` file per category folder.

    For every sub-directory of ``base_dir`` that contains a ``labels.csv``,
    every pair of images that belong to two *different* articles is written to
    a file in the current working directory. Each pair occupies three lines::

        <pair number>,<headline 1>,<headline 2>
        ,<image URL 1>,<image URL 2>
        ,<alt 1>,<alt 2>

    Args:
        base_dir: Site-level directory (its last path component is used as the
            site name), with or without a trailing slash.
        repo_raw_url: URL prefix under which ``<site>/<category>/<image>`` is
            published.

    Categories whose ``labels.csv`` cannot be processed are reported on stdout
    and skipped; entries without a ``labels.csv`` are ignored silently.
    """
    # normpath drops a trailing slash, so both 'a/b/site' and 'a/b/site/' work.
    site = os.path.basename(os.path.normpath(base_dir))

    for category in os.listdir(base_dir):
        labels_path = os.path.join(base_dir, category, 'labels.csv')
        if not os.path.isfile(labels_path):
            continue

        try:
            # dtype=str + keep_default_na=False: an empty alt stays '' instead
            # of turning into NaN and being written out as the text 'nan'.
            df = pd.read_csv(labels_path, dtype=str, keep_default_na=False)
            rows = list(df.itertuples(index=False, name=None))

            # The first number in 'image_<article>_<index>.jpg' is the article;
            # computed once per row rather than once per candidate pair.
            article_ids = [
                int(re.search(r'\d+(?=_|$)', img).group()) for img, _, _ in rows
            ]

            pairs = [
                (rows[i], rows[j])
                for i in range(len(rows))
                for j in range(i + 1, len(rows))
                if article_ids[i] != article_ids[j]
            ]

            image_base = f'{repo_raw_url}/{site}/{urllib.parse.quote(category)}'
            with open(f'{site}_pairs_{category}.csv', 'w', encoding='utf-8') as f:
                for number, (first, second) in enumerate(pairs, start=1):
                    img1, alt1, headline1 = first
                    img2, alt2, headline2 = second
                    f.write(f'{number},{headline1},{headline2}\n')
                    f.write(f',{image_base}/{img1},{image_base}/{img2}\n')
                    f.write(f',{alt1},{alt2}\n')
        except Exception as exc:
            # Keep going with the other categories, but say what was skipped.
            print(f'Skipping {category}: {exc!r}')

In [29]:
# Writes one '<site>_pairs_<category>.csv' file per category folder into the
# current working directory.
create_file('output/images/www.aajtak.in/')

In [27]:
import shutil

# Keep only the categories that are complete: labels.csv lists exactly
# EXPECTED_IMAGES rows and the folder holds that many files plus labels.csv.
# Every other category folder is DELETED. Entries that cannot be inspected
# (e.g. no labels.csv) are reported and left untouched.
SITE_DIR = 'output/images/www.aajtak.in/'
EXPECTED_IMAGES = 10

for cat in os.listdir(SITE_DIR):
    cat_dir = os.path.join(SITE_DIR, cat)
    try:
        n, _ = pd.read_csv(os.path.join(cat_dir, 'labels.csv')).shape
        file_count = len(os.listdir(cat_dir))  # images + labels.csv
        if n == EXPECTED_IMAGES and file_count == n + 1:
            print(cat, n, file_count - 1)
        else:
            shutil.rmtree(cat_dir)
    except Exception as exc:
        # ``Exception`` rather than a bare except, and the reason is shown.
        print(f'Skipping {cat}: {exc!r}')

Relationship 10 10
Lifestyle News 10 10
Fashion 10 10
Tour and Tourism 10 10
Movie Reviews 10 10
Festivals 10 10
Police and Intelligence 10 10
Cricket 10 10
Tennis 10 10


In [30]:
# Collect (number of pairs, file name) for every pairs file in the working
# directory.
num_pairs = []

for file in os.listdir():
    if not file.startswith('www.aajtak.in_pairs'):
        continue
    try:
        df = pd.read_csv(file)
        # create_file() writes three lines per pair, the first of which starts
        # with the running pair number; the third row from the end is therefore
        # the numbered line of the last pair, i.e. the total pair count.
        num_pairs.append((int(list(df.iloc[-3])[0]), file))
    except Exception as exc:
        # Unreadable or too-short files are reported instead of silently dropped.
        print(f'Skipping {file}: {exc!r}')

In [31]:
# Order the (pair count, file name) tuples by ascending pair count; the bare
# name on the last line displays the result as the cell output.
sorted_array = sorted(num_pairs, key=lambda x: x[0])
sorted_array

[(45, 'www.aajtak.in_pairs_Relationship.csv'),
 (45, 'www.aajtak.in_pairs_Lifestyle News.csv'),
 (45, 'www.aajtak.in_pairs_Fashion.csv'),
 (45, 'www.aajtak.in_pairs_Tour and Tourism.csv'),
 (45, 'www.aajtak.in_pairs_Movie Reviews.csv'),
 (45, 'www.aajtak.in_pairs_Festivals.csv'),
 (45, 'www.aajtak.in_pairs_Police and Intelligence.csv'),
 (45, 'www.aajtak.in_pairs_Cricket.csv'),
 (45, 'www.aajtak.in_pairs_Tennis.csv')]

In [32]:
import random

# Shuffle the category files in place. A dedicated, seeded generator makes the
# order reproducible for a given input order and leaves the global random
# state untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(sorted_array)
print(sorted_array)

[(45, 'www.aajtak.in_pairs_Festivals.csv'), (45, 'www.aajtak.in_pairs_Cricket.csv'), (45, 'www.aajtak.in_pairs_Movie Reviews.csv'), (45, 'www.aajtak.in_pairs_Relationship.csv'), (45, 'www.aajtak.in_pairs_Tour and Tourism.csv'), (45, 'www.aajtak.in_pairs_Lifestyle News.csv'), (45, 'www.aajtak.in_pairs_Police and Intelligence.csv'), (45, 'www.aajtak.in_pairs_Tennis.csv'), (45, 'www.aajtak.in_pairs_Fashion.csv')]
