In [1]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
from io import BytesIO
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
import os
import random
import re
import pandas as pd
import hashlib
from datetime import datetime
import urllib.parse
from json import dumps, loads
from shutil import copy2

In [3]:
os.chdir('/content/drive/MyDrive/research-similarity/Scraping')

In [9]:
url = 'https://www.hindustantimes.com/entertainment/bollywood'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'lxml')

In [11]:
def create_directories(base_url, categories, label='images'):
    # create the following dir struct; outputs > base website > categories
    base_dir = os.path.join('output', label, urlparse(base_url).netloc)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

    for category, _ in categories:
        category_dir = os.path.join(base_dir, category)
        if not os.path.exists(category_dir):
            os.makedirs(category_dir)

    return base_dir

In [12]:
def get_articles_links(url):
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'lxml')
    main_container = soup.find('section', class_='listingPage')
    articles_links = [f"https://www.hindustantimes.com/entertainment/bollywood{soup.find('div', class_='cartHolder bigCart track timeAgo').find('a')['href']}"]

    for div in main_container.find_all('div', class_='cartHolder listView track timeAgo'):
        link = div.find('a')['href']
        if link not in articles_links:
            articles_links.append(f'https://www.hindustantimes.com{link}')

    return articles_links

def article_scrapper(url):
    articles_links = get_articles_links(url)
    data = []

    def helper_scrapper(url):
        headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')
        try:
            story_main_div = soup.find('div', id='storyMainDiv')
            headline = story_main_div.find('h1').text
            datetime_str = story_main_div.find('div', class_='dateTime secTime storyPage').text.strip()[:-4]
            time = datetime.strptime(datetime_str, "%b %d, %Y %I:%M %p")
            images = []

            img = story_main_div.find('img')
            img_alt = img.get('alt', '')  # Safely get the 'alt' attribute
            img_src = img.get('src', '')  # Safely get the 'src' attribute
            images.append((img_alt, img_src))

            return headline, time, images
        except:
            return None

    for url in articles_links:
        result = helper_scrapper(url)
        if result is not None:
            data.append(result)

    return data

In [13]:
def get_latest_articles(data, n=10):
    seen_headlines = set()
    unique_data = []

    for record in data:
        headline = record[0]
        if headline not in seen_headlines:
            seen_headlines.add(headline)
            unique_data.append(record)

    return sorted(unique_data, key=lambda x: x[1], reverse=True)[:n]

def download_image(img_url, save_dir, img_name):
    try:
        if not img_url.startswith('data:'):
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(img_url, headers=headers)
            img_data = response.content
            img = Image.open(BytesIO(img_data))
            width, height = img.size

            # Only save images larger than 100x100 pixels
            if width >= 100 and height >= 100:
                with open(os.path.join(save_dir, img_name), 'wb') as img_file:
                    img_file.write(img_data)
    except:
        pass

def download_images(category_url, save_dir, data):

    with open(os.path.join(save_dir, 'labels.csv'), 'w') as f:
        f.write('image number,alt,article_heading\n')

    records = []

    # parallising the downloads to make it faster
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        headlines = []
        for x, tuple in enumerate(data):
            headline, _, images_list = tuple
            for i, img in enumerate(images_list):
                alt_txt, img_url = img
                if img_url and not img_url.startswith('data:'):
                    img_url = urljoin(category_url, img_url)
                    combined_str = f"{alt_txt}{headline}".encode()
                    img_name = f'image_{x+1}_{i+1}.jpg'
                    records.append(f'{img_name},{alt_txt.replace(",", "")},{headline.replace(",", "")}\n')
                    futures.append(executor.submit(download_image, img_url, save_dir, img_name))

        with open(os.path.join(save_dir, 'labels.csv'), 'a') as f:
            f.writelines(records)

        for future in as_completed(futures):
            future.result()

In [15]:
categories = [
    ('Bollywood', 'https://www.hindustantimes.com/entertainment/bollywood'),
    ('Football', 'https://www.hindustantimes.com/sports/football'),
    ('Lifestyle', 'https://www.hindustantimes.com/lifestyle'),
    ('World News', 'https://www.hindustantimes.com/world-news'),
    ('Technology', 'https://www.hindustantimes.com/technology'),
    ('Science', 'https://www.hindustantimes.com/science'),
    ('Horoscope', 'https://www.hindustantimes.com/astrology/horoscope'),
    ('Other Sports', 'https://www.hindustantimes.com/sports/others'),
    ('Tennis', 'https://www.hindustantimes.com/sports/tennis'),
    ('Sports', 'https://www.hindustantimes.com/sports'),
    ('Hockey', 'https://www.hindustantimes.com/sports/hockey'),
    ('Music', 'https://www.hindustantimes.com/entertainment/music'),
    ('Hollywood', 'https://www.hindustantimes.com/entertainment/hollywood'),
    ('Web Series', 'https://www.hindustantimes.com/entertainment/web-series'),
    ('Telugu Cinema', 'https://www.hindustantimes.com/entertainment/telugu-cinema'),
    ('Tamil Cinema', 'https://www.hindustantimes.com/entertainment/tamil-cinema'),
    ('TV', 'https://www.hindustantimes.com/entertainment/tv'),
    ('Entertainment Others', 'https://www.hindustantimes.com/entertainment/others'),
    ('Fashion', 'https://www.hindustantimes.com/lifestyle/fashion'),
    ('Health', 'https://www.hindustantimes.com/lifestyle/health'),
    ('Relationships', 'https://www.hindustantimes.com/lifestyle/relationships'),
    ('Art and Culture', 'https://www.hindustantimes.com/lifestyle/art-culture'),
    ('Travel', 'https://www.hindustantimes.com/lifestyle/travel'),
    ('Recipes', 'https://www.hindustantimes.com/lifestyle/recipe'),
    ('Festivals', 'https://www.hindustantimes.com/lifestyle/festivals'),
    ('Pet', 'https://www.hindustantimes.com/lifestyle/pet'),

]

base_url = 'https://www.hindustantimes.com/'

base_dir = create_directories(base_url, categories)

In [16]:
for category, category_url in tqdm(categories, desc='Downloading images for every category'):
    category_dir = os.path.join(base_dir, category)
    try:
        data = article_scrapper(category_url)
        download_images(category_url, category_dir, get_latest_articles(data))
    except:
        print(category)
        continue

Downloading images for every category: 100%|██████████| 26/26 [06:21<00:00, 14.67s/it]


In [21]:
def create_file(base_dir):
    for category in os.listdir(base_dir):
        try:
            df = pd.read_csv(f'{base_dir}{category}/labels.csv')
            n, _ = df.shape

            pairs = []

            for i in range(n):
                for j in range(i + 1, n):
                    article_1 = int(re.search(r'\d+(?=_|$)',list(df.iloc[i])[0]).group())
                    article_2 = int(re.search(r'\d+(?=_|$)',list(df.iloc[j])[0]).group())
                    if article_1 != article_2:
                        pairs.append((list(df.iloc[i]), list(df.iloc[j])))

            with open(f'{base_dir.split("/")[2]}_pairs_{category}.csv', 'w') as f:
                for i, pair in enumerate(pairs):
                    p1, p2 = pair
                    img1, alt1, headline1 = p1
                    img2, alt2, headline2 = p2
                    f.write(f'{i+1},{headline1},{headline2}\n')
                    f.write(f',https://raw.githubusercontent.com/ayainfida/news-scrapper/main/output/images/www.hindustantimes.com/{urllib.parse.quote(category)}/{img1},https://raw.githubusercontent.com/ayainfida/news-scrapper/main/output/images/www.hindustan.com/{urllib.parse.quote(category)}/{img2}\n')
                    f.write(f',{alt1},{alt2}\n')

        except:
            continue

In [22]:
create_file('output/images/www.hindustantimes.com/')

In [23]:
num_pairs = []

for file in os.listdir():
    if file.startswith('www.hindustantimes.com_pairs'):
        try:
            df = pd.read_csv(file)
            num_pairs.append((int(list(df.iloc[-3])[0]), file))
        except:
            continue

In [24]:
sorted_array = sorted(num_pairs, key=lambda x: x[0])
sorted_array

[(45, 'www.hindustantimes.com_pairs_Bollywood.csv'),
 (45, 'www.hindustantimes.com_pairs_Football.csv'),
 (45, 'www.hindustantimes.com_pairs_Lifestyle.csv'),
 (45, 'www.hindustantimes.com_pairs_World News.csv'),
 (45, 'www.hindustantimes.com_pairs_Technology.csv'),
 (45, 'www.hindustantimes.com_pairs_Horoscope.csv'),
 (45, 'www.hindustantimes.com_pairs_Other Sports.csv'),
 (45, 'www.hindustantimes.com_pairs_Tennis.csv'),
 (45, 'www.hindustantimes.com_pairs_Sports.csv'),
 (45, 'www.hindustantimes.com_pairs_Hockey.csv'),
 (45, 'www.hindustantimes.com_pairs_Music.csv'),
 (45, 'www.hindustantimes.com_pairs_Hollywood.csv'),
 (45, 'www.hindustantimes.com_pairs_Web Series.csv'),
 (45, 'www.hindustantimes.com_pairs_Telugu Cinema.csv'),
 (45, 'www.hindustantimes.com_pairs_Tamil Cinema.csv'),
 (45, 'www.hindustantimes.com_pairs_TV.csv'),
 (45, 'www.hindustantimes.com_pairs_Entertainment Others.csv'),
 (45, 'www.hindustantimes.com_pairs_Fashion.csv'),
 (45, 'www.hindustantimes.com_pairs_Health.cs

In [29]:
import random
random.shuffle(sorted_array)
print(sorted_array)

[(45, 'www.hindustantimes.com_pairs_Tamil Cinema.csv'), (45, 'www.hindustantimes.com_pairs_Telugu Cinema.csv'), (45, 'www.hindustantimes.com_pairs_Technology.csv'), (45, 'www.hindustantimes.com_pairs_Horoscope.csv'), (45, 'www.hindustantimes.com_pairs_Web Series.csv'), (45, 'www.hindustantimes.com_pairs_Festivals.csv'), (45, 'www.hindustantimes.com_pairs_Tennis.csv'), (45, 'www.hindustantimes.com_pairs_Bollywood.csv'), (45, 'www.hindustantimes.com_pairs_TV.csv'), (45, 'www.hindustantimes.com_pairs_Hockey.csv'), (45, 'www.hindustantimes.com_pairs_Travel.csv'), (45, 'www.hindustantimes.com_pairs_Fashion.csv'), (45, 'www.hindustantimes.com_pairs_Pet.csv'), (45, 'www.hindustantimes.com_pairs_World News.csv'), (45, 'www.hindustantimes.com_pairs_Lifestyle.csv'), (45, 'www.hindustantimes.com_pairs_Recipes.csv'), (45, 'www.hindustantimes.com_pairs_Relationships.csv'), (45, 'www.hindustantimes.com_pairs_Sports.csv'), (45, 'www.hindustantimes.com_pairs_Art and Culture.csv'), (45, 'www.hindustant

**In-Article**

In [2]:
url = 'https://www.hindustantimes.com/cricket/virat-kohlis-weakness-exposed-by-three-test-old-bangladesh-pacer-rohit-sharma-shubman-gill-fail-in-testing-conditions-101726721542883.html'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'lxml')

In [9]:
story_main_div = soup.find('div', id='storyMainDiv')
headline = story_main_div.find('h1').text
datetime_str = story_main_div.find('div', class_='dateTime secTime storyPage').text.strip()[:-4]
time = datetime.strptime(datetime_str, "%b %d, %Y %I:%M %p")

images = []

img = story_main_div.find('img')
img_alt = img.get('alt', '')  # Safely get the 'alt' attribute
img_src = img.get('src', '')  # Safely get the 'src' attribute
images.append({
    'img_url': img_src,
    'img_alt': img_alt
})

In [29]:
def in_article_scrapping(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'lxml')
    story_main_div = soup.find('div', id='storyMainDiv')
    headline = story_main_div.find('h1').text
    datetime_str = story_main_div.find('div', class_='dateTime secTime storyPage').text.strip()[:-4]
    time = datetime.strptime(datetime_str, "%b %d, %Y %I:%M %p")

    images = []
    sub_head = story_main_div.find("h2", class_="sortDec").text
    img = story_main_div.find('img')
    img_alt = img.get('alt', '')  # Safely get the 'alt' attribute
    img_src = img.get('src', '')  # Safely get the 'src' attribute
    images.append({
        'img_url': img_src,
        'img_alt': img_alt
    })
    paragraph = ''

    for para_tag in story_main_div.find_all('p'):
        paragraph += para_tag.text

    content = sub_head + ' ' + paragraph

    return url, headline, images, content

In [30]:
in_article_scrapping('https://www.hindustantimes.com/cricket/virat-kohlis-weakness-exposed-by-three-test-old-bangladesh-pacer-rohit-sharma-shubman-gill-fail-in-testing-conditions-101726721542883.html')

('https://www.hindustantimes.com/cricket/virat-kohlis-weakness-exposed-by-three-test-old-bangladesh-pacer-rohit-sharma-shubman-gill-fail-in-testing-conditions-101726721542883.html',
 "Virat Kohli's weakness exposed by three-Test old Bangladesh pacer; Rohit Sharma, Shubman Gill left clueless",
 [{'img_url': 'https://www.hindustantimes.com/ht-img/img/2024/09/19/550x309/MixCollage-19-Sep-2024-10-55-AM-3449_1726723614845_1726723621879.jpg',
   'img_alt': 'Virat Kohli (L) and Rohit Sharma after their dismissals in the 1st Test(PTI)'}],
 "Rohit Sharma, Shubman Gill, and Virat Kohli failed to make a mark in testing conditions during the first Test against Bangladesh. Team India’s return to Test cricket began on a rocky note as they faced a top-order collapse on the opening day of the first Test against Bangladesh in Chennai. After being asked to bat on an unusually pace-friendly surface at the MA Chidambaram Stadium, India struggled against the skilful bowling of 24-year-old Hasan Mahmud.The 

In [34]:
urls = [
    "https://www.hindustantimes.com/india-news/bengal-medical-council-cancels-registration-of-rg-kars-ex-principal-sandip-ghosh-101726741834112.html",
    "https://www.hindustantimes.com/cricket/old-lady-becomes-internet-sensation-after-cheering-every-ashwin-boundary-in-india-vs-bangladesh-1st-test-101726742724227.html",
    "https://www.hindustantimes.com/entertainment/bollywood/triptii-dimri-reveals-relatives-told-her-parents-no-one-is-going-to-marry-her-after-she-became-an-actor-101726741964715.html",
    "https://www.hindustantimes.com/entertainment/bollywood/emergency-release-date-court-gives-deadline-to-cbfc-for-deciding-kangana-ranaut-starrers-certification-101726734842304.html",
    "https://www.hindustantimes.com/cricket/india-vs-bangladesh-live-score-1st-test-match-day-1-ind-vs-ban-series-2024-latest-cricket-scorecard-updates-chennai-101726651671273.html",
    "https://www.hindustantimes.com/world-news/us-news/playing-politics-donald-trump-jd-vance-react-to-fed-s-big-interest-rate-cut-will-this-move-impact-us-election-101726738942922.html",
    "https://www.hindustantimes.com/india-news/pm-narendra-modi-lambasts-rahul-gandhis-devta-remark-naxal-mindset-imported-from-other-religions-101726742680436.html",
    "https://www.hindustantimes.com/india-news/hindu-outfit-stages-protest-in-chennai-seeks-ban-on-india-bangladesh-test-series-101726738247737.html",
    "https://www.hindustantimes.com/world-news/israel-arrests-citizen-over-iranian-plot-to-kill-pm-benjamin-netanyahu-other-top-officials-101726740411460.html",
    "https://www.hindustantimes.com/india-news/centre-reacts-to-ey-pune-employee-anna-sebastian-perayils-tragic-death-taken-up-the-complaint-101726732832865.html"
]

results_inarticle = []

for url in urls:
    results_inarticle.append(in_article_scrapping(url))
    print('done')

ConnectionError: HTTPSConnectionPool(host='www.hindustantimes.com', port=443): Max retries exceeded with url: /india-news/bengal-medical-council-cancels-registration-of-rg-kars-ex-principal-sandip-ghosh-101726741834112.html (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x1262f0a30>: Failed to establish a new connection: [Errno 61] Connection refused'))

NameError: name 'new_df' is not defined