In [10]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver	
import pandas as pd
from datetime import datetime

In [1]:
# What columns must the DataFrame have?
# Title, Article, Category, Link, Source, Author (optional), Date
# Title[string]: Short catchy gist of the article
# Article[string]: Main content
# Category[string]: the type of article
# Link[string]: Href of the article
# Source[string]: Name of the news site
# Author[string]: Name of the writer
# Date[Date]: Date of publishing

# What must I do to correct the previously collected data?
# The good news is that I already have most of the sources; if I can somehow recover the links, the date and author are easy to get.
# However, recovering an article's link is a mystery of its own: there is no direct path to it.
# I scraped whatever was on the front page at the time, which doesn't mean it will still be there later.
# I'll have to think about that and clean the data as much as possible before manually labelling it.

In [2]:
# Labels of the Data
# Political => 0: rightist; 0.5: centrist or neutral; 1: leftist
# framing(traditional) => 0: negative; 0.5: neutral; 1: positive
# sensationalism => 0: no exaggeration; 1: highly exaggerated
# opinionated (selection bias and stereotyping) => 0: little or no opinion, mostly facts; 1: highly opinionated

# There are 4 output classes, each with an independent degree of reliability!

In [3]:
def get_bbc_articles(timeout=30):
    """Scrape article pages linked from a fixed set of BBC section pages.

    Every section page is fetched and each ``<a href>`` starting with
    ``/news/articles/`` is queued exactly once; the first section that links
    to an article supplies its category. Each queued page is then fetched and
    parsed for heading, body text, authors and publication date.

    Args:
        timeout: Seconds passed to every ``requests.get`` call so a stalled
            connection raises instead of blocking the notebook forever.

    Returns:
        A 7-tuple of equal-length lists, in this order:
        ``(articles, categories, titles, links, sources, authors, dates)``.
        A missing heading becomes ``"Unknown Title"``, a missing date
        ``"Unknown Date"``; missing body text or authors become ``""``.
        Pages that parse without a ``<body>`` are skipped entirely.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts, ...);
        nothing is caught here.
    """
    base_url = "https://www.bbc.com"
    categories_to_scrape = [
        "https://www.bbc.com/news/world/asia",
        "https://www.bbc.com/business",
        "https://www.bbc.com/news",
        "https://www.bbc.com/innovation",
    ]

    # The queue holds [absolute_url, category] pairs, so a bare href can never
    # be "in" it; duplicates are tracked in a separate set of hrefs instead.
    articles_links = []
    seen_hrefs = set()
    for category in categories_to_scrape:
        cat_res = requests.get(category, timeout=timeout)
        cat_soup = BeautifulSoup(cat_res.content, "html.parser")
        # Section path without the "https://www.bbc.com/" prefix,
        # e.g. "news/world/asia".
        category_label = category[len(base_url) + 1:]

        for link in cat_soup.find_all("a", href=True):
            href = link["href"]
            if href.startswith("/news/articles/") and href not in seen_hrefs:
                seen_hrefs.add(href)
                articles_links.append([base_url + href, category_label])

    articles = []
    categories = []
    titles = []
    links = []
    sources = []
    authors = []
    dates = []

    for article_url, category_label in articles_links:
        response = requests.get(url=article_url, timeout=timeout)
        soup = BeautifulSoup(response.content, "html.parser")
        body = soup.body
        if body is None:
            # Nothing parseable came back; skip instead of crashing on None.
            continue

        # Extract heading
        heading_tag = body.find("h1", class_="sc-518485e5-0 itISwu")
        heading = (
            heading_tag.get_text(strip=True)
            if heading_tag is not None
            else "Unknown Title"
        )

        # Extract article text; a page without <article> yields empty text
        # rather than an AttributeError.
        article_tag = body.find("article")
        paragraphs = (
            article_tag.find_all("p", class_="sc-eb7bd5f6-0 fezwLZ")
            if article_tag is not None
            else []
        )
        text = "".join(para.get_text() + " " for para in paragraphs)

        # Extract author(s) (optional); several bylines are comma-joined.
        author_tags = body.find_all("span", class_="sc-b42e7a8f-7 kItaYD")
        author_names = ', '.join(tag.get_text() for tag in author_tags)

        # Extract date; a <time> tag without a datetime attribute also falls
        # back to the sentinel instead of storing None.
        date_tag = body.find("time")
        date = (
            date_tag.get("datetime") if date_tag is not None else None
        ) or "Unknown Date"

        # Store extracted data
        articles.append(text)
        categories.append(category_label)
        titles.append(heading)
        authors.append(author_names)
        links.append(article_url)
        sources.append("BBC News")
        dates.append(date)

    return articles, categories, titles, links, sources, authors, dates



In [4]:
def get_mint_articles(timeout=30):
    """Scrape article pages linked from two Mint (livemint.com) section pages.

    Every section page is fetched and each ``<a href>`` starting with
    ``/news/`` is queued exactly once; the first section that links to a page
    supplies its category. Each queued page is then fetched and parsed for
    title, body text, author and date.

    Args:
        timeout: Seconds passed to every ``requests.get`` call so a stalled
            connection raises instead of blocking the notebook forever.

    Returns:
        A 7-tuple of equal-length lists, in this order:
        ``(articles, cats, titles, links, sources, authors, dates)``.
        Missing fields fall back to ``"Unknown Title"``, ``"Unknown"``
        (author), ``"Unknown Date"`` or ``""`` (text). Pages that parse
        without a ``<body>`` are skipped.

    Raises:
        Any exception raised while fetching or parsing an article page is
        printed together with the offending ``[url, category]`` pair and then
        re-raised. Failures while fetching section pages propagate directly.
    """
    base_url = "https://www.livemint.com"
    categories_to_scrape = [
        "https://www.livemint.com/news/india",
        "https://www.livemint.com/news/world",
    ]

    # The queue holds [absolute_url, category] pairs, so a bare href can never
    # be "in" it; duplicates are tracked in a separate set of hrefs instead.
    article_links = []
    seen_hrefs = set()
    for category in categories_to_scrape:
        cat_res = requests.get(category, timeout=timeout)
        cat_soup = BeautifulSoup(cat_res.content, "html.parser")
        # Section path without the "https://www.livemint.com/" prefix,
        # e.g. "news/india".
        category_label = category[len(base_url) + 1:]
        for link in cat_soup.find_all("a", href=True):
            href = link["href"]
            if href.startswith("/news/") and href not in seen_hrefs:
                seen_hrefs.add(href)
                article_links.append([base_url + href, category_label])

    articles = []
    cats = []
    titles = []
    links = []
    sources = []
    authors = []
    dates = []

    try:
        for url in article_links:
            article_url, category_label = url
            response = requests.get(url=article_url, timeout=timeout)
            soup = BeautifulSoup(response.content, "html.parser")
            body = soup.body
            if body is None:
                # Nothing parseable came back; skip this page.
                continue

            # Extract title
            heading = body.find('h1', id="article-0")
            title_text = (
                heading.get_text(strip=True)
                if heading is not None
                else "Unknown Title"
            )

            # Extract article text (empty when either container is missing)
            text = ""
            story = body.find("div", class_="storyPage_storyContent__m_MYl")
            paragraph_section = (
                story.find("div", class_="storyParagraph")
                if story is not None
                else None
            )
            if paragraph_section is not None:
                text = "".join(
                    para.get_text() for para in paragraph_section.find_all("p")
                )

            # Author and date both live inside the same byline container.
            author_block = body.find("div", class_="storyPage_authorDesc__zPjwo")
            author_link = author_block.find('a') if author_block is not None else None
            # A byline without a <span> must not crash the date lookup.
            date_span = author_block.find('span') if author_block is not None else None

            author_text = (
                author_link.get_text(strip=True)
                if author_link is not None
                else "Unknown"
            )
            date_text = (
                date_span.get_text(strip=True)
                if date_span is not None
                else "Unknown Date"
            )

            # Append extracted data
            articles.append(text)
            cats.append(category_label)
            titles.append(title_text)
            links.append(article_url)
            sources.append("Mint")
            authors.append(author_text)
            dates.append(date_text)

    except Exception as e:
        # Surface which page failed before propagating the error.
        print(e)
        print(url)
        raise

    return articles, cats, titles, links, sources, authors, dates


In [5]:
def get_ie_articles(timeout=30):
    """Scrape article pages linked from two Indian Express section pages.

    Every section page is fetched and each ``<a href>`` containing
    ``/article/`` is queued exactly once; the first section that links to an
    article supplies its category. The href is requested as-is, so it is
    expected to be an absolute URL. Each queued page is then fetched and
    parsed for title, body text, author and date.

    Args:
        timeout: Seconds passed to every ``requests.get`` call so a stalled
            connection raises instead of blocking the notebook forever.

    Returns:
        A 7-tuple of equal-length lists, in this order:
        ``(articles, cats, titles, links, sources, authors, dates)``.
        Missing fields fall back to ``"Unknown Title"``, ``"Unknown"``
        (author), ``"Unknown Date"`` or ``""`` (text). Pages that parse
        without a ``<body>`` are skipped.

    Raises:
        Any exception raised while fetching or parsing an article page is
        printed together with the offending ``[url, category]`` pair and then
        re-raised. Failures while fetching section pages propagate directly.
    """
    categories_to_scrape = [
        "https://indianexpress.com/section/india/",
        "https://indianexpress.com/section/world/",
    ]

    # The queue holds [url, category] pairs, so a bare href can never be "in"
    # it; duplicates are tracked in a separate set of hrefs instead.
    article_links = []
    seen_hrefs = set()
    for category in categories_to_scrape:
        cat_res = requests.get(category, timeout=timeout)
        cat_soup = BeautifulSoup(cat_res.content, "html.parser")
        # Section path without the site prefix, e.g. "section/india/".
        category_label = category[len("https://indianexpress.com/"):]
        for link in cat_soup.find_all("a", href=True):
            href = link["href"]
            if "/article/" in href and href not in seen_hrefs:
                seen_hrefs.add(href)
                article_links.append([href, category_label])

    articles = []
    cats = []
    titles = []
    links = []
    sources = []
    authors = []
    dates = []

    try:
        for url in article_links:
            article_url, category_label = url
            response = requests.get(url=article_url, timeout=timeout)
            soup = BeautifulSoup(response.content, "html.parser")
            body = soup.body
            if body is None:
                # Nothing parseable came back; skip this page.
                continue

            # Extract title
            heading = body.find('h1', id="main-heading-article")
            title_text = (
                heading.get_text(strip=True)
                if heading is not None
                else "Unknown Title"
            )

            # Extract article text (empty when the container is missing)
            story = body.find("div", class_="story_details")
            text = (
                "".join(para.get_text() for para in story.find_all("p"))
                if story is not None
                else ""
            )

            # Extract author
            byline = body.find("div", id="storycenterbyline")
            author_link = byline.find('a') if byline is not None else None
            author_text = (
                author_link.get_text(strip=True)
                if author_link is not None
                else "Unknown"
            )

            # Extract date. Both a missing tag and a missing/empty "content"
            # attribute fall back to the sentinel instead of raising.
            date_tag = body.find("span", itemprop="dateModified")
            date_text = (
                date_tag.get('content') if date_tag is not None else None
            ) or "Unknown Date"

            # Append extracted data
            articles.append(text)
            cats.append(category_label)
            titles.append(title_text)
            links.append(article_url)
            sources.append("Indian Express")
            authors.append(author_text)
            dates.append(date_text)

    except Exception as e:
        # Surface which page failed before propagating the error.
        print(e)
        print(url)
        raise

    return articles, cats, titles, links, sources, authors, dates


In [6]:
def get_it_articles(timeout=30):
    """Scrape article pages linked from two India Today section pages.

    Every section page is fetched and each ``<a href>`` that starts with
    ``/world/us-news/``, ``/world/uk-news/`` or ``/india/`` and does not
    contain ``video`` is queued exactly once; the first section that links to
    a page supplies its category. Each queued page is then fetched and parsed
    for title, body text, author and date.

    Args:
        timeout: Seconds passed to every ``requests.get`` call so a stalled
            connection raises instead of blocking the notebook forever.

    Returns:
        A 7-tuple of equal-length lists, in this order:
        ``(articles, cats, titles, links, sources, authors, dates)``.
        Missing fields fall back to ``"Unknown Title"``, ``"Unknown"``
        (author and date) or ``""`` (text). Pages that parse without a
        ``<body>`` are skipped.

    Raises:
        Any exception raised while fetching or parsing an article page is
        printed together with the offending ``[url, category]`` pair and then
        re-raised. Failures while fetching section pages propagate directly.
    """
    base_url = "https://www.indiatoday.in"
    wanted_prefixes = ("/world/us-news/", "/world/uk-news/", "/india/")
    categories_to_scrape = [
        "https://www.indiatoday.in/world/",
        "https://www.indiatoday.in/india/",
    ]

    # The queue holds [absolute_url, category] pairs, so a bare href can never
    # be "in" it; duplicates are tracked in a separate set of hrefs instead.
    article_links = []
    seen_hrefs = set()
    for category in categories_to_scrape:
        cat_res = requests.get(category, timeout=timeout)
        cat_soup = BeautifulSoup(cat_res.content, "html.parser")
        # The label deliberately keeps its leading slash ("/world/",
        # "/india/"), matching the values this scraper has always produced.
        category_label = category[len(base_url):]
        for link in cat_soup.find_all("a", href=True):
            href = link["href"]
            if (
                href.startswith(wanted_prefixes)
                and 'video' not in href
                and href not in seen_hrefs
            ):
                seen_hrefs.add(href)
                article_links.append([base_url + href, category_label])

    articles = []
    cats = []
    titles = []
    links = []
    sources = []
    authors = []
    dates = []

    try:
        for url in article_links:
            article_url, category_label = url
            response = requests.get(url=article_url, timeout=timeout)
            soup = BeautifulSoup(response.content, "html.parser")
            body = soup.body
            if body is None:
                # Nothing parseable came back; skip this page.
                continue

            # Extract title
            heading = body.find(
                "h1", class_="jsx-ace90f4eca22afc7 Story_strytitle__MYXmR"
            )
            title_text = (
                heading.get_text(strip=True)
                if heading is not None
                else "Unknown Title"
            )

            # Extract article text (empty when the container is missing)
            content_div = body.find(
                "div",
                class_="jsx-ace90f4eca22afc7 Story_description__fq_4S description paywall",
            )
            text = (
                "".join(para.get_text() for para in content_div.find_all("p"))
                if content_div is not None
                else ""
            )

            # Extract author ("authdetaisl" is the class name as queried on
            # the site, not a typo to correct here).
            author_tag = body.find("div", class_="authors__by")
            author_detail = (
                author_tag.find("div", class_="authdetaisl")
                if author_tag is not None
                else None
            )
            author_text = (
                author_detail.get_text(strip=True)
                if author_detail is not None
                else "Unknown"
            )

            # Extract date. NOTE(review): the fallback here is "Unknown",
            # while the other scrapers in this notebook use "Unknown Date";
            # kept as-is so already-collected data stays comparable.
            date_tag = body.find("div", class_="published__on")
            date_detail = (
                date_tag.find('div', class_='authdetaisl')
                if date_tag is not None
                else None
            )
            date_text = (
                date_detail.get_text(strip=True)
                if date_detail is not None
                else "Unknown"
            )

            # Append extracted data
            articles.append(text)
            cats.append(category_label)
            titles.append(title_text)
            links.append(article_url)
            sources.append("India Today")
            authors.append(author_text)
            dates.append(date_text)

    except Exception as e:
        # Surface which page failed before propagating the error.
        print(e)
        print(url)
        raise

    return articles, cats, titles, links, sources, authors, dates

In [7]:
def make_data_object():
    """Run all four site scrapers and merge their results into one DataFrame.

    The scrapers are called in the order BBC, Mint, Indian Express, India
    Today. Each returns seven parallel lists
    ``(articles, categories, titles, links, sources, authors, dates)``; the
    lists are concatenated field by field in that same site order.

    Returns:
        A DataFrame with columns ``title, article, category, links, sources,
        authors, dates`` from which fully identical rows have been dropped.
    """
    scrapers = (
        get_bbc_articles,
        get_mint_articles,
        get_ie_articles,
        get_it_articles,
    )
    per_site = [scrape() for scrape in scrapers]

    # zip(*per_site) regroups the per-site tuples into one group per field;
    # each group is then flattened into a single list.
    articles, category, title, links, sources, authors, dates = (
        [value for site_values in field for value in site_values]
        for field in zip(*per_site)
    )

    frame = pd.DataFrame(
        {
            "title": title,
            "article": articles,
            "category": category,
            'links': links,
            'sources': sources,
            'authors': authors,
            'dates': dates,
        }
    )
    return frame.drop_duplicates()


In [8]:
def make_data_csv(data):
    """Write ``data`` to a CSV under ``../data/`` named after the current time.

    The file name embeds the local time formatted as ``YYYY-MM-DD_HH-MM``, and
    the index is not written.

    Args:
        data: Object exposing ``to_csv(path, index=...)``.
    """
    timestamp = format(datetime.today(), '%Y-%m-%d_%H-%M')
    destination = "../data/all_artice_" + timestamp + ".csv"
    data.to_csv(destination, index=False)

In [9]:
def make_df(file_path):
    """Read the CSV at ``file_path`` with pandas' default settings.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        The parsed DataFrame.
    """
    frame = pd.read_csv(file_path)
    return frame

In [11]:
# Scrape every source, keep the first row for each distinct title, and
# renumber the rows from 0 without keeping the old index as a column.
latest_data = (
    make_data_object()
    .drop_duplicates(subset=['title'], keep='first')
    .reset_index(drop=True)
)

In [17]:
# Non-null count per column of the scraped frame; displayed as the cell output.
latest_data.count()

title       206
article     206
category    206
links       206
sources     206
authors     206
dates       206
dtype: int64

In [18]:
# Load the CSV into lab_art, discarding every row that has a missing value.
lab_art = pd.read_csv("../data/Usecase/all_labelle-articles.csv").dropna()

In [24]:
# Load the second CSV as-is (rows with missing values are kept here).
unlab_art = pd.read_csv("../data/Usecase/all_unlabelled_articles.csv")
# Non-null count per column; displayed as the cell output.
unlab_art.count()

title             656
article           656
category          656
links             656
sources           656
authors           652
dates             656
political         482
framing           482
sensationalism    482
opinionated       482
dtype: int64

In [30]:
# Stack the three frames with a fresh 0..n-1 index. The order matters for the
# title de-duplication that follows (keep='first'): rows from lab_art win over
# unlab_art, which win over the freshly scraped latest_data.
data = pd.concat([lab_art, unlab_art, latest_data], ignore_index=True)


In [31]:
# Keep the first row for each distinct title and renumber the rows from 0
# without keeping the old index as a column.
data = (
    data
    .drop_duplicates(subset=['title'], keep='first')
    .reset_index(drop=True)
)

In [32]:
# Non-null count per column of the merged frame; displayed as the cell output.
data.count()

title             836
article           836
category          836
links             836
sources           836
authors           832
dates             836
political         482
framing           482
sensationalism    482
opinionated       482
dtype: int64

In [33]:
# Persist the merged frame to a timestamped CSV under ../data/.
make_data_csv(data)