In [1]:
import requests     #request to the website
from bs4 import BeautifulSoup     #bs4 is used for scrapping, beautifulsoup is to convert html page into understandable format
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os          #give info about the operating system

In [2]:
# Inshorts category pages to scrape -- one URL per news category.
seed_urls = [
    'https://inshorts.com/en/read/technology',
    'https://inshorts.com/en/read/sports',
    'https://inshorts.com/en/read/world',
]

In [3]:
def build_dataset(seed_urls, timeout=10):
    """Scrape news cards from the given category pages into a DataFrame.

    Parameters
    ----------
    seed_urls : iterable of str
        Category page URLs. The last non-empty path segment of each URL
        (e.g. ``technology``) is used as the ``news_category`` label.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` for every page, so a
        stalled connection cannot hang the notebook forever. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        One row per ``div.news-card`` element found, with the columns
        ``news_headline``, ``news_article`` and ``news_category`` (in that
        order). When nothing was scraped the frame is empty but still has
        these three columns. A headline/article whose element is missing
        from a card is stored as ``None``.

    Raises
    ------
    requests.HTTPError
        If a page answers with an HTTP error status.
    requests.RequestException
        For other request failures (connection errors, timeouts, ...).
    """
    columns = ['news_headline', 'news_article', 'news_category']

    def text_of(tag):
        # find() returns None when the element is absent; get_text() also
        # copes with nested markup (where .string would be None) and returns
        # a plain str instead of a NavigableString tied to the parse tree.
        return tag.get_text() if tag is not None else None

    news_data = []
    for url in seed_urls:
        # rstrip('/') keeps a trailing slash from producing an empty category.
        news_category = url.rstrip('/').split('/')[-1]

        response = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx rather than parsing an error page as "no cards".
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        for card in soup.find_all('div', class_="news-card"):
            news_data.append({
                'news_headline': text_of(card.find('span', attrs={"itemprop": "headline"})),
                'news_article': text_of(card.find('div', attrs={"itemprop": "articleBody"})),
                'news_category': news_category,
            })

    # Passing `columns` fixes the column order and keeps the schema even when
    # news_data is empty (selecting columns afterwards would raise KeyError).
    return pd.DataFrame(news_data, columns=columns)

OPERATION STEPS OF THE ABOVE CELL
1. A for loop iterates over all the seed URLs.
2. For each URL (one per news category), all the news cards on the page are collected.
3. From each card, the news headline and the news article text are scraped; the news category is taken from the last segment of the URL.

In [4]:
# Scrape every seed URL into one DataFrame, then preview its first 15 rows.
news_df = build_dataset(seed_urls=seed_urls)
news_df.head(n=15)

Unnamed: 0,news_headline,news_article,news_category
0,Airtel promises to offer free outgoing calls t...,"Interconnect Usage Charge of 6p/minute, propos...",technology
1,"My WhatsApp was hacked, friend saw man masturb...",Actor Sanchay Goswami has claimed his WhatsApp...,technology
2,"Google India hires former Star, Disney India M...",Google India has appointed former Star and Dis...,technology
3,US charges 2 ex-Twitter employees with spying ...,The US Justice Department on Wednesday charged...,technology
4,Stock market app Robinhood's glitch lets man t...,A glitch in US-based stock trading app Robinho...,technology
5,China bans online gaming from 10 pm to 8 am fo...,China has banned online gaming from 10 pm to 8...,technology
6,"Woman loses ₹85,000 on calling restaurant's fa...","A 34-year-old woman was cheated of ₹85,000 aft...",technology
7,Glitch sends past unsent text messages from no...,A glitch sent unsent messages from or around V...,technology
8,Why are thousands of Indian Twitter users movi...,"Thousands of Indian Twitter users, including j...",technology
9,Google parent investigates how sex abuse compl...,Google's parent firm Alphabet's board is inves...,technology


In [5]:
# Number of scraped articles per news category.
news_df['news_category'].value_counts()

sports        25
technology    24
world         24
Name: news_category, dtype: int64