# Zee News Website Scraping


This notebook uses Beautiful Soup to scrape news articles from the Zee News website (20 at a time) and stores them in a CSV file.
The CSV file serves as a reference dataset for checking whether a posted piece of news is fake or real.

## Importing Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import datetime
import csv
import  pandas as pd
import glob
import os
from textblob import TextBlob
import nltk
nltk.download('brown')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from pytesseract import *
#Paste your path to tesseract.exe below
pytesseract.tesseract_cmd= r'C:\Users\Sakshi\AppData\Local\Tesseract-OCR\tesseract.exe'
from PIL import Image
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
import io

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Sakshi\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sakshi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Sakshi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Initializing Variables

In [2]:
# First page of the Zee News archive listing; web_scraper starts paginating from here.
archive_page_url = 'https://zeenews.india.com/archives/index.html'

# URL of a single article page (not referenced by the other cells shown in this notebook).
initial_news_url = (
    'https://zeenews.india.com/regional/'
    'prithviraj-sukumarans-viral-pic-with-the-birthday-boy-and-'
    'best-burger-chef-dulquer-salmaan-2298855.html'
)

In [3]:
# Module-level accumulators; each name is bound to its own, initially empty list.
# news_url_list is filled in place by web_scraper.
news_list, news_url_list, page_url = [], [], []

## Creating a Folder

In [4]:
# Create a 'Data' folder in the current working directory unless something
# with that name is already there.
if not os.path.exists('Data'):
    os.mkdir('Data')
else:
    print('Directory Exists')

Directory Exists


## BS4 Page Fetcher

In [5]:
def getandparseurl(url):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` when the request fails
            (``HTTPError`` is a subclass).
    """
    from urllib.request import Request, urlopen

    # A browser-like User-Agent is sent instead of urllib's default one
    # (presumably because the site rejects the default agent -- TODO confirm).
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

    # Read the body inside a context manager so the HTTP response (and its
    # socket) is closed deterministically instead of being left to the GC.
    with urlopen(req) as response:
        webpage = response.read()

    return BeautifulSoup(webpage, 'html.parser')

## Main News Text Extractor

In [6]:
def getmainnews(main_url):
    """Return the body text of the article at *main_url* as a single line.

    The page is fetched once through ``getandparseurl``. The previous
    implementation also downloaded it a second time with ``requests.get`` only
    to compute a status-code string that was never used; that redundant
    request has been removed.

    Args:
        main_url: URL of the article page.

    Returns:
        The text of the first ``<div class="article">`` element, with every
        newline replaced by a space.

    Raises:
        AttributeError: if the page contains no ``<div class="article">``
            (``find`` returns ``None`` in that case).
    """
    soup = getandparseurl(main_url)
    article = soup.find('div', class_="article")
    return article.text.replace('\n', ' ')

## Extracts URLs of all news on a given page from latest news

In [7]:
def getmainnewsfromlatest(current_page_url):
    """Return the news article URLs linked from one archive listing page.

    The anchors are taken from the ``view-content`` div nested inside the
    ``content`` div of the page.

    The previous implementation tried to drop off-site links with
    ``"http://zeenews.india.com" in tag``. On a BeautifulSoup tag that tests the
    tag's children, not its ``href``, and the list was modified with
    ``remove()`` while being iterated, so the outcome was "every other anchor"
    regardless of where it pointed. The filter is now applied to the ``href``
    value itself, and each URL is returned only once (a listing may link the
    same article from several anchors).

    Args:
        current_page_url: URL of the archive listing page to scan.

    Returns:
        A list of ``href`` strings containing the Zee News site prefix, in
        page order and without duplicates.

    Raises:
        AttributeError: if the expected ``content`` / ``view-content`` divs are
            missing from the page.
    """
    soup = getandparseurl(current_page_url)
    listing = soup.find("div", class_="content").find("div", class_="view-content")

    # Accept both schemes: the archive entry URL uses https while links
    # observed in scraped output use http.
    site_prefixes = ("http://zeenews.india.com", "https://zeenews.india.com")

    all_news_urls = []
    seen = set()
    for anchor in listing.find_all("a"):
        href = anchor.get('href')
        # Skip anchors without a target and links that leave the site.
        if not href or not any(prefix in href for prefix in site_prefixes):
            continue
        # Keep the first occurrence only, preserving page order.
        if href in seen:
            continue
        seen.add(href)
        all_news_urls.append(href)

    return all_news_urls

## Fetches next news archive page from a given page URL

In [8]:
def getnextpageurl(current_page_url):
    """Return the absolute URL of the archive page that follows *current_page_url*.

    The link is read from the ``<li class="next last">`` entry of the
    ``text-center`` div inside the ``content`` div, and the site root
    ``http://zeenews.india.com`` is prepended to its ``href``.

    Raises:
        AttributeError: if any of the expected elements is missing
            (``find`` returns ``None``).
    """
    page = getandparseurl(current_page_url)
    pager = page.find("div", class_="content").find("div", class_="text-center")
    next_item = pager.find("li", class_="next last")
    return "http://zeenews.india.com" + str(next_item.a.get('href'))

## Extracts and returns date of a given news URL in string format

In [9]:
def getnewsdate(current_page_url):
    """Return the publication date of an article as a ``dd-mm-yyyy`` string.

    The date is looked up in the ``write-block margin-bt20px`` divs of the
    article sidebar: the first div whose markup contains both a month
    abbreviation and ``IST`` and whose text holds a ``<Mon> <day>, <year>``
    date is used.

    The previous implementation cut the date out of the div's markup with the
    fixed slice ``[39:51]``, which only works for a two-digit day and an exact
    markup prefix, and it failed with an unbound ``mm`` when nothing matched.
    The date is now located with a regular expression, the day is zero-padded
    like the month, and a missing date raises ``ValueError``.

    Args:
        current_page_url: URL of the article page.

    Returns:
        The date formatted as ``dd-mm-yyyy`` (day and month zero-padded).

    Raises:
        ValueError: if no date can be found in the sidebar.
        AttributeError: if the sidebar container itself is missing.
    """
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    soup = getandparseurl(current_page_url)
    sidebar = soup.find("div", class_="article-left-col sidebar flat").aside
    blocks = sidebar.find_all("div", class_="write-block margin-bt20px")

    # "<Mon[th]> <d or dd>[,] <yyyy>", month matched case-insensitively.
    date_pattern = re.compile(
        r'\b(' + '|'.join(months) + r')[a-z]*\.?\s+(\d{1,2}),?\s+(\d{4})',
        re.IGNORECASE,
    )

    for div in blocks:
        markup = str(div)
        # Same pre-selection as before: the block must mention a month and IST.
        if 'IST' not in markup or not any(month in markup for month in months):
            continue
        # Search the visible text only, so attribute values cannot match.
        match = date_pattern.search(div.get_text())
        if match is None:
            continue
        month_name, day, year = match.groups()
        month_number = months.index(month_name[:3].capitalize()) + 1
        return "{0:02d}-{1:02d}-{2}".format(int(day), month_number, year)

    raise ValueError("No publication date found on page: " + str(current_page_url))

## News Title Extractor

In [10]:
def getnewstitle(current_page_url):
    """Return the headline of the article at *current_page_url*.

    The headline is the text of the ``<h1>`` inside the
    ``article-head-block margin-bt20px`` div.

    Raises:
        AttributeError: if that div or its ``<h1>`` is missing.
    """
    page = getandparseurl(current_page_url)
    head_block = page.find("div", class_="article-head-block margin-bt20px")
    return head_block.h1.text

## Extracting Keywords from Data

In [11]:
def clean_text(keyword):
    """Return the words of *keyword* that NLTK tags as ``NNP`` or ``NNS``.

    Every character that is not an ASCII letter is replaced by a space before
    the text is split into sentences, tokenized and part-of-speech tagged.
    Each tagged sentence and each individual tag is printed while scanning.

    Args:
        keyword: Free text to extract words from.

    Returns:
        A list of the matching words, in order of appearance.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', keyword)

    # Tag everything up front, then scan the tagged sentences.
    sentences = nltk.tokenize.sent_tokenize(letters_only)
    token_lists = [nltk.word_tokenize(text) for text in sentences]
    tagged_sentences = [nltk.pos_tag(tokens) for tokens in token_lists]

    wanted_tags = ('NNP', 'NNS')
    selected = []
    for tagged in tagged_sentences:
        print(tagged)
        for word, tag in tagged:
            print(tag)
            if tag in wanted_tags:
                selected.append(word)
    return selected


## Similarity check function

In [12]:
def news_processing(sentence):
    r"""Find the scraped sentence that is most similar to *sentence*.

    Today's dataset is read from ``C:\Data\<dd-mm-yyyy>\data\<dd-mm-yyyy>.csv``
    (cp1252-encoded, with ``NEWS`` and ``NEWS URL`` columns). Every article is
    split into sentences; the input and each sentence are reduced to their
    ASCII-letter words, and each pair is scored with the cosine similarity of
    a TF-IDF model fitted on just that pair.

    Changes from the previous implementation: the clock is read once, so the
    folder and file names cannot disagree around midnight; the single-element
    ``super_*`` lists, which had no effect on the result, are gone; rows whose
    ``NEWS`` cell is not a string are skipped; a pair with no usable token
    scores 0.0; and an empty dataset raises a descriptive ``ValueError``.

    Args:
        sentence: The news text / claim to look up.

    Returns:
        The 7-tuple ``('Input News: ', sentence, 'Similar News', best_sentence,
        'Similarity Score: ', best_score, best_url)``.

    Raises:
        FileNotFoundError: if today's CSV does not exist.
        ValueError: if the CSV contains no sentence to compare against.
    """
    # Read the clock once so the directory and file names always agree.
    today = datetime.datetime.now().strftime('%d-%m-%Y')
    csv_path = os.path.join(r'C:\Data', today, 'data', today + '.csv')
    data = pd.read_csv(csv_path, encoding='cp1252')

    # Keep only runs of ASCII letters, separated by single spaces.
    focus_sentence = " ".join(re.findall("[a-zA-Z]+", sentence))

    scores = []
    sent_list = []
    urls = []
    for row, url in zip(data["NEWS"], data['NEWS URL']):
        if not isinstance(row, str):
            # An empty NEWS cell is read back by pandas as NaN (a float).
            continue
        for candidate in sent_tokenize(row):
            candidate = " ".join(re.findall("[a-zA-Z]+", candidate))
            try:
                # The model is fitted per pair on purpose: fitting it on the
                # whole corpus would change the IDF weights and the scores.
                tfidf = TfidfVectorizer().fit_transform([focus_sentence, candidate])
            except ValueError:
                # Raised when neither text yields a token (empty vocabulary).
                score = 0.0
            else:
                score = float(cosine_similarity(tfidf)[0][1])
            scores.append(score)
            sent_list.append(candidate)
            urls.append(url)

    if not scores:
        raise ValueError("No sentences available for comparison in " + csv_path)

    print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
    best_score = max(scores)
    # index() returns the first occurrence, i.e. the earliest best match.
    best = scores.index(best_score)
    final_sentence = sent_list[best]
    print(final_sentence, best_score)

    return ('Input News: ', sentence, 'Similar News', final_sentence, 'Similarity Score: ', best_score, urls[best])

## Main function that calls all task specific functions in order

In [13]:
def web_scraper(scrape_news_count):
    r"""Scrape the latest news articles and export them to CSV and Excel.

    Archive pages are walked from ``archive_page_url`` until at least
    *scrape_news_count* article URLs are collected or the hard-coded last
    page (``?page=19``) is reached. Title, date and body are then fetched for
    each URL, and the table is written to
    ``C:\Data\<dd-mm-yyyy>\data\<dd-mm-yyyy>.csv`` and ``.xlsx``.

    Fixes over the previous implementation:
      * the module-level ``news_url_list`` is emptied first, so a repeated
        call no longer mixes in (or stops early because of) URLs left over
        from an earlier call;
      * directories are built with ``os.path.join`` / ``os.makedirs``, so the
        invalid ``'\d'`` escape is gone and a missing ``C:\Data`` root is
        created as well;
      * the CSV is opened with ``newline=''`` (as the ``csv`` module requires,
        otherwise Windows gets a blank line after every row) and with an
        explicit cp1252 encoding that matches how the file is read back;
      * the redundant ``file.close()`` after the ``with`` block is removed;
      * the date is computed from a single clock reading.

    Args:
        scrape_news_count: Maximum number of articles to scrape.

    Returns:
        A list of rows; the first row is the header
        ``['NEWS URL', 'NEWS TITLE', 'NEWS DATE', 'NEWS']`` and each following
        row holds those four values for one article.

    Side effects:
        Replaces the contents of ``news_url_list``, prints progress messages,
        creates the output directories and files, and changes the process
        working directory to the output directory.
    """
    last_page_url = "http://zeenews.india.com/archives/index.html?page=19"

    # Mutate the shared list in place (other cells may hold a reference to it)
    # but start every run from an empty state.
    news_url_list.clear()

    current_page_url = archive_page_url
    while current_page_url != last_page_url and len(news_url_list) < scrape_news_count:
        news_url_list.extend(getmainnewsfromlatest(current_page_url))
        current_page_url = getnextpageurl(current_page_url)
    # The loop stops before scanning the last page; pick it up if still short.
    if current_page_url == last_page_url and len(news_url_list) < scrape_news_count:
        news_url_list.extend(getmainnewsfromlatest(current_page_url))

    # A page can overshoot the requested count; trim the surplus.
    del news_url_list[scrape_news_count:]

    keyword_filtered_list = [['NEWS URL', 'NEWS TITLE', 'NEWS DATE', 'NEWS']]

    print(len(news_url_list), "News Extracted in Total.\n")

    for url in news_url_list:
        keyword_filtered_list.append(
            [url, getnewstitle(url), getnewsdate(url), getmainnews(url)]
        )

    print('\n')

    # ---------------- Making folders and exporting data to csv and excel ----------------

    # One clock reading, so folder and file names always agree.
    path_sentence = datetime.datetime.now().strftime('%d-%m-%Y')
    path = os.path.join(r'C:\Data', path_sentence)
    print(path)
    data = os.path.join(path, 'data')

    for directory in (path, data):
        if os.path.exists(directory):
            print('Directory Exists')
        else:
            # makedirs also creates a missing parent such as C:\Data.
            os.makedirs(directory)

    # Kept for backward compatibility: earlier versions left the process in
    # the output directory, and later cells may rely on that.
    os.chdir(data)

    csv_name = os.path.join(data, path_sentence + '.csv')
    # errors='ignore' silently drops characters that cp1252 cannot represent.
    with open(csv_name, 'w', newline='', encoding='cp1252', errors='ignore') as file:
        csv.writer(file).writerows(keyword_filtered_list)

    # Re-read the CSV so the Excel export contains exactly what was stored.
    read_csv = pd.read_csv(csv_name, encoding='cp1252')
    xlsx_name = os.path.join(data, path_sentence + '.xlsx')
    read_csv.to_excel(xlsx_name, index=False, header=True)

    return keyword_filtered_list

## Testing the Code

Set `scrape_news_count` in the cell below to the number of latest news articles you want to scrape from the Zee News website (at most 600), then run the cell.

In [14]:
# Number of latest news articles to scrape in this run.
scrape_news_count = 20

if scrape_news_count <= 600:
    # Scrape, then echo every row (header included) followed by a blank line.
    keyword_filtered_list = web_scraper(scrape_news_count)
    print('\n')
    for row in keyword_filtered_list:
        print(row, "\n")
    print('\n')
    j = 1
else:
    print('ERROR: Total number of latest news in the database are 600. Please enter a number less than or equal to 600')

20 News Extracted in Total.



C:\Data\17-10-2020


['NEWS URL', 'NEWS TITLE', 'NEWS DATE', 'NEWS'] 

['http://zeenews.india.com/india/indian-railways-reviews-security-crowd-management-enforcement-of-covid-19-protocol-as-footfalls-increase-ahead-of-festive-season-2318104.html', 'Indian Railways reviews security, crowd management, enforcement of COVID-19 protocol as footfalls increase ahead of festive season', '17-10-2020', ' New Delhi: The Indian Railways has reviewed its security, crowd management and enforcement of COVID-19 protocol, as footfalls have started increasing ahead of the festive season. "The railway stations and trains will witness a manifold increase in footfall during the upcoming festive season and to face the challenges posed by increased footfall effectively, especially during the COVID-19 outbreak, Chairman-cum-CEO/Railway Board, Member (Operations and Business Development) and DG/RPF interacted with officers of the field formation at zonal and divisional levels thr

Once the news has been scraped and the CSV file created, you can inspect its contents in the output directory (`C:\Data\<dd-mm-yyyy>\data`; the dated folder is printed by the cell above).

Finally, write a news item, tweet or post based on the contents of the CSV file and pass it to `news_processing` as shown below.

In [16]:
# Look up the scraped sentence closest to this claim. The call is left as a
# bare expression so that the notebook displays the returned tuple.
news_processing('Indian Railways has reviewed its security, crowd management and enforcement of COVID-19 protocol')


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
New Delhi The Indian Railways has reviewed its security crowd management and enforcement of COVID protocol as footfalls have started increasing ahead of the festive season 0.5876802588817186


('Input News: ',
 'Indian Railways has reviewed its security, crowd management and enforcement of COVID-19 protocol',
 'Similar News',
 'New Delhi The Indian Railways has reviewed its security crowd management and enforcement of COVID protocol as footfalls have started increasing ahead of the festive season',
 'Similarity Score: ',
 0.5876802588817186,
 'http://zeenews.india.com/india/indian-railways-reviews-security-crowd-management-enforcement-of-covid-19-protocol-as-footfalls-increase-ahead-of-festive-season-2318104.html')