## 2 - Web Scraping for 1 Day

In [1]:
#import dependencies 
import requests
import bs4
from bs4 import BeautifulSoup
import csv
import os
import numpy as np
import pandas as pd
from timeit import default_timer as timer

#### Saving articles for 1 Day:

In [2]:
#function to request a page with browser-like headers
def get_url(url):
    """Fetch *url* over HTTP GET and return the ``requests`` response.

    The module-level ``headers`` dict (looked up at call time) is sent as the
    request headers so the request carries the configured User-Agent.

    Raises whatever ``requests.get`` raises, including a timeout error when
    the server does not answer within 30 seconds.
    """
    # The second positional argument of requests.get is ``params`` (the query
    # string), so the headers must be passed by keyword to actually be sent.
    # The timeout keeps one stalled connection from hanging the whole scrape.
    return requests.get(url, headers=headers, timeout=30)

In [8]:
#request headers sent with every page fetch; the User-Agent string presents
#the scraper as a desktop Safari browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_5_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6 Safari/605.1.15'
}

#column-wise store for the scraped data: transform() appends one entry per
#article to each of these lists
dict_day = {'date_published': [], 'date_updated': [], 'time': [],
            'headline': [], 'sector/category': [], 'synopsis': [], 'full_text': []}

#list (not set) of all links scraped from the archive page of 1 day
article_links = []

#creating 1st soup object: the archive page of one day (year 2022, month 5)
#other days follow the same URL pattern, e.g.
#https://economictimes.indiatimes.com/archivelist/year-2008,month-1,starttime-39448.cms
soup1 = bs4.BeautifulSoup(
    get_url("https://economictimes.indiatimes.com/archivelist/year-2022,month-5,starttime-44682.cms").text,
    'html.parser',
)

#date is scraped from the day page itself instead of going into individual news links
try:
    date_pub = soup1.find_all('b')[1].text
except IndexError:
    #the page has fewer than two <b> tags, so there is no date to read
    date_pub = 'NA'

#collecting links of all news articles published on a day
for ultag in soup1.find_all('ul', class_='content'):  #links live under <ul class="content">
    for litag in ultag.find_all('li'):
        #href=True skips anchors without an href, which would otherwise
        #raise a TypeError when concatenated to the site prefix
        for atag in litag.find_all('a', href=True):
            href = atag['href']
            if href != '#':
                article_links.append("https://economictimes.indiatimes.com" + href)

def _text_or_na(tag):
    """Return the text of *tag*, or 'NA' when the tag was not found (None)."""
    return tag.text if tag is not None else 'NA'


def _strip_promo(text, lead):
    """Cut *text* off *lead* characters before the trailing promo marker.

    When the marker is absent the text is returned unchanged; ``str.find``
    returns -1 in that case, and slicing with it would silently drop the last
    characters of the article.
    """
    marker_at = text.find("Experience Your Economic Times")
    if marker_at == -1:
        return text
    return text[:max(marker_at - lead, 0)]


def _full_text(soup):
    """Return the article body from *soup*, or 'NA' if no body tag exists.

    The regular article class is tried first, then the paywall class; each
    has its own number of characters to drop before the promo marker.
    """
    for css_class, lead in (('artData clr', 21), ('artData clr paywall', 23)):
        body = soup.find('article', class_=css_class)
        if body is not None:
            return _strip_promo(body.text, lead)
    return 'NA'


def transform(article_links):
    """Scrape every article in *article_links* into the day's DataFrame.

    For each link the page is fetched with ``get_url`` and parsed, and one
    entry per column is appended to the module-level ``dict_day``. Fields
    whose tag is missing on the page are stored as 'NA'. The published date
    is the module-level ``date_pub`` for every row.

    ``dict_day`` is not cleared here, so calling this function again appends
    the rows a second time.

    Prints the resulting DataFrame and returns the tuple
    ``("Start:", start, "End:", end, "Time taken: ", end - start)``.
    """
    start = timer()

    for link in article_links:
        #fetch and parse before appending anything, so a failed request
        #cannot leave the columns of dict_day with different lengths
        soup = bs4.BeautifulSoup(get_url(link).text, features='html.parser')

        #the "last updated" stamp is looked up once and split on spaces;
        #tokens 2-4 form the date and tokens 5-7 the time
        stamp = soup.find('time', class_="jsdtTime")
        if stamp is None:
            date_updated = time_updated = 'NA'
        else:
            tokens = stamp.text.split(' ')
            date_updated = " ".join(tokens[2:5])
            time_updated = " ".join(tokens[5:8])

        dict_day['date_published'].append(date_pub)
        dict_day['headline'].append(_text_or_na(soup.find('h1')))
        dict_day['synopsis'].append(_text_or_na(soup.find('h2', class_="summary")))
        dict_day['date_updated'].append(date_updated)
        dict_day['time'].append(time_updated)
        #sector/category comes from the url itself (path segments 4 and 5)
        dict_day['sector/category'].append(link.rsplit('/')[4:6])
        dict_day['full_text'].append(_full_text(soup))

    #convert dictionary to a dataframe so it can be saved as csv
    df = pd.DataFrame(dict_day)

    #to persist the day's articles, name the file after the published date:
    #df.to_csv('./{}.csv'.format(date_pub), sep=',')
    print(df)

    end = timer()

    return ("Start:", start, "End:", end, "Time taken: ", end - start)

In [9]:
#scrape every collected link; prints the day's DataFrame and returns the timing tuple
transform(article_links)

    date_published   date_updated          time  \
0      1 May, 2022             NA            NA   
1      1 May, 2022             NA            NA   
2      1 May, 2022             NA            NA   
3      1 May, 2022             NA            NA   
4      1 May, 2022             NA            NA   
..             ...            ...           ...   
185    1 May, 2022  May 01, 2022,  11:26 PM IST   
186    1 May, 2022  May 02, 2022,  09:47 AM IST   
187    1 May, 2022  May 01, 2022,  11:33 PM IST   
188    1 May, 2022  May 02, 2022,  12:11 PM IST   
189    1 May, 2022  May 02, 2022,  12:02 AM IST   

                                              headline  \
0    Startups, tech firms fuel demand for coworking...   
1                                             404 page   
2    Innovation in the 5G space will lead to newer ...   
3                                             404 page   
4    Startups, tech firms fuel demand for coworking...   
..                                     

('Start:',
 544.839676412,
 'End:',
 645.202680748,
 'Time taken: ',
 100.36300433600002)

In [10]:
#number of article links collected from the day's archive page
len(article_links)

190

## Documentation:


Experiment 2: Automating the process to retrieve articles for one month

Contributors: Aanchal

Aim: Implement a sequential loop to retrieve articles for one month 

Dependencies: Requests, Beautiful Soup, pandas, timeit (default_timer)

Method: 
We created two functions to: 
a) make url requests, 
b) create a dataframe of all articles for a day and save it as one day’s csv file 

(b) was the basis on which all our further code was built. Its pseudo-code was as follows:
Create an empty dictionary to store all info about day’s articles : 
dict_day = {'date_published':[ ],'date_updated':[ ],'time':[ ],
                       'headline':[ ],'sector/category':[ ],'synopsis':[ ],'full_text':[ ]}

Create  an empty list to store article links:
 article_links = list() 

Initialize the user agent (web browser used to request information)
Create first BeautifulSoup object 
Extract date published from the  day’s page itself instead of repeatedly scraping it from each article one by one 
Do:
For each ‘ul’ tag with class='content':
Go into each ‘li’ tag
Go into each ‘a’ tag
If the ‘href’ attribute is not ‘#’, then concatenate its content to the site’s base URL to give the final article link
Append each link to ‘article_links’
Until: All article links in the day page are appended 
Def transform(article_links):
Do: 
Record start time 
For each link in the list argument:
Record published date obtained in (5)
Create a soup object that parses the HTML link 
Try:
 to obtain headline, synopsis, date
updated, sector/category/full_text
			Except:
 Append ‘NA’ to the respective column
      9. Until: All links in ‘article_links’ are parsed 
     10. Convert dictionary to  dataframe 
     11. Save file with name as published date of articles
     12. Print dataframe 
     13. Record End time
     14. Return: start time, end time and time taken for the operation

Observations:
Our previous challenge was to find patterns in full text class names so that we could envision building a uniform code for all articles for all 178 months. We overcame this problem by implementing a ‘try/except’ block for two classes, namely ‘artData clr' (for non-ET prime articles) and ‘artData clr paywall' (for ET prime articles). We used indexing to remove unnecessary textual information/marketing text and appended the final full_text to the dictionary. 

During this phase, we found that the processing time of one day’s articles was high. If this operation were performed for a month (times 178 months), it would take very long to scrape all the articles required for the project.

Conclusion:
To scrape 350-500 articles for each day of the month, we needed to speed up the process, as each day’s article download time was high at 15 mins. 
