# Web scraping application for University of Calgary - news page

## Author:   Anant Agarwal
## Date:     June 16, 2020

### Description: 
#### The intent of this script is to scrape data from the blogs posted on the University of Calgary news page and store it in a csv file on the local disk. The user of this application chooses how the data should be scraped from the website:
    1) Collect data from a fixed number of articles.
    2) Collect data from all the articles written after a fixed date. 

### Information provided by the user:  
    1) Number of articles to be scraped, or
    2) Date, scrape all the articles after that date, or
    3) Both

### Using the Application:
-   To run the script, the user has to call the function getArticle and pass the date and the number of blogs to be scraped.
-   E.g. __getArticle('Jun 03, 2020', '20')__
-   If the user does not have a value to pass, the entry can be left blank, e.g. __getArticle('', '20')__. The script will treat an empty value as 'null'.
-   Please enter the date in __mmm dd, yyyy__ format __(e.g. Jun 10, 2019)__.
-   If both values are passed as 'null' then the program will terminate with a message that no values were entered.

### Result:
#### All the results will be stored in a UCal_csv.csv file (with a plain-text copy in UCal_txt.txt) in the directory the application is run from


__Importing required libraries__

In [None]:
import csv
import datetime
import sys
from urllib.parse import parse_qs, urlsplit
from urllib.request import urlopen as uReq

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as soup

## Date function

This function validates the date filter passed in by the user.
* If the user passes no value, 'null' will be returned and the function will print 'No Date entered'
* If the user passes the date in an incorrect format, the program will exit with an error message asking for the mmm dd, yyyy format
* If the user passes the date in the correct format, the date will be returned and the function will print 'Date entered successfully'


__Things to do__: Put a check on future dates

In [None]:

def getDate(user_date):
    """
    Validate the date filter supplied by the user.

    Args:
        user_date: Date string with an abbreviated month ('Jun 03, 2020') or
            a full month name ('June 03, 2020'). An empty string or None
            means that no date filter was requested.

    Returns:
        A datetime.datetime for a valid date, or the string 'null' when no
        date was supplied.

    Raises:
        SystemExit: If the value matches neither accepted format.
    """
    # The result is also stored in a module-level name, kept for backward
    # compatibility with code that reads the global instead of the return value.
    global user_date_convert

    if not user_date:
        user_date_convert = 'null'
        print("No Date entered")
        return user_date_convert

    # Accept both the abbreviated and the full month name, since users
    # commonly type either one for the "mmm dd, yyyy" format.
    for date_format in ('%b %d, %Y', '%B %d, %Y'):
        try:
            user_date_convert = datetime.datetime.strptime(user_date, date_format)
        except ValueError:
            continue
        print("Date entered successfully")
        return user_date_convert

    sys.exit("***ERROR: Incorrect Format, please enter date in mmm dd, yyyy format")
    
    

## Number of blog function

This function validates the number of blogs to be scraped, as passed in by the user.
* If the user passes no value, 'null' will be returned and the function will print 'No Blog count entered'
* If the user passes the value in an incorrect format, i.e. a non-integral or non-positive value, the program will exit with an error message
* If the user passes the value in the correct format, the number of blogs will be returned and the function will print 'Number of blogs value entered successfully'

In [None]:
def getBlog_count(blog_count):
    """
    Validate the number of blogs the user wants to scrape.

    Args:
        blog_count: Requested number of blogs, as a string such as '20' or
            as an int. An empty string or None means that no limit was
            requested.

    Returns:
        The limit as a positive int, or the string 'null' when no value was
        supplied.

    Raises:
        SystemExit: If the value is not an integer or is smaller than 1.
    """
    # The result is also stored in a module-level name, kept for backward
    # compatibility with code that reads the global instead of the return value.
    global user_blog_count

    if blog_count is None or str(blog_count) == '':
        user_blog_count = 'null'
        print("No Blog count entered")
        return user_blog_count

    try:
        # Going through str() lets callers pass an int directly while still
        # rejecting non-integral numbers such as 2.5.
        parsed_count = int(str(blog_count))
    except ValueError:
        sys.exit("***ERROR: Non-integral value entered, please re-enter")

    if parsed_count < 1:
        sys.exit("***ERROR: Cannot accept value less than 1, please re-enter")

    user_blog_count = parsed_count
    print("Number of blogs value entered successfully")
    return user_blog_count


## Total number of pages in the website.

* Here we define a function __getPage_count__, which takes the URL of the website as input and counts the total number of pages.

* We look through the HTML code of the first page and find a list item (`li`) with the class name `pager__item pager__item--last`.

* The final page value is extracted from that item's link and stored in integer format.

* The final page value is returned.

In [None]:
def getPage_count(my_url: str):
    """
    Read the pager of the news listing and return the page number of its "last" link.

    Args:
        my_url: URL of the news listing page to download.

    Returns:
        The integer value of the ``page`` query parameter found in the href
        of the ``pager__item pager__item--last`` list item.

    Raises:
        ValueError: If the page has no "last page" link, or the link's href
            carries no numeric ``page`` parameter.
    """
    # The context manager closes the connection even if read() raises.
    with uReq(my_url) as uClient:
        page_html = uClient.read()
    page_soup = soup(page_html, "html.parser")

    last_item = page_soup.find('li', {'class': 'pager__item pager__item--last'})
    if last_item is None or last_item.a is None:
        raise ValueError("Could not find the last-page link on " + my_url)

    # Parse the query string instead of stripping a fixed prefix, so the
    # lookup still works if the href carries a path or reordered parameters.
    query = parse_qs(urlsplit(last_item.a.get("href", "")).query)
    try:
        pages = int(query["page"][0])
    except (KeyError, ValueError) as err:
        raise ValueError("Last-page link on " + my_url + " has no numeric 'page' parameter") from err

    # NOTE(review): listing URLs elsewhere in this file start at page=0, so
    # this value may be the zero-based index of the final page rather than a
    # page total -- confirm against the live site.
    print(str(pages) + " pages counted successfully")
    return pages

## Function: Blog containers in individual page

In [None]:
def _fetch_soup(url):
    """Download url and return its HTML parsed with the 'html.parser' backend."""
    # The context manager closes the connection even if read() raises.
    with uReq(url) as client:
        html = client.read()
    return soup(html, "html.parser")


def getBlog_containers(last_page, user_date, user_blog_count):
    """
    Walk the news listing pages and record every article that passes the filters.

    Listing pages 0 .. last_page - 1 are fetched in order. For each article
    teaser on a page the article itself is downloaded; articles whose URL is
    not yet in the CSV are written with writeData, and articles that are
    already recorded are skipped but still counted towards the limit.

    Args:
        last_page: Number of listing pages to visit, starting from page 0.
        user_date: datetime.datetime cut-off, or 'null' for no date filter.
            Scraping stops at the first article dated earlier than this.
        user_blog_count: Maximum number of articles to process (int), or
            'null' for no limit.

    Returns:
        None.
    """
    my_url = 'https://www.ucalgary.ca/news/all-ucalgary-news' #University of Calgary - News website
    blog_count = 0

    # NOTE(review): range(last_page) stops at page last_page - 1. If the
    # value from getPage_count is the zero-based index of the final page,
    # that final page is never visited -- confirm.
    for current_page in range(last_page):
        page_soup = _fetch_soup(my_url + '?search_api_fulltext=&page=' + str(current_page))

        # Each teaser div links to one article page.
        blog_containers = page_soup.findAll('div', {'class': ['news-teaser news-article-teaser']})

        for blog in blog_containers:
            # Check the limit before downloading, so no article is fetched
            # only to be thrown away.
            if user_blog_count != 'null' and blog_count >= user_blog_count:
                print("Number of blogs reached")
                return

            # NOTE(review): if the href already starts with '/', this yields
            # a double slash. Left unchanged because the stored URL is the
            # key used for duplicate detection in existing CSV files.
            blog_url = 'https://www.ucalgary.ca/' + blog.a["href"]
            blog_soup = _fetch_soup(blog_url)

            blog_date = blog_soup.find('p', attrs={'class': 'title'}).text
            blog_date_convert = datetime.datetime.strptime(blog_date, '%B %d, %Y')

            # Compare against the user_date argument (not a module global) so
            # the function also works when called directly.
            # Presumably the listing is ordered newest first, which is what
            # makes stopping at the first older article safe -- TODO confirm.
            if user_date != 'null' and blog_date_convert < user_date:
                print("User provided date reached")
                return

            if checkDublicate(blog_url):
                print("Blog " + str(blog_count+1) + " already written")
            else:
                blog_title = blog_soup.find('h1', attrs={'class': 'head'}).text
                blog_subtitle = blog_soup.find('h4', attrs={'class': 'deck'}).text
                # Drop newlines, soft hyphens, non-breaking spaces and the
                # leading em-dash sequences from the body text.
                blog_body = blog_soup.findAll('div', {'class':'body'})[0].text.replace("\n","").replace("\xad—\xa0","").replace("—\xa0","").replace("\xa0","").replace("\xad","")

                writeData(blog_count+1, blog_title, blog_subtitle, blog_date, blog_url, blog_body)

            # Duplicates count towards the limit too.
            blog_count += 1
        


            

## Checking for a duplicate entry

In [None]:
# Checking the entry
def checkDublicate(blog_url):
    """
    Report whether an article URL is already recorded in 'UCal_csv.csv'.

    Args:
        blog_url: Article URL to look for in the CSV's 'blog_url' column
            (the fifth column).

    Returns:
        True if the URL appears in a data row of the CSV, otherwise False.
        False is also returned when the CSV does not exist yet or is empty,
        since nothing can have been recorded in that case.
    """
    colnames = ['blog_count', 'blog_title', 'blog_subtitle', 'blog_date', 'blog_url', 'blog_body']
    try:
        # Explicit column names make pandas read the file's own header line
        # as an ordinary row; [1:] drops it again.
        data = pd.read_csv('UCal_csv.csv', names=colnames, engine='python')[1:]
    except (FileNotFoundError, pd.errors.EmptyDataError):
        return False

    return blog_url in data['blog_url'].tolist()
    


## Structuring the csv

In [None]:
def structureCSV():
    """
    Make sure the output files exist before scraping starts.

    If 'UCal_csv.csv' already exists and is not empty it is left untouched.
    Otherwise it is created with the header row, and 'UCal_txt.txt' is
    created as well if it is missing.

    Args:
        None.

    Returns:
        None.
    """
    filename = 'UCal_csv.csv' #File name to store the scraped data

    # Reading a single byte is enough to tell an existing, non-empty file
    # apart; there is no need to parse the whole CSV.
    try:
        with open(filename, 'rb') as existing:
            has_content = bool(existing.read(1))
    except FileNotFoundError:
        has_content = False

    if has_content:
        print("Old csv file opened successfully")
        return

    #Defining header for csv
    headers = 'Blog No., Blog Title, Blog Subtitle, Blog Publish Data, Blog Link, Blog Content\n'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(headers)
    print("New csv file opened successfully")

    # Append mode creates the text file if needed without truncating an
    # existing one; the handle is closed immediately.
    with open('UCal_txt.txt', 'a', encoding='utf-8'):
        pass

    return


## Write function

In [None]:
def writeData(blog_count, blog_title, blog_subtitle, blog_date, blog_url, blog_body):
    """
    Append one scraped article to 'UCal_csv.csv' and to 'UCal_txt.txt'.

    Args:
        blog_count: Running number of the article; converted with str().
        blog_title: Article title (str).
        blog_subtitle: Article subtitle (str).
        blog_date: Publish date as text (str).
        blog_url: Article URL (str).
        blog_body: Article body text.

    Returns:
        None.
    """
    # Commas are removed from every field, as before, so ordinary rows look
    # exactly like the rows already in the file.
    csv_row = [
        str(blog_count).replace(",", ""),
        blog_title.replace(",", ""),
        blog_subtitle.replace(",", ""),
        blog_date.replace(",", ""),
        blog_url.replace(",", ""),
        blog_body.replace(",", ""),
    ]

    # csv.writer quotes fields that contain double quotes or line breaks,
    # which would otherwise split or merge rows when the file is read back.
    # newline='' is what the csv module expects from the file object.
    with open('UCal_csv.csv', 'a', encoding='utf-8', newline='') as csv_file:
        csv.writer(csv_file, lineterminator='\n').writerow(csv_row)

    # The text file keeps the fields unmodified, one labelled line each.
    text_entry = "".join([
        "Blog No.:", str(blog_count), "\n",
        "Blog Title: ", blog_title, "\n",
        "Blog Subtitle: ", blog_subtitle, "\n",
        "Publish Date: ", blog_date, "\n",
        "Blog link: ", blog_url, "\n",
        "Content: ", str(blog_body), "\n", "\n",
    ])
    with open('UCal_txt.txt', 'a', encoding='utf-8') as text_file:
        text_file.write(text_entry)

    print("Blog " + str(blog_count) + " written successfully")
    return

# Combine Function

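### This function ties the steps together: it validates the date and blog count supplied by the caller and then scrapes the blogs from the website
1. A blog count limits the scrape to a set number of blogs
2. A date limits the scrape to blogs published on or after that date

#### Based on the input from the user, the following functions are called:
- getDate and getBlog_count, to validate the two inputs
- getPage_count, structureCSV and getBlog_containers, to run the scrape when at least one input was provided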

In [None]:
def getArticle(date, blog_count):
    """
    Validate the user's filters and run the scrape if at least one was given.

    Args:
        date: Date filter, passed to getDate for validation.
        blog_count: Blog-count limit, passed to getBlog_count for validation.

    Returns:
        None.
    """
    parsed_date = getDate(date)
    parsed_count = getBlog_count(blog_count)

    # Guard clause: with neither filter there is nothing to scrape.
    nothing_requested = parsed_date == 'null' and parsed_count == 'null'
    if nothing_requested:
        print ("Program terminated as no Date or Blog count provided")
        return

    news_url = 'https://www.ucalgary.ca/news/all-ucalgary-news' #University of Calgary - News website
    page_total = getPage_count(news_url)
    structureCSV()
    getBlog_containers(page_total, parsed_date, parsed_count)


In [None]:
# Example run: no date filter (empty string) and a blog count of '10'.
getArticle('', '10')