## Importing required libraries

In [9]:
import math
import os
import random
import shutil
import sys
import time
from urllib.parse import parse_qs, urlparse

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Stage switches read by the crawl cell: each flag enables one crawl level.
CRAWL_PROFILE = CRAWL_ARTICLE = True

# Never truncate long cell values (e.g. URLs, titles) when displaying frames.
pd.set_option("display.max_colwidth", None)


In [10]:
# --- Crawl targets ---------------------------------------------------------
# First listing page to fetch; an alternative seed is kept commented out below.
seed_url = (
    'https://pureportal.coventry.ac.uk/en/organisations/'
    'school-of-computing-mathematics-and-data-sciences/persons/'
)
# seed_url = 'https://pureportal.coventry.ac.uk/en/organisations/coventry-university/persons/'
base_url = 'https://pureportal.coventry.ac.uk/en'
articles_inputFile = "CoventryUni_Data_Scraped_articles.csv"

# --- Mutable crawl state shared (as globals) by the functions below ---------
queue = [seed_url]                      # listing pages still to fetch (FIFO)
already_visited = list()                # listing pages already fetched
total_entries = last_page_entries = 0   # running profile count / size of the previous page
profile_id = document_id = 1            # next row ids for the two tables
data_table, article_table = dict(), dict()

# Timestamp of this crawl run.
crawl_date = time.ctime()
print(crawl_date)

Fri Jun  2 15:25:28 2023


# Save crawled data into CSV

In [11]:
#Write Profile Data into CSV file
def writeData_profile(table=None,
                      csv_path="CoventryUni_Data_Scraped_profile.csv",
                      backup_dir="backup"):
    """Write the scraped profile rows to a CSV file, backing up any previous file.

    Args:
        table: Mapping of profile id -> [Profile_Name, Profile_Link, Designation].
            Defaults to the global ``data_table``.
        csv_path: Destination CSV file.
        backup_dir: Directory that receives a copy of the previous CSV, named
            ``<csv file name>_last<ext>``. It is created when missing.

    Returns:
        None. The CSV is written with ``Profile_Id`` as the index column.
    """
    if table is None:
        table = data_table
    col_names = ['Profile_Name', 'Profile_Link', 'Designation']

    data_frame = pd.DataFrame.from_dict(table, orient='index', columns=col_names)
    data_frame = data_frame.rename_axis('Profile_Id')

    file_exists = os.path.isfile(csv_path)

    print(file_exists)
    if file_exists:  # back up the previous file before overwriting it
        # shutil.copy does not create missing directories, so make sure the
        # backup folder exists first.
        os.makedirs(backup_dir, exist_ok=True)
        stem, ext = os.path.splitext(os.path.basename(csv_path))
        shutil.copy(csv_path, os.path.join(backup_dir, stem + "_last" + ext))

    data_frame.to_csv(csv_path)


#Write Article Data into CSV file
def writeData_articles(table=None,
                       csv_path="CoventryUni_Data_Scraped_articles.csv",
                       backup_dir="backup"):
    """Write the scraped article rows to a CSV file, backing up any previous file.

    Args:
        table: Mapping of document id ->
            [Profile_Name, Title, Title_Link, Year_Published].
            Defaults to the global ``article_table``.
        csv_path: Destination CSV file.
        backup_dir: Directory that receives a copy of the previous CSV, named
            ``<csv file name>_last<ext>``. It is created when missing.

    Returns:
        None. The CSV is written with ``Document_Id`` as the index column.
    """
    if table is None:
        table = article_table
    col_names = ['Profile_Name', 'Title', 'Title_Link', 'Year_Published']

    data_frame = pd.DataFrame.from_dict(table, orient='index', columns=col_names)
    data_frame = data_frame.rename_axis('Document_Id')

    file_exists = os.path.isfile(csv_path)

    print(file_exists)
    if file_exists:  # back up the previous file before overwriting it
        # shutil.copy does not create missing directories, so make sure the
        # backup folder exists first.
        os.makedirs(backup_dir, exist_ok=True)
        stem, ext = os.path.splitext(os.path.basename(csv_path))
        shutil.copy(csv_path, os.path.join(backup_dir, stem + "_last" + ext))

    data_frame.to_csv(csv_path)


# Extract Next page link

In [12]:
# returns link of next page
def extract_next_page_link(soup, listing_url=None):
    """Build the URL of the next listing page from a parsed listing page.

    Looks up the ``<a class="nextLink">`` element in ``soup``, reads the page
    number from its ``href`` and appends it as ``?page=<n>`` to the listing URL.

    Args:
        soup: Parsed page; only its ``find()`` method is used.
        listing_url: URL the ``?page=`` suffix is appended to. Defaults to the
            global ``seed_url``.

    Returns:
        The next page URL as a string.

    Raises:
        ValueError: If the page has no ``nextLink`` anchor or the anchor has
            no ``href``.
    """
    if listing_url is None:
        listing_url = seed_url

    # find() returns None when no matching tag exists, so check before use.
    next_link = soup.find('a', attrs={'class': 'nextLink'})
    href = next_link.get('href') if next_link is not None else None
    if not href:
        raise ValueError("No next-page link found on the listing page")

    # Prefer the explicit ``page`` query parameter so that extra parameters
    # after it do not corrupt the page number; otherwise fall back to the text
    # after the last '='.
    page_values = parse_qs(urlparse(href).query).get('page')
    if page_values:
        page_number = page_values[-1]
    else:
        page_number = href.split('=')[-1]

    return listing_url + f'?page={page_number}'

# Find each profile's name, link and designation on a profile listing page

In [13]:
# Update profile information in the global data_table
def extract_info_from_page(page):
    """Extract every profile on one listing page into the global ``data_table``.

    For each ``<li class="grid-result-item">`` the profile name
    (``<h3 class="title">``), the first link's ``href`` with ``/publications/``
    appended, and the designation (``<span class="minor">``) are stored as
    ``data_table[profile_id] = [name, link, designation]``. A field that is
    missing from the markup is stored as ``False``.

    The page is treated as the last one when it holds fewer entries than the
    previous page, or when it has no ``nextLink`` anchor. Otherwise the next
    page URL is appended to the global ``queue``.

    Args:
        page: HTTP response object; only ``page.content`` is read.

    Side effects:
        Updates the globals ``data_table``, ``profile_id``, ``total_entries``,
        ``last_page_entries`` and ``queue``.
    """
    global total_entries
    global queue
    global last_page_entries
    global data_table
    global profile_id

    soup = BeautifulSoup(page.content, 'html.parser')
    users = soup.find_all('li', attrs={'class': 'grid-result-item'})
    no_of_entries_in_page = 0

    for user in users:
        # find() returns None instead of raising when a tag is missing, so
        # every lookup is checked explicitly.
        name_tag = user.find('h3', attrs={'class': 'title'})
        profile_name = name_tag.text if name_tag is not None else False

        anchor = user.find('a')
        link = anchor.get('href') if anchor is not None else None
        linkp = link + '/publications/' if link is not None else False

        designation_tag = user.find('span', attrs={'class': 'minor'})
        designation = designation_tag.text if designation_tag is not None else False

        no_of_entries_in_page += 1
        data_table[profile_id] = [profile_name, linkp, designation]
        profile_id = profile_id + 1

    total_entries = total_entries + no_of_entries_in_page

    if last_page_entries == 0:
        last_page_entries = no_of_entries_in_page

    if no_of_entries_in_page < last_page_entries:
        # Shorter than the previous page: we have reached the last page.
        return

    # Only ask for the next page when the page actually links to one; a full
    # final page has no such link and must not abort the crawl.
    if soup.find('a', attrs={'class': 'nextLink'}) is not None:
        next_page = extract_next_page_link(soup)
        if next_page:
            queue.append(next_page)
    last_page_entries = no_of_entries_in_page

# Extract each profile's research papers: title, link and year published

In [14]:
# Input - Profile link
# output - update global article_table with all the details related with the profile 
def extract_user_info(profile_url, timeout=30):
    """Fetch one profile's publications page and record its articles.

    Every ``<li class="list-result-item">`` on the page becomes one row in the
    global ``article_table``:
    ``[profile_name, title_name, title_link, year_published]``, keyed by the
    global ``document_id``, which is incremented per row. A field that is
    missing from the markup is stored as ``False``.

    Only the items present in the single response for ``profile_url`` are
    parsed; no further result pages are requested.

    Args:
        profile_url: URL of the profile's publications page.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the crawl.

    Raises:
        Exception: If the response status code is not 200.
        requests.exceptions.RequestException: On network errors or timeout.
    """
    global article_table
    global document_id

    page = requests.get(profile_url, timeout=timeout)

    if page.status_code != 200:
        print("Failed to access url..[ERROR_CODE]:", page.status_code)
        print("Page Url : ", profile_url)
        raise Exception("Error loading page..")

    soup = BeautifulSoup(page.content, 'html.parser')

    # The profile name is the same for every article on the page, so resolve
    # it once. find() returns None when a tag is missing, hence the checks.
    details = soup.find('div', attrs={'class': 'person-details'})
    main_name = details.find('h1') if details is not None else None
    profile_name = main_name.text if main_name is not None else False

    titles = soup.find_all('li', attrs={'class': 'list-result-item'})
    print(len(titles))

    for title in titles:
        title_anchor = title.find('a', attrs={'class': 'link'})
        title_name = title_anchor.text if title_anchor is not None else False

        first_anchor = title.find('a')
        title_link = first_anchor.get('href', False) if first_anchor is not None else False

        date_tag = title.find('span', attrs={'class': 'date'})
        year_published = date_tag.text.strip() if date_tag is not None else False

        article_table[document_id] = [profile_name, title_name, title_link, year_published]

        document_id = document_id + 1  # increment the document id for next item
        print("document_id", document_id)

            

In [15]:
# Smoke test: confirm the seed listing page is reachable before crawling.
# A timeout keeps the cell from hanging if the server never answers.
page = requests.get(seed_url, timeout=30)
print(page.status_code)

200


## 1. Crawl and extract all the profiles and papers related to Coventry University

In [17]:
stop = 500  # Safety cap on the number of listing pages fetched (level 1 only)
start_time = time.time()  # measure time taken to scrape data
failure = False

if CRAWL_PROFILE:
    # 1.1 Walk the listing pages in FIFO order and collect every profile into
    # data_table; extract_info_from_page() appends follow-up pages to `queue`.
    print("\n1.Crawling all the pages and extracting profile links ...")
    print("\nSeed URL : ", seed_url)

    while len(queue) != 0 and stop > 0:
        try:
            random_time = random.randint(0, 10)  # random wait of 0-10 s between requests
            print("url : ", queue[0])
            # The timeout prevents a stalled connection from hanging the crawl.
            page = requests.get(queue[0], timeout=30)

            if page.status_code == 200:
                extract_info_from_page(page)
            else:
                print("Failed to load page [ERROR_CODE]: ", page.status_code)
                print("Page Url : ", queue[0])

            # Move the processed page from the queue to the visited list.
            already_visited.append(queue.pop(0))
            stop = stop - 1
            print(".", end='')  # progress marker
            time.sleep(random_time)
        except (Exception, KeyboardInterrupt) as exc:
            # Best effort: stop crawling but still save what was collected.
            # KeyboardInterrupt is caught too so a manual stop keeps the data.
            print("Inside level 1 crawling....something went wrong..Exiting\n")
            print("Reason : ", repr(exc))
            failure = True
            break

    stop_time = time.time()
    time_taken = stop_time - start_time
    print("\nTime taken to scrape Profile data :", str(np.round(time_taken, 4)) + " sec")

    if data_table:
        print("\nSaving all the extracted profiles and articles into csv file..")
        # Write profile data into file
        writeData_profile()
    else:
        # Writing an empty table would replace the existing CSV with a
        # header-only file, so leave the previous file in place.
        print("\nNo profiles were extracted; existing profile CSV left untouched.")

    if failure:
        print("Level 1: Extraction of profiles failed...Saving extracted data so far. Program exit")
    elif data_table:
        print("\nSaving Completed.")


if CRAWL_ARTICLE:
    start_time = time.time()  # measure time taken to scrape data
    # Reset the flag so a level-1 failure is not reported as a level-2 failure.
    failure = False

    # 1.2 Visit every profile link saved by level 1 and extract its articles.
    profile_df = pd.read_csv("CoventryUni_Data_Scraped_profile.csv", header='infer')
    all_links = profile_df['Profile_Link'].tolist()
    # Rows whose link could not be scraped hold a non-URL value; requesting
    # those would abort the whole crawl, so they are skipped.
    profile_queue = [link for link in all_links
                     if isinstance(link, str) and link.startswith('http')]
    skipped = len(all_links) - len(profile_queue)
    head_profile_url = profile_queue[0] if profile_queue else None
    stop = len(profile_queue)

    print("\n2. Crawling all the Profile links and extracting articles info ...")
    print("\nFirst profile link : ", head_profile_url)
    print("\nQueue length : ", len(profile_queue))
    print("Stop Number: ", stop)
    if skipped:
        print("Skipped profiles without a usable link : ", skipped)

    while len(profile_queue) != 0 and stop > 0:
        try:
            stop = stop - 1
            random_time = random.randint(0, 1)  # random wait of 0-1 s between requests
            head_profile_url = profile_queue[0]

            # Crawl the profile page and extract its published articles.
            extract_user_info(head_profile_url)
            print("Processsing done , pending ", len(profile_queue) - 1)

            # Drop the processed link only after it succeeded, so a failed
            # link still counts as pending in the failure report.
            profile_queue = profile_queue[1:]

            print(len(profile_queue), end=',')  # progress marker
            time.sleep(random_time)
        except (Exception, KeyboardInterrupt) as exc:
            # Best effort: stop crawling but still save what was collected.
            # KeyboardInterrupt is caught too so a manual stop keeps the data.
            print("Inside level 2 crawling....something went wrong..Exiting\n")
            print("Reason : ", repr(exc))
            failure = True
            break

    stop_time = time.time()
    time_taken = stop_time - start_time

    if failure:
        print("pages pending to crawl : ", len(profile_queue))
    print("\nTime taken to scrape article data :", str(np.round(time_taken, 4)) + " sec")

    if not failure:
        print("Nothing to process. Queue Len : ", len(profile_queue))
        print("No.Of Entries extracted : ", len(article_table))

    if article_table:
        # Write published article data into file
        print("\nSaving all the extracted articles details into csv file..")
        writeData_articles()
    else:
        # Writing an empty table would replace the existing CSV with a
        # header-only file, so leave the previous file in place.
        print("\nNo articles were extracted; existing article CSV left untouched.")

    if failure:
        print("Level 2: Extraction of articles failed.. Saving crawled data so far.. Program exit")
    elif article_table:
        print("\nSaving Completed.")
    


1.Crawling all the pages and extracting profile links ...

Seed URL :  https://pureportal.coventry.ac.uk/en/organisations/school-of-computing-mathematics-and-data-sciences/persons/

Time taken to scrape Profile data : 0.0005 sec

Saving all the extracted profiles and articles into csv file..
True

Saving Completed.

n 2. Crawling all the Profile links and extracting articles info ...

First profile link :  https://pureportal.coventry.ac.uk/en/persons/mohamed-abdelshafy/publications/

Queue length :  62
Stop Number:  62
12
document_id 2
document_id 3
document_id 4
document_id 5
document_id 6
document_id 7
document_id 8
document_id 9
document_id 10
document_id 11
document_id 12
document_id 13
Processsing done , pending  61
61,3
document_id 14
document_id 15
document_id 16
Processsing done , pending  60
60,0
Processsing done , pending  59
59,31
document_id 17
document_id 18
document_id 19
document_id 20
document_id 21
document_id 22
document_id 23
document_id 24
document_id 25
document_i

26,0
Processsing done , pending  25
25,14
document_id 410
document_id 411
document_id 412
document_id 413
document_id 414
document_id 415
document_id 416
document_id 417
document_id 418
document_id 419
document_id 420
document_id 421
document_id 422
document_id 423
Processsing done , pending  24
24,0
Processsing done , pending  23
23,0
Processsing done , pending  22
22,25
document_id 424
document_id 425
document_id 426
document_id 427
document_id 428
document_id 429
document_id 430
document_id 431
document_id 432
document_id 433
document_id 434
document_id 435
document_id 436
document_id 437
document_id 438
document_id 439
document_id 440
document_id 441
document_id 442
document_id 443
document_id 444
document_id 445
document_id 446
document_id 447
document_id 448
Processsing done , pending  21
21,2
document_id 449
document_id 450
Processsing done , pending  20
20,0
Processsing done , pending  19
19,0
Processsing done , pending  18
18,0
Processsing done , pending  17
17,1
document_id 4