## PDS Assignment

In [1]:
# Import required packages
import warnings
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Suppress only the warning that is raised because the requests below use verify = False (certificate checks are
# disabled). The filter matches on the start of the warning message, so all other warnings stay visible and
# crucial ones are not hidden.
warnings.filterwarnings("ignore", message = "Unverified HTTPS request")

# Link for website main page
main_page_url = "https://sitescrape.awh.durham.ac.uk/comp42315/"

# Start on the main page because the url of the first publications page might change if a new research topic that
# sorts alphabetically before "Animation and Graphics" is added in the future. It is more robust to start from the
# main page and find the provided publications link from there.
page = requests.get(main_page_url, verify = False)

# Fail loudly on an HTTP error status instead of parsing an error page as if it were the real content
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

In [2]:
# Sanity check: display the type of the object produced by parsing the main page
type(soup)

bs4.BeautifulSoup

In [3]:
# Find all the <a> tags which contain "publicationfull_type_" in the href. This start of the relative file path is
# the same for all of the publication topic pages, and the href of the first matching tag gives the topic page to
# start scraping from.
publication_navigation_matches = soup.find_all(name = "a", href = lambda href: href and "publicationfull_type_" in href)

# Stop with a descriptive message if the main page has no such link, rather than an opaque "list index out of range"
if not publication_navigation_matches:
    raise IndexError("No link containing 'publicationfull_type_' was found on the main page.")

# Select the first matching link
publication_navigation_option = publication_navigation_matches[0]

print(publication_navigation_option)

publication_navigation_relative_url = publication_navigation_option.get("href")

print(type(publication_navigation_relative_url))
print(publication_navigation_relative_url)

<a href="publicationfull_type_animationandgraphics.htm"><span>PUBLICATIONS</span><span>to innovate</span></a>
<class 'str'>
publicationfull_type_animationandgraphics.htm


In [4]:
# Resolve the publications href (relative) against the main page url (absolute) to get the absolute url of the
# publications page. urljoin also copes with hrefs that are root-relative or already absolute.
publication_navigation_full_url = urljoin(main_page_url, publication_navigation_relative_url)

page = requests.get(publication_navigation_full_url, verify = False)

# Fail loudly on an HTTP error status instead of parsing an error page as if it were the real content
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

In [5]:
# Collect every anchor on the publications page whose href follows the topic page naming pattern
publication_page_elements = soup.find_all(name = "a", href = lambda link_target: link_target and "publicationfull_type_" in link_target)

# Relative url of each topic page, in the order the links appear on the page
publication_page_urls = [anchor.get("href") for anchor in publication_page_elements]

# Show the matched tags so the collected links can be checked by eye
for element in publication_page_elements:
    print(element)


<a href="publicationfull_type_animationandgraphics.htm"><span>PUBLICATIONS</span><span>to innovate</span></a>
<a href="publicationfull_type_movementevaluation.htm">Movement Evaluation</a>
<a href="publicationfull_type_biomedicalengineering.htm">Biomedical Engineering</a>
<a href="publicationfull_type_interactionmodelling.htm">Interaction Modelling</a>
<a href="publicationfull_type_others.htm">Others</a>
<a href="publicationfull_type_actionrecognition.htm">Action Recognition</a>
<a href="publicationfull_type_depthand3destimation.htm">Depth and 3D Estimation</a>
<a href="publicationfull_type_virtualreality.htm">Virtual Reality</a>
<a href="publicationfull_type_facialfeatureanalysis.htm">Facial Feature Analysis</a>
<a href="publicationfull_type_3dsurfaces.htm">3D Surfaces</a>
<a href="publicationfull_type_crowdsimulation.htm">Crowd Simulation</a>
<a href="publicationfull_type_robotics.htm">Robotics</a>
<a href="publicationfull_type_handmodelling.htm">Hand Modelling</a>
<a href="publicatio

In [6]:
# Loop through all topic pages in publications and scrape

def _extract_publication_record(publication):
    """Extract the title, year and author list from one publication element.

    Parameters
    ----------
    publication : bs4 element
        A <div class="w3-cell-row"> element taken from a topic page.

    Returns
    -------
    dict or None
        A dict with the keys "Publication Title", "Year" and "Author List", or None if the element does not
        contain exactly one PublicationTitle span and exactly three TextSmall spans. An error message is
        printed for every failed check.
    """
    publication_title = publication.find_all(name = "span", class_ = "PublicationTitle")
    publication_TextSmalls = publication.find_all(name = "span", class_ = "TextSmall")

    is_valid = True

    if len(publication_title) == 0:
        print("Error: Publication element has no title. Please investigate!")
        is_valid = False
    elif len(publication_title) > 1:
        print("Error: Publication element has more than one title. Please investigate!")
        is_valid = False

    # The upper bound must be checked against the TextSmall tags, not against the title tags
    if len(publication_TextSmalls) < 3:
        print("Error: Publication element has less tags of TextSmall class than anticipated. Please investigate!")
        is_valid = False
    elif len(publication_TextSmalls) > 3:
        print("Error: Publication element has more tags of TextSmall class than anticipated. Please investigate!")
        is_valid = False

    # Skip the whole publication when any check fails so that title, year and authors always stay aligned
    if not is_valid:
        return None

    return {
        "Publication Title" : publication_title[0].text,
        # .rstrip() just in case there is a white space at end of text as opposed to year
        "Year" : publication_TextSmalls[0].text.rstrip()[-4:],
        "Author List" : publication_TextSmalls[1].text,
    }


# One record (dict) per successfully parsed publication, across all topic pages
publication_records = []

for topic_relative_url in publication_page_urls:

    # Resolve the relative topic href against the main page url
    topic_global_url = urljoin(main_page_url, topic_relative_url)

    page = requests.get(topic_global_url, verify = False)

    # Fail loudly on an HTTP error status instead of scraping an error page
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")

    publications = soup.find_all(name = "div", class_ = "w3-cell-row")

    for publication in publications:

        publication_record = _extract_publication_record(publication)

        if publication_record is not None:
            publication_records.append(publication_record)

# Build the dataframe once from all records instead of concatenating a new dataframe for every topic page
publications_dataframe = pd.DataFrame(publication_records, columns = ["Publication Title", "Year", "Author List"])

print(publications_dataframe)
    

                                     Publication Title  Year  \
0    A Video-Based Augmented Reality System for Hum...  2023   
1    GAN-based Reactive Motion Synthesis with Class...  2022   
2    Spatio-temporal Manifold Learning for Human Mo...  2021   
3    A Quadruple Diffusion Convolutional Recurrent ...  2021   
4    A Generic Framework for Editing and Synthesizi...  2019   
..                                                 ...   ...   
216  Triplet Loss with Channel Attention for Person...  2019   
217  Less is More: Reducing Task and Model Complexi...  2023   
218  Enhancing Perception and Immersion in Pre-Capt...  2023   
219  Hierarchical Graph Convolutional Networks for ...  2023   
220  On the Design Fundamentals of Diffusion Models...  2023   

                                           Author List  
0    Kanglei Zhou, Ruizhi Cai, Yue Ma, Qingqing Tan...  
1    Qianhui Men, Hubert P. H. Shum, Edmond S. L. H...  
2    He Wang, Edmond S. L. Ho, Hubert P. H. Shum an...  
3  

In [7]:
# Count the authors of each publication: turn every " and " separator into ", " so that all authors are delimited
# the same way, split on ", " and take the number of resulting names
author_lists = publications_dataframe["Author List"].str.replace(" and ", ", ")
author_names = author_lists.str.split(", ")
publications_dataframe["Number of Authors"] = author_names.map(len)

print(publications_dataframe)

# Keep only the first occurrence of each publication title
unique_publications_dataframe = publications_dataframe.drop_duplicates(subset = ["Publication Title"])

print(unique_publications_dataframe)

                                     Publication Title  Year  \
0    A Video-Based Augmented Reality System for Hum...  2023   
1    GAN-based Reactive Motion Synthesis with Class...  2022   
2    Spatio-temporal Manifold Learning for Human Mo...  2021   
3    A Quadruple Diffusion Convolutional Recurrent ...  2021   
4    A Generic Framework for Editing and Synthesizi...  2019   
..                                                 ...   ...   
216  Triplet Loss with Channel Attention for Person...  2019   
217  Less is More: Reducing Task and Model Complexi...  2023   
218  Enhancing Perception and Immersion in Pre-Capt...  2023   
219  Hierarchical Graph Convolutional Networks for ...  2023   
220  On the Design Fundamentals of Diffusion Models...  2023   

                                           Author List  Number of Authors  
0    Kanglei Zhou, Ruizhi Cai, Yue Ma, Qingqing Tan...                 10  
1    Qianhui Men, Hubert P. H. Shum, Edmond S. L. H...                  4  
2  