In [1]:
# Full Name
# Email
# DR-NTU URL
# Website URL
# DBLP URL
# Citations (All) 
# %pip install fuzzywuzzy
from fuzzywuzzy import process
# %pip install scholarly
from urllib.error import HTTPError
import requests
import re
from bs4 import BeautifulSoup
from typing import Dict
import pandas as pd
from urllib.parse import quote_plus
import logging
from scholarly import scholarly
from typing import Tuple,List
import numpy as np


# CONFIG
# Base URL of the DR-NTU site; the scraping functions below prepend it to the
# relative paths they are given or find in scraped pages.
ORIGINAL_URL = "https://dr.ntu.edu.sg"  

def get_section_links(div_id: str, url: str, original_url: str = ORIGINAL_URL) -> Dict[str, str]:
    """
    Scrapes section links from a given URL and returns them as a dictionary.

    The section's "next >" link is followed page by page until no such link
    is left (or a page that was already visited comes up again).

    Parameters:
    - div_id (str): The ID of the div containing the section links
    - url (str): Path to scrape, appended to original_url
    - original_url (str): The base URL

    Returns:
    - dict: Maps each item's name (the anchor's title with "Filter by " removed)
      to the anchor's href. If a request fails or the section is missing on a
      page, whatever was collected up to that point is returned.
    """
    # Pager entries ("next >", "< previous") live in the same list as the real items.
    pager_text = re.compile(r'(next\s*>\s*)|(<\s*previous)', re.IGNORECASE)
    next_text = re.compile(r'next\s*>\s*')

    curr_url = original_url + url
    visited = set()  # guards against a pagination cycle looping forever
    output = {}
    with requests.Session() as session:
        while curr_url and curr_url not in visited:
            visited.add(curr_url)
            try:
                response = session.get(curr_url, timeout=10)
                response.raise_for_status()
            except requests.RequestException as e:
                # requests raises its own exception hierarchy (not urllib's HTTPError),
                # covering HTTP status errors, timeouts and connection failures.
                print(f"An error occurred: {e}")
                return output

            soup = BeautifulSoup(response.text, 'html.parser')
            section = soup.find('div', {'id': div_id})
            if section is None:
                print(f"An error occurred while parsing the section: Could not find {div_id} section")
                break

            for item in section.find_all('li', {'class': 'list-group-item'}):
                anchor = item.find('a')
                if anchor is None:
                    continue
                # The first pager entry marks the end of the real items on this page.
                if pager_text.search(anchor.text.strip()):
                    break

                title = anchor.get('title')
                href = anchor.get('href')
                if title is None or href is None:
                    # Skip a malformed item instead of abandoning the whole scrape.
                    continue
                output[title.replace("Filter by ", "")] = href

            next_button = section.find('a', string=next_text)
            next_href = next_button.get('href') if next_button is not None else None
            curr_url = original_url + url + next_href if next_href else None

    return output

def get_publications(url:str,session:requests.Session)->List[str]:
    """
    Fetches the "selected publications" page of a profile and returns its entries.

    Parameters:
    - url (str): Profile URL; "/selectedPublications.html" is appended to it
    - session (requests.Session): Session used for the request (reused across
      calls by the caller, so no new session is opened here)

    Returns:
    - list of str: One stripped string per publication entry, with the
      'Highly Cited:' marker removed. Empty when the request fails or the
      page has no 'facultyjournalDiv' content.
    """
    try:
        response = session.get(url+"/selectedPublications.html", timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    div_content = soup.find('div', {'id': 'facultyjournalDiv'})
    # The entries sit in the first nested div; without it there is nothing to parse.
    if div_content is None or div_content.div is None:
        return []

    # Text nodes are joined with a sentinel separator and split on it again,
    # which yields one string per text fragment of the nested div.
    entries = div_content.div.get_text(separator='<br/><br/>').split('<br/><br/>')
    entries = [entry.replace('<div>', '').replace('</div>', '').strip() for entry in entries if entry.strip()]
    return [entry for entry in entries if entry != 'Highly Cited:']

    

def get_website(url:str,session:requests.Session)-> str: #website
    """
    Returns the personal website link listed on a profile page.

    Parameters:
    - url (str): Profile URL to fetch
    - session (requests.Session): Session used for the request (reused across
      calls by the caller, so no new session is opened here)

    Returns:
    - str or None: The href of the first anchor inside 'personalsiteDiv' that
      has a non-empty href; None when the request fails, the div is missing,
      or no anchor carries a link.
    """
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    section = soup.find('div', {'id': 'personalsiteDiv'})
    if section is None:
        return None

    for anchor in section.find_all('a'):
        href = anchor.get('href')
        if href:
            return href

    # No anchors, or none with a usable href.
    return None


def generate_name_variants(full_name: str) -> list:
    """
    Builds alternative spellings of a name to use as search queries.

    The first whitespace-separated token is treated as the first name, the
    last token as the last name, and everything in between as middle names.

    Parameters:
    - full_name (str): Name as written in the source table

    Returns:
    - list of str: Variants in a fixed order: the name as given, then (only
      when middle names exist) three forms using middle initials, then
      "First Last", "Last, First", "F. Last" and "Last, F.".
      Duplicates are deliberately kept (e.g. a two-word name appears twice)
      because callers index into the list by attempt number.
      An empty list is returned for empty, whitespace-only or non-string input.
    """
    # Non-string cells (e.g. NaN from pandas) and blank names have no variants.
    if not isinstance(full_name, str):
        return []
    name_parts = full_name.split()
    if not name_parts:
        return []

    first_name = name_parts[0]
    last_name = name_parts[-1]
    first_initial = first_name[0]
    # split() never yields empty tokens, so every middle name has a first character.
    middle_initials = ' '.join(name[0] for name in name_parts[1:-1])

    variants = [full_name]
    if middle_initials:
        variants.append(f"{first_name} {middle_initials}. {last_name}")
        variants.append(f"{first_initial}. {middle_initials}. {last_name}")
        variants.append(f"{last_name}, {first_initial}. {middle_initials}.")
    variants.append(f"{first_name} {last_name}")
    variants.append(f"{last_name}, {first_name}")
    variants.append(f"{first_initial}. {last_name}")
    variants.append(f"{last_name}, {first_initial}.")

    return variants

def clean_up_func(df:pd.DataFrame)->pd.DataFrame:
    """
    Consolidates the scraped columns and applies hard-coded corrections.

    Parameters:
    - df (pd.DataFrame): Must contain the columns 'Full Name', 'Email',
      'DR_NTU_URL', 'Website_URL', 'DBLP_URL_PUBL', 'DBLP URL' and
      'Citations(All)'. It is not modified; the work is done on a copy.

    Returns:
    - pd.DataFrame: A new frame with the columns "Full Name", "Email",
      "DR_NTU_URL", "Website_URL", "DBLP_URL" and "Citations(All)".
    """
    # Hard-coded citation counts that replace whatever was scraped for these names.
    citation_overrides = {
        "Joty Shafiq Rayhan": 8359.0,
        "Ke Yiping, Kelly": 3623.0,
        "Lin Weisi": 27246.0,
        "Luke Ong （翁之昊）": 5931.0,
        "Ong Yew Soon": 23747.0,
        "Pan, Sinno Jialin": 37457.0,
        "Quek Hiok Chai": 8315.0,
        "Zhao Jun": 6983.0,
        "Zinovi Rabinovich": 1126.0,
    }

    # Work on a copy so the caller's frame is left untouched.
    cleaned = df.copy()

    # Prefer the publication-based DBLP match; fall back to the name-based one.
    cleaned['DBLP_URL'] = np.where(
        cleaned['DBLP_URL_PUBL'].isna(), cleaned['DBLP URL'], cleaned['DBLP_URL_PUBL']
    )
    # NOTE(review): presumably the automatic match for this person is wrong — confirm.
    cleaned.loc[cleaned['Full Name'] == "Tay Kian Boon", 'DBLP_URL'] = None

    for full_name, citations in citation_overrides.items():
        cleaned.loc[cleaned['Full Name'] == full_name, 'Citations(All)'] = citations

    return cleaned[["Full Name","Email","DR_NTU_URL","Website_URL","DBLP_URL","Citations(All)"]]

def search_dblp_publ(professor_name: str,publications:List[str],session: requests.Session) -> str:
    """
    Finds a professor's DBLP author page by searching DBLP for their publications.

    For each publication string, the leading part up to and including the
    first "(YYYY)" is sent to the DBLP publication search. The author names
    on the result page are fuzzy-matched against professor_name.

    Parameters:
    - professor_name (str): Name to match against the authors found
    - publications (list of str): Publication strings to try, in order
    - session (requests.Session): Session used for the requests

    Returns:
    - str or None: The href of the best-matching author (fuzzy score >= 70)
      from the first publication that yields one; None if no publication does.
    """
    base_url = "https://dblp.org/search/publ"
    min_score = 70  # fuzzy-match threshold below which a candidate is rejected

    # A missing value (e.g. NaN) in the publications column means nothing to search.
    if not isinstance(publications, (list, tuple)):
        return None

    for publication in publications:
        if not isinstance(publication, str):
            continue
        match = re.match(r'^(.*?\(\d{4}\))', publication)
        if not match:
            continue

        search_url = f"{base_url}?q={quote_plus(match.group(1))}"
        try:
            response = session.get(search_url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            # One failed query should not abort the lookup (or the caller's
            # DataFrame.apply); move on to the next publication.
            logging.warning(f"DBLP publication search failed for {professor_name}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        author_data = {}
        for author_element in soup.select('span[itemprop="author"]'):
            name_tag = author_element.find('span', itemprop="name")
            link_tag = author_element.find('a', itemprop="url")
            if name_tag is None or link_tag is None or not link_tag.get('href'):
                continue
            author_data[name_tag.text] = link_tag['href']

        if not author_data:
            continue

        best_match, score = process.extractOne(professor_name, list(author_data))
        if score >= min_score:
            return author_data[best_match]

    return None

def search_dblp(professor_name: str, session: requests.Session) -> str:
    """
    Finds a professor's DBLP author page by searching DBLP for their name.

    Every variant from generate_name_variants is tried in order until one
    produces an author hit.

    Parameters:
    - professor_name (str): Name to search for
    - session (requests.Session): Session used for the requests

    Returns:
    - str or None: The href of the first author entry that has both a name
      and a URL; None if a request fails or no variant produces a hit.
      NOTE(review): the first hit is accepted without comparing its name or
      affiliation to the professor — confirm this is intended.
    """
    base_url = "https://dblp.org/search"
    variants = generate_name_variants(professor_name)

    for variant in variants:
        search_url = f"{base_url}?q={quote_plus(variant)}"
        try:
            response = session.get(search_url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            logging.critical(f"An unexpected error occurred: {e}. Stopping.")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        section = soup.find('div', {'id': 'completesearch-authors'})
        if section is None:
            # No author block for this variant; try the next spelling rather than giving up.
            continue

        for item in section.find_all('li', {'itemscope': '','itemtype': 'http://schema.org/Person'}):
            name_tag = item.find('span', {'itemprop': 'name'})
            href_tag = item.find('a', {'itemprop': 'url'})
            if name_tag is not None and href_tag is not None and href_tag.get('href'):
                return href_tag['href']

    logging.error(f"Failed to find dblp link for {professor_name} after {len(variants)} attempts.")
    return None

def get_citations(professor_name:str):
    """
    Looks up a professor's total citation count through `scholarly`.

    At most the first three name variants are queried. A search result is
    skipped when it has an affiliation that mentions neither "ntu" nor
    "nanyang technological university"; results without an affiliation are
    not filtered. The 'citedby' value of the first remaining result that has
    one is returned.

    Parameters:
    - professor_name (str): Name to search for

    Returns:
    - The matching author's 'citedby' value, or None if nothing matched.
    """
    ntu_markers = ("ntu", "nanyang technological university")

    # Only the first three variants are ever tried.
    for candidate_name in generate_name_variants(professor_name)[:3]:
        try:
            for profile in scholarly.search_author(candidate_name):
                if 'affiliation' in profile:
                    affiliation = profile['affiliation'].lower()
                    if not any(marker in affiliation for marker in ntu_markers):
                        continue
                if 'citedby' not in profile:
                    print(f"No citation data found for {professor_name}")
                    continue
                return profile['citedby']
        except StopIteration:
            print(f"No data found for {professor_name}")

    return None
    

def get_professor_data(fac: str, url: str, original_url: str = ORIGINAL_URL) -> pd.DataFrame:
    """
    Scrapes professor data from a given URL and returns it as a DataFrame.

    The listing is read page by page (following the pagination's 'next'
    entry). Each professor is then enriched with a website URL, publications,
    DBLP links and a citation count before clean_up_func selects the final
    columns.

    Parameters:
    - fac (str): Faculty name
    - url (str): URL to scrape
    - original_url (str): The base URL

    Returns:
    - pd.DataFrame: DataFrame containing the scraped data. If a page request
      fails, the pages collected before the failure are still enriched and
      returned. An empty DataFrame is returned when no table could be read.
    """
    # Local import keeps this block self-contained; pandas expects literal HTML
    # to be wrapped in a file-like object.
    from io import StringIO

    curr_url = original_url + url
    page_frames = []

    with requests.Session() as session:
        while curr_url:
            try:
                response = session.get(curr_url, timeout=10)
                response.raise_for_status()
            except requests.RequestException as e:
                print(f"An error occurred: {e}")
                break

            soup = BeautifulSoup(response.text, 'html.parser')

            # Work out the next page first so a table-parsing failure on this
            # page does not stop pagination.
            try:
                pagination = soup.find('ul', {'class': 'pagination'})
                next_page = pagination.find('li', string='next')
                next_page_a = next_page.find('a') if next_page else None
                curr_url = original_url + next_page_a['href'] if next_page_a else None
            except AttributeError:
                print("Error while parsing pagination. Stopping.")
                curr_url = None

            try:
                table = soup.find('table', {'class': 'table'})
                df = pd.read_html(StringIO(str(table)))[0]
                # Skip the header row; each remaining row links to a profile page.
                rows = table.find_all('tr')[1:]
                df['DR_NTU_URL'] = [original_url + row.find('a')['href'] for row in rows]
                page_frames.append(df)
            except Exception as e:
                print(f"An error occurred while parsing the table: {e}")

        if not page_frames:
            # Nothing was scraped, so there is nothing to enrich or clean.
            return pd.DataFrame()

        # Concatenate once instead of re-copying the accumulated frame per page.
        res_df = pd.concat(page_frames, ignore_index=True)
        res_df["Faculty"] = fac

        res_df["Website_URL"]= res_df.apply(lambda row : get_website(row['DR_NTU_URL'],session), axis = 1)
        res_df["Publications"]= res_df.apply(lambda row : get_publications(row['DR_NTU_URL'],session), axis = 1)

        res_df["DBLP_URL_PUBL"]= res_df.apply(lambda row : search_dblp_publ(row['Full Name'],row['Publications'],session), axis = 1)
        res_df["DBLP URL"]= res_df.apply(lambda row : search_dblp(row['Full Name'],session), axis = 1)

        res_df["Citations(All)"] = res_df.apply(lambda row : get_citations(row['Full Name']), axis = 1)

        res_df = clean_up_func(res_df)

    return res_df




In [4]:
# Load the previously saved professor table; later cells read its
# 'Full Name' and 'DR_NTU_URL' columns.
original_df = pd.read_csv("Cheam_Zhong_Wei_Caleb.csv")
original_df

Unnamed: 0,Full Name,Email,DR_NTU_URL,Website_URL,DBLP_URL,Citations(All)
0,A S Madhukumar,asmadhukumar@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00083,http://www3.ntu.edu.sg/home/asmadhukumar/,https://dblp.org/pid/66/549.html,2907.0
1,Alexei Sourin,assourin@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00274,http://www3.ntu.edu.sg/home/assourin/,https://dblp.org/pid/15/3108.html,2939.0
2,Anupam Chattopadhyay,anupam@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01076,https://scholar.google.co.in/citations?user=TI...,https://dblp.org/pid/99/4535.html,6226.0
3,Anwitaman Datta,anwitaman@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00706,https://personal.ntu.edu.sg/anwitaman/,https://dblp.org/pid/d/AnwitamanDatta.html,8047.0
4,Arvind Easwaran,arvinde@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00687,https://cps-research-group.github.io/,https://dblp.org/pid/73/1708.html,2817.0
...,...,...,...,...,...,...
81,Zhang Jie,zhangj@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00759,https://personal.ntu.edu.sg/zhangj/,https://dblp.org/pid/84/6889-2.html,12227.0
82,Zhang Tianwei,tianwei.zhang@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00562,https://personal.ntu.edu.sg/tianwei.zhang/,https://dblp.org/pid/77/7902-4.html,2695.0
83,Zhao Jun,junzhao@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00501,http://junzhaogroupntu.github.io,https://dblp.org/pid/47/2026-1.html,6983.0
84,Zheng Jianmin,asjmzheng@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00072,https://personal.ntu.edu.sg/asjmzheng,https://dblp.org/pid/09/5452.html,7216.0


# Getting each professor's description: biography, research interests, grants, and keywords

In [38]:
def get_description(url: str, session: requests.Session) -> pd.DataFrame:
    """
    Scrapes the descriptive sections of one profile page.

    Parameters:
    - url (str): Profile URL to fetch
    - session (requests.Session): Session used for the request

    Returns:
    - pd.DataFrame: A single-row frame with the columns 'biography',
      'research', 'grants', 'keywords' and 'DR_NTU_URL'. A section missing
      from the page yields an empty string. An empty DataFrame is returned
      when the request fails.
    """
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return pd.DataFrame()

    soup = BeautifulSoup(response.text, 'html.parser')
    biography_div = soup.find('div', {'id': 'biographyDiv'})
    research_div = soup.find('div', {'id': 'researchinterestsDiv'})
    grants_div = soup.find('div', {'id': 'currentgrantsDiv'})
    keywords_div = soup.find('div', {'id': 'researchkeywords'})

    # Every section is optional: a profile without a biography still keeps its row.
    biography = biography_div.get_text(strip=True) if biography_div is not None else ""
    research = research_div.get_text(strip=True) if research_div is not None else ""
    grants = [li.get_text() for li in grants_div.find_all('li')] if grants_div is not None else []
    keywords = (
        [span.get_text(strip=True) for span in keywords_div.find_all('span', {'class': 'rkeyword'})]
        if keywords_div is not None else []
    )

    data = {
        "biography": [biography],
        "research": [research],
        "grants": [", ".join(grants)],
        "keywords": [", ".join(keywords)],
        "DR_NTU_URL":[url]
    }
    return pd.DataFrame(data)

# Fetch every professor's profile through one shared HTTP session and collect
# the single-row frames.
description_frames = []
with requests.Session() as session:
    for url in original_df["DR_NTU_URL"].tolist():
        new_df = get_description(url,session)
        # get_description returns an empty frame on failure; leave those out.
        if not new_df.empty:
            description_frames.append(new_df)

# Concatenate once at the end instead of re-copying the accumulated frame on
# every iteration.
prof_df = pd.concat(description_frames) if description_frames else pd.DataFrame()

df = prof_df.reset_index(drop = True)

df.to_csv("prof_data.csv",index=False)

## Getting each professor's Google Scholar ID (via `scholarly`) in order to extract publication time series and citation data

In [43]:
def get_author_id(professor_name:str):
    """
    Looks up a professor's 'scholar_id' through `scholarly`.

    At most the first three name variants are queried. A search result is
    skipped when it has an affiliation that mentions neither "ntu" nor
    "nanyang technological university"; results without an affiliation are
    not filtered.

    NOTE(review): a result is only accepted when it also has a 'citedby'
    entry, although the value returned is 'scholar_id' — confirm this gate
    is intended.

    Parameters:
    - professor_name (str): Name to search for

    Returns:
    - The accepted author's 'scholar_id', or None if nothing matched.
    """
    affiliation_keywords = ["ntu","nanyang technological university"]
    max_variants = 3
    name_variants = generate_name_variants(professor_name)

    for query_name in name_variants[:max_variants]:
        try:
            for candidate in scholarly.search_author(query_name):
                if 'affiliation' in candidate:
                    affiliation_text = candidate['affiliation'].lower()
                    mentions_ntu = any(keyword in affiliation_text for keyword in affiliation_keywords)
                    if not mentions_ntu:
                        continue
                if 'citedby' in candidate:
                    return candidate['scholar_id']
                print(f"No citation data found for {professor_name}")
        except StopIteration:
            print(f"No data found for {professor_name}")

    return None


# Look up a scholar ID for each professor by name and store it in a new
# 'scholar_id' column (added to original_df in place); get_author_id returns
# None when no match is found.
original_df["scholar_id"] = original_df.apply(lambda row : get_author_id(row['Full Name']), axis = 1)
original_df

Unnamed: 0,Full Name,Email,DR_NTU_URL,Website_URL,DBLP_URL,Citations(All),scholar_id
0,A S Madhukumar,asmadhukumar@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00083,http://www3.ntu.edu.sg/home/asmadhukumar/,https://dblp.org/pid/66/549.html,2907.0,7_AzrLwAAAAJ
1,Alexei Sourin,assourin@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00274,http://www3.ntu.edu.sg/home/assourin/,https://dblp.org/pid/15/3108.html,2939.0,8A7kHCYAAAAJ
2,Anupam Chattopadhyay,anupam@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01076,https://scholar.google.co.in/citations?user=TI...,https://dblp.org/pid/99/4535.html,6226.0,TIt4ggwAAAAJ
3,Anwitaman Datta,anwitaman@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00706,https://personal.ntu.edu.sg/anwitaman/,https://dblp.org/pid/d/AnwitamanDatta.html,8047.0,VWi3_OIAAAAJ
4,Arvind Easwaran,arvinde@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00687,https://cps-research-group.github.io/,https://dblp.org/pid/73/1708.html,2817.0,B_ouhTgAAAAJ
...,...,...,...,...,...,...,...
81,Zhang Jie,zhangj@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00759,https://personal.ntu.edu.sg/zhangj/,https://dblp.org/pid/84/6889-2.html,12227.0,IFV_RdMAAAAJ
82,Zhang Tianwei,tianwei.zhang@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00562,https://personal.ntu.edu.sg/tianwei.zhang/,https://dblp.org/pid/77/7902-4.html,2695.0,9vpiYDIAAAAJ
83,Zhao Jun,junzhao@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00501,http://junzhaogroupntu.github.io,https://dblp.org/pid/47/2026-1.html,6983.0,
84,Zheng Jianmin,asjmzheng@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00072,https://personal.ntu.edu.sg/asjmzheng,https://dblp.org/pid/09/5452.html,7216.0,sGCf2k0AAAAJ


In [44]:
# Save original_df to CSV without writing the index column.
original_df.to_csv("original_data.csv",index=False)

In [60]:
from scholarly import scholarly
import pandas as pd

def get_author_data(scholar_id:str, max_publications:int = 30)->pd.DataFrame:
    """
    Fetches one author's profile through `scholarly` and summarises it.

    Parameters:
    - scholar_id (str): Author ID passed to scholarly.search_author_id
    - max_publications (int): Only the first this-many entries of the
      author's publication list are kept (default 30)

    Returns:
    - pd.DataFrame: A single-row frame with the columns
      'NTU_Affiliated_Coauthors', 'Non_Ntu_Affiliated_Coauthors', 'image_url',
      'publication_data' (dict form of a Title/Year/Citations/Venue table),
      'citation_data' (dict form of a Year/Citations table), 'scholar_id' and
      'non_ntu_affliations' (set of lower-cased co-author affiliations that
      do not mention NTU). Fields missing from the profile become None or
      empty containers instead of raising KeyError.
    """
    # Substring match: note that "ntu" also matches inside longer words.
    ntu_strings = ["ntu","nanyang technological university"]

    author = scholarly.search_author_id(scholar_id)
    filled_author = scholarly.fill(author)

    # Per-year citation counts, stored in DataFrame.to_dict() layout.
    citations_per_year = filled_author.get('cites_per_year') or {}
    citation_df = pd.DataFrame(list(citations_per_year.items()), columns=['Year', 'Citations'])
    citation_data = citation_df.to_dict()

    publication_data = []
    for pub in (filled_author.get('publications') or [])[:max_publications]:
        bib = pub.get('bib') or {}
        publication_data.append({
            'Title': bib.get('title'),
            'Year': bib.get('pub_year'),
            'Citations': pub.get('num_citations'),
            'Venue': bib.get('venue'),
        })
    publications_data = pd.DataFrame(publication_data).to_dict()

    image_url = filled_author.get('url_picture')

    outside_ntu = set()
    ntu_affiliated = 0
    non_ntu_affiliated = 0
    for co_author in filled_author.get('coauthors') or []:
        # A co-author without an affiliation is treated as a blank, non-NTU one.
        affiliation = (co_author.get('affiliation') or "").lower()
        if any(ele in affiliation for ele in ntu_strings):
            ntu_affiliated += 1
        else:
            non_ntu_affiliated += 1
            outside_ntu.add(affiliation)

    data = {
        'NTU_Affiliated_Coauthors': [ntu_affiliated],
        'Non_Ntu_Affiliated_Coauthors': [non_ntu_affiliated],
        'image_url': [image_url],
        'publication_data': [publications_data],
        'citation_data':[citation_data],
        'scholar_id':[scholar_id],
        'non_ntu_affliations':[outside_ntu]
    }

    return pd.DataFrame(data)

# Fetch the author summary for every professor that has a scholar_id.
scholar_frames = []
for scholar_id in original_df['scholar_id'].dropna().tolist():
    scholar_frames.append(get_author_data(scholar_id))

# Concatenate once at the end instead of re-copying the accumulated frame on
# every iteration.
google_scholar = pd.concat(scholar_frames, axis = 0) if scholar_frames else pd.DataFrame()

google_scholar = google_scholar.reset_index(drop=True)
google_scholar


Unnamed: 0,NTU_Affiliated_Coauthors,Non_Ntu_Affiliated_Coauthors,image_url,publication_data,citation_data,scholar_id,non_ntu_affliations
0,7,30,https://scholar.googleusercontent.com/citation...,{'Title': {0: 'CDMA system with frequency doma...,"{'Year': {0: 2002, 1: 2003, 2: 2004, 3: 2005, ...",7_AzrLwAAAAJ,"{, cmr institute of technology, institute of i..."
1,3,6,https://scholar.googleusercontent.com/citation...,{'Title': {0: 'Function representation in geom...,"{'Year': {0: 1995, 1: 1996, 2: 1997, 3: 1998, ...",8A7kHCYAAAAJ,{department of fluid mechanics & thermodynamic...
2,15,121,https://scholar.googleusercontent.com/citation...,{'Title': {0: 'Adversarial attacks and defence...,"{'Year': {0: 2005, 1: 2006, 2: 2007, 3: 2008, ...",TIt4ggwAAAAJ,"{, national university of singapore, professor..."
3,6,64,https://scholar.googleusercontent.com/citation...,{'Title': {0: 'Edge-centric computing: Vision ...,"{'Year': {0: 2003, 1: 2004, 2: 2005, 3: 2006, ...",VWi3_OIAAAAJ,"{institute of informatics, university of warsa..."
4,0,0,https://scholar.googleusercontent.com/citation...,{'Title': {0: 'Compositional analysis framewor...,"{'Year': {0: 2006, 1: 2007, 2: 2008, 3: 2009, ...",B_ouhTgAAAAJ,{}
...,...,...,...,...,...,...,...
64,57,104,https://scholar.googleusercontent.com/citation...,{'Title': {0: 'Advances and open problems in f...,"{'Year': {0: 2011, 1: 2012, 2: 2013, 3: 2014, ...",eXgoTXMAAAAJ,"{amazon, victoria university, senior researche..."
65,8,83,https://scholar.googleusercontent.com/citation...,{'Title': {0: 'Neural collaborative filtering'...,"{'Year': {0: 2015, 1: 2016, 2: 2017, 3: 2018, ...",YG0DFyYAAAAJ,{university of science and technology of china...
66,13,67,https://scholar.googleusercontent.com/citation...,{'Title': {0: 'TrustSVD: Collaborative Filteri...,"{'Year': {0: 2008, 1: 2009, 2: 2010, 3: 2011, ...",IFV_RdMAAAAJ,"{, national university of singapore, micron, h..."
67,0,0,https://scholar.googleusercontent.com/citation...,{'Title': {0: 'Cloudradar: A real-time side-ch...,"{'Year': {0: 2015, 1: 2016, 2: 2017, 3: 2018, ...",9vpiYDIAAAAJ,{}


In [61]:
# Save the author summary table to CSV without writing the index column.
google_scholar.to_csv('google_scholar_data.csv',index=False)