In [1]:
import concurrent.futures
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from scholarly import scholarly

In [2]:

# Read the CSV file
df = pd.read_csv('csrankings.csv')

# Define keywords for each field
system_keywords = ['system', 'architecture', 'hardware']
ai_keywords = ['artificial intelligence', 'machine learning', 'deep learning', 'computer vision']
theory_keywords = ['theory', 'algorithm', 'computational complexity']
interdisciplinary_keywords = ['interdisciplinary', 'cross-disciplinary', 'multi-disciplinary']

# Create columns for each field
df['System'] = False
df['AI'] = False
df['Theory'] = False
df['Interdisciplinary'] = False
df['Citations'] = 0


In [3]:
def check_keywords(url, keywords):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        interests = soup.find('div', class_='interests').text  # Replace with the actual HTML structure
        return any(keyword in interests.lower() for keyword in keywords)
    except Exception as e:
        print(f"Error accessing {url}: {e}")
        return False

def search_google_scholar(author_id, author_name):
    try:
        if author_id != 'NOSCHOLARPAGE':
            author = scholarly.search_author_id(author_id)
        else:
            search_query = scholarly.search_author(author_name)
            author = next(search_query)
        return author
    except Exception as e:
        print(f"Exception for author with ID {author_id}: {e}")
        return None

def process_chunk(chunk):
    for index, row in chunk.iterrows():
        author_id = str(row['scholarid'])  # Replace 'GoogleScholarID' with the actual column name
        author_name = str(row['name'])  # Replace 'AuthorName' with the actual column name
        author_info = search_google_scholar(author_id, author_name)

        if author_info:
            chunk.at[index, 'Citations'] = author_info.get('citedby', 0)

            interests = author_info.get('interests', [])
            chunk.at[index, 'System'] = any(keyword in interests for keyword in system_keywords)
            chunk.at[index, 'AI'] = any(keyword in interests for keyword in ai_keywords)
            chunk.at[index, 'Theory'] = any(keyword in interests for keyword in theory_keywords)
            chunk.at[index, 'Interdisciplinary'] = any(keyword in interests for keyword in interdisciplinary_keywords)

    return chunk

In [4]:
chunks = np.array_split(df, len(df) // 2000 + 1)

In [None]:

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit the processing function for each chunk
    futures = [executor.submit(process_chunk, chunk) for chunk in chunks]

    # Wait for all threads to finish
    concurrent.futures.wait(futures)

# Get the results from the completed threads
processed_chunks = [future.result() for future in futures]

# Concatenate the processed chunks back into a single DataFrame
df_processed = pd.concat(processed_chunks, ignore_index=True)

# Save the updated DataFrame to a new CSV file
df_processed.to_csv('./results/tagged_faculty_data_processed.csv', index=False)


In [11]:
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup

In [6]:
user_agent = UserAgent()
headers = {'User-Agent': user_agent.random}
url = f'https://scholar.google.com/citations?hl=en&user={"dEiLfCQAAAAJ"}'
response = requests.get(url, headers=headers)

In [8]:
print(response.content)

b'<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">\n<html>\n<head><meta http-equiv="content-type" content="text/html; charset=utf-8"><meta name="viewport" content="initial-scale=1"><title>https://scholar.google.com/citations?hl=en&amp;user=dEiLfCQAAAAJ</title></head>\n<body style="font-family: arial, sans-serif; background-color: #fff; color: #000; padding:20px; font-size:18px; overscroll-behavior:contain;" onload="e=document.getElementById(\'captcha\');if(e){e.focus();} if(solveSimpleChallenge) {solveSimpleChallenge(,);}">\n<div style="max-width:400px;">\n<hr noshade size="1" style="color:#ccc; background-color:#ccc;"><br>\n<div style="font-size:13px;">\nOur systems have detected unusual traffic from your computer network.  Please try your request again later.  <a href="#" onclick="document.getElementById(\'infoDiv0\').style.display=\'block\';">Why did this happen?</a><br><br>\n<div id="infoDiv0" style="display:none; background-color:#eee; padding:10px; margin:0 0 15px 

In [12]:
soup = BeautifulSoup(response.content, 'html.parser')

In [13]:
cited_by_element = soup.find('div', {'id': 'gsc_prf_pua'})
cited_by = int(cited_by_element.text.strip()) if cited_by_element else 0

In [16]:
print (soup)

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">

<html>
<head><meta content="text/html; charset=utf-8" http-equiv="content-type"/><meta content="initial-scale=1" name="viewport"/><title>https://scholar.google.com/citations?hl=en&amp;user=dEiLfCQAAAAJ</title></head>
<body onload="e=document.getElementById('captcha');if(e){e.focus();} if(solveSimpleChallenge) {solveSimpleChallenge(,);}" style="font-family: arial, sans-serif; background-color: #fff; color: #000; padding:20px; font-size:18px; overscroll-behavior:contain;">
<div style="max-width:400px;">
<hr noshade="" size="1" style="color:#ccc; background-color:#ccc;"/><br/>
<div style="font-size:13px;">
Our systems have detected unusual traffic from your computer network.  Please try your request again later.  <a href="#" onclick="document.getElementById('infoDiv0').style.display='block';">Why did this happen?</a><br/><br/>
<div id="infoDiv0" style="display:none; background-color:#eee; padding:10px; margin:0 0 15px 0; lin