In [1]:
# Week 1 - Web Scraping Q1

from google.colab import drive
import requests
from bs4 import BeautifulSoup
import re

# Mount Google Drive
drive.mount('/content/drive')

def clean_name(name):
    # Clean name - remove special characters and unnecessary whitespace
    name = re.sub(r'[,\n\t\r]', '', name)
    name = re.sub(r'\s+', ' ', name)
    name = name.strip()

    # Remove <u> tags
    name = re.sub(r'</?u>', '', name)

    # Exclude names that are too short or contain digits
    if len(name) < 2 or bool(re.search(r'\d', name)):
        return None

    return name

def extract_names_from_text(text):
    # Extract names from text
    # Create an empty list to hold names
    name_list = []

    # Handle authors list separated by commas
    if ',' in text:
        author_list = text.split(',')
        for author in author_list:
            name = clean_name(author)
            if name:
                name_list.append(name)
    else:
        name = clean_name(text)
        if name:
            name_list.append(name)

    return name_list

def parse_html_for_names(html_content):
    # Extract all researcher names from HTML
    soup = BeautifulSoup(html_content, 'html.parser')
    # Remove duplicates
    name_list_to_set = set()

    # Find all list items containing presentation titles and author information
    for li in soup.find_all('li'):
        text = li.get_text()
        if not text:
            continue

        # Find <i> tags containing author information
        authors_tag = li.find('i')
        if authors_tag:
            authors_text = authors_tag.get_text()
            name_list = extract_names_from_text(authors_text)
            name_list_to_set.update(name_list)

    # Find session chairs
    chair_patterns = soup.find_all(string=re.compile(r'Chair:', re.IGNORECASE))
    for pattern in chair_patterns:
        chair_text = pattern.strip()
        if 'Chair:' in chair_text:
            chair_name = chair_text.split('Chair:')[1]
            name = clean_name(chair_name)
            if name:
                name_list_to_set.add(name)

    return sorted(list(name_list_to_set))

def main():
    # Set HTML file path (Google Drive path)
    file_path = '/content/drive/MyDrive/IC2S2_2023.html'
    # Alternatively, URL can be used directly

    # Read HTML file
    with open(file_path, 'r', encoding='utf-8') as f:
        html_content = f.read()

    # Extract names
    names = parse_html_for_names(html_content)

    # Save results to file
    output_path = '/content/drive/MyDrive/ic2s2_2023_researchers.txt'
    df_path = '/content/drive/MyDrive/ic2s2_2023_researchers.csv'

    with open(output_path, 'w', encoding='utf-8') as f:
        for name in names:
            f.write(name + '\n')

    print(f"A total of {len(names)} unique researcher names have been extracted.")
    print(f"Results have been saved to {output_path} and {df_path}.")

if __name__ == "__main__":
    main()


Mounted at /content/drive
A total of 1484 unique researcher names have been extracted.
Results have been saved to /content/drive/MyDrive/ic2s2_2023_researchers.txt and /content/drive/MyDrive/ic2s2_2023_researchers.csv.


In [2]:
# Q2

!pip install fuzzywuzzy[speedup]

import pandas as pd
import re
from fuzzywuzzy import fuzz
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

def extract_person_from_candidate(candidate):
    """
    Since each cell entry is in the format "Person Name, Other Information",
    extract the text before the first comma as the person's name.
    If an affiliation in parentheses is present, remove the parentheses and its content.

    Example:
      "Chris Kempes, ..."                  -> "Chris Kempes"
      "Chris Kempes (Santa Fe Institute)"  -> "Chris Kempes"
    """
    candidate = candidate.strip()
    # If there is a comma, use the text before the first comma; otherwise, use the entire text
    if ',' in candidate:
        person = candidate.split(',', 1)[0]
    else:
        person = candidate
    # Remove parentheses and the text within
    person = re.sub(r'\s*\(.*?\)', '', person)
    return person.strip()

def extract_names_from_cell(cell):
    """
    Assumes the cell text is in the format "Person Name, Other Information; Person Name, Other Information; ..."
    Splits the string by semicolons (;) and applies extract_person_from_candidate on each entry.
    Returns only those names that consist of two or more words.
    """
    names = []
    parts = cell.split(';')
    for part in parts:
        part = part.strip()
        if not part:
            continue
        person_name = extract_person_from_candidate(part)
        if len(person_name.split()) >= 2:
            names.append(person_name)
    return names

def extract_names_from_df(df, column_name):
    """
    From the specified column (e.g., 'Poster authors' or 'Presentation authors') of the given DataFrame,
    apply extract_names_from_cell() to extract all candidate person names.
    """
    authors = []
    if column_name in df.columns:
        for entry in df[column_name].dropna():
            authors.extend(extract_names_from_cell(entry))
    else:
        print(f"Column '{column_name}' does not exist. Available columns: {df.columns.tolist()}")
    return authors

def cluster_names(names, threshold=90):
    """
    Uses fuzzywuzzy's token_sort_ratio to group names that have a similarity score above the threshold,
    considering them as the same individual. Within each cluster, the shortest (cleanest) version of the name
    is selected as the representative.

    Returns:
      representative_names: Final list of unique person names
      clusters: List of names for each cluster (for debugging)
    """
    names_list = list(set(names))
    clusters = []
    used = set()
    for i, name in enumerate(names_list):
        if name in used:
            continue
        cluster = [name]
        used.add(name)
        for other in names_list[i+1:]:
            if other in used:
                continue
            score = fuzz.token_sort_ratio(name, other)
            if score >= threshold:
                cluster.append(other)
                used.add(other)
        clusters.append(cluster)
    representative_names = [min(cluster, key=len) for cluster in clusters]
    return representative_names, clusters

def main():
    # Set CSV file paths (using actual Google Drive paths)
    poster_csv    = '/content/drive/MyDrive/IC2S2_2024_posters.csv'
    lightning_csv = '/content/drive/MyDrive/IC2S2_2024_lightning_talks.csv'
    orals_csv     = '/content/drive/MyDrive/IC2S2_2024_oral_panels.csv'

    # Read CSV files
    posters_df   = pd.read_csv(poster_csv)
    lightning_df = pd.read_csv(lightning_csv)
    orals_df     = pd.read_csv(orals_csv)

    # Extract candidate names from the author columns of each DataFrame
    poster_authors      = extract_names_from_df(posters_df, 'Poster authors')
    lightning_authors   = extract_names_from_df(lightning_df, 'Presentation authors')
    orals_authors       = extract_names_from_df(orals_df, 'Presentation authors')

    # Combine the results from all three files
    all_authors = poster_authors + lightning_authors + orals_authors
    print("Total extracted candidate count (including duplicates):", len(all_authors))

    # Remove duplicates
    unique_authors = list(set(all_authors))
    print("Unique candidate count after removing duplicates:", len(unique_authors))

    # Use fuzzy matching to group slightly variant names and select representative names
    final_names, clusters = cluster_names(unique_authors, threshold=90)
    final_names = sorted(final_names)

    # Set the output file path for the results
    output_path = '/content/drive/MyDrive/IC2S2_2024_final_person_names_from_csv.txt'
    with open(output_path, 'w', encoding='utf-8') as f:
        for name in final_names:
            f.write(name + "\n")

    print("Final unique person name count:", len(final_names))
    print("Result file saved at:", output_path)

if __name__ == "__main__":
    main()

Collecting fuzzywuzzy[speedup]
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-levenshtein>=0.12 (from fuzzywuzzy[speedup])
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading levenshtein-0.26.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading rapidfuzz-3.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.7/162.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Do

In [3]:
# Check Aggregated Names with fuzzywuzzy

import pandas as pd
import re
from fuzzywuzzy import fuzz
from collections import defaultdict
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

def extract_person_from_candidate(candidate):
    """
    Since each cell entry is formatted as "Person Name, Other Information",
    extract the text before the first comma as the person's name,
    and if an affiliation in parentheses exists, remove it.
    """
    candidate = candidate.strip()
    if ',' in candidate:
        person = candidate.split(',', 1)[0]
    else:
        person = candidate
    # Remove parentheses and the text within
    person = re.sub(r'\s*\(.*?\)', '', person)
    return person.strip()

def extract_names_from_cell(cell):
    """
    Assumes that the cell text is in the format "Person Name, Other Information; Person Name, Other Information; ..."
    Splits the text by semicolons (;) and applies the extract_person_from_candidate function to each part.
    """
    names = []
    parts = cell.split(';')
    for part in parts:
        part = part.strip()
        if not part:
            continue
        person_name = extract_person_from_candidate(part)
        if len(person_name.split()) >= 2:  # Assume a valid person name has at least two words
            names.append(person_name)
    return names

def extract_names_from_df(df, column_name):
    """
    From the specified column (e.g., 'Poster authors' or 'Presentation authors') of the given DataFrame,
    apply extract_names_from_cell() on each cell to extract all candidate person names.
    """
    authors = []
    if column_name in df.columns:
        for entry in df[column_name].dropna():
            authors.extend(extract_names_from_cell(entry))
    else:
        print(f"Column '{column_name}' does not exist. Available columns: {df.columns.tolist()}")
    return authors

def cluster_names_with_logging(names, threshold=90):
    """
    Uses fuzzywuzzy to group names with a similarity score above the threshold,
    treating them as the same individual. Within each cluster, the shortest name is selected
    as the representative, and the merged names are logged.

    Returns:
      representative_names: Final list of unique person names.
      clusters: List of names within each cluster (for debugging).
      merge_log: Dictionary mapping representative names to the merged names.
    """
    names_list = list(set(names))  # Remove duplicates
    clusters = []
    used = set()
    merge_log = defaultdict(list)  # Log for merged names

    for i, name in enumerate(names_list):
        if name in used:
            continue
        cluster = [name]
        used.add(name)
        for other in names_list[i+1:]:
            if other in used:
                continue
            score = fuzz.token_sort_ratio(name, other)
            if score >= threshold:
                cluster.append(other)
                used.add(other)
        clusters.append(cluster)
        representative_name = min(cluster, key=len)  # Select the shortest name as the representative
        for merged_name in cluster:
            if merged_name != representative_name:
                merge_log[representative_name].append(merged_name)

    representative_names = [min(cluster, key=len) for cluster in clusters]
    return representative_names, clusters, merge_log

def print_merge_results(merge_log):
    """A function to neatly print the merge results."""
    print("\n=== Merged Names Results ===")
    for representative, merged_names in merge_log.items():
        if merged_names:  # Only print if there are merged names
            print(f"\nRepresentative Name: {representative}")
            print(f"Merged Names: {', '.join(merged_names)}")
    print("\n============================")

def main():
    # Set CSV file paths (using paths in Google Drive)
    poster_csv = '/content/drive/MyDrive/IC2S2_2024_posters.csv'
    lightning_csv = '/content/drive/MyDrive/IC2S2_2024_lightning_talks.csv'
    orals_csv = '/content/drive/MyDrive/IC2S2_2024_oral_panels.csv'

    # Read the CSV files
    posters_df = pd.read_csv(poster_csv)
    lightning_df = pd.read_csv(lightning_csv)
    orals_df = pd.read_csv(orals_csv)

    # Extract candidate person names from the author columns of each DataFrame
    poster_authors = extract_names_from_df(posters_df, 'Poster authors')
    lightning_authors = extract_names_from_df(lightning_df, 'Presentation authors')
    orals_authors = extract_names_from_df(orals_df, 'Presentation authors')

    # Combine the results from all three files
    all_authors = poster_authors + lightning_authors + orals_authors

    # Remove duplicates and use fuzzy matching to group similar names, logging the merges
    final_names, clusters, merge_log = cluster_names_with_logging(all_authors, threshold=90)

    # Print the merge results
    print_merge_results(merge_log)

if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

=== Merged Names Results ===

Representative Name: Ho Chun Herbert Chang
Merged Names: Ho-Chun Herbert Chang

Representative Name: Zou Yang
Merged Names: Yang Zhou

Representative Name: Yan Jiang
Merged Names: Yanru Jiang

Representative Name: Matthew F Asher
Merged Names: Dr Matthew F Asher

Representative Name: Nicolò Alessandro Girardini
Merged Names: Nicolas Alessandro Girardini

Representative Name: Yifan Wang
Merged Names: Yifang Wang

Representative Name: Eduardo López
Merged Names: Eduardo LÃ³pez



In [4]:
# Q3

import os

# File paths for 2023 and 2024 (using the paths saved from previous code)
file_2023 = '/content/drive/MyDrive/ic2s2_2023_researchers.txt'
file_2024 = '/content/drive/MyDrive/IC2S2_2024_final_person_names_from_csv.txt'

# Check if the files exist
if os.path.exists(file_2023):
    print("The 2023 file exists:", file_2023)
else:
    print("The 2023 file does not exist. Please check the path:", file_2023)

if os.path.exists(file_2024):
    print("The 2024 file exists:", file_2024)
else:
    print("The 2024 file does not exist. Please check the path:", file_2024)

# Load the researcher list for 2023
with open(file_2023, 'r', encoding='utf-8') as f:
    names_2023 = f.read().splitlines()
set_2023 = set(names_2023)

# Load the researcher list for 2024
with open(file_2024, 'r', encoding='utf-8') as f:
    names_2024 = f.read().splitlines()
set_2024 = set(names_2024)

# Calculate the intersection (common names) between the two files
common_names = set_2023.intersection(set_2024)

print("Both IC2S2 2023 and 2024 covered", len(common_names), "names.")
print("Common names:")
for name in sorted(common_names):
    print(name)

# Save the results to a text file on Google Drive
output_txt = '/content/drive/MyDrive/IC2S2_Common_names.txt'
with open(output_txt, 'w', encoding='utf-8') as f:
    for name in sorted(common_names):
        f.write(name + "\n")
print("The results have been saved to a text file:", output_txt)

The 2023 file exists: /content/drive/MyDrive/ic2s2_2023_researchers.txt
The 2024 file exists: /content/drive/MyDrive/IC2S2_2024_final_person_names_from_csv.txt
Both IC2S2 2023 and 2024 covered 286 names.
Common names:
Aaron Clauset
Aaron Schein
Abdullah Almaatouq
Adam Stefkovics
Agnieszka Czaplicka
Akhil Arora
Akira Matsui
Albert-Laszlo Barabasi
Alessandro Flammini
Alessia Antelmi
Alex Pentland
Alexander J Gates
Alexandra Segerberg
Aliakbar Akbaritabar
Alina Herderich
Allison Koenecke
Almog Simchon
Amirhossein Nakhaei
Anastasia Karpova
Andrea Passerini
Andreas Bjerre-Nielsen
Andrew Renninger
Andrés Gvirtz
Angelita Repetto
Anna Seo Gyeong Choi
Anne-Marie Nussberger
Antonio Longa
Anubhab Das
Arianna Pera
Artem Kuriksha
Ashton Anderson
Attila Varga
Ayan-Yue Gupta
Babak Heydari
Baird Howland
Bao Tran Truong
Bedoor AlShebli
Belén C Saldías Fuentes
Bernardo Garcia Bulle Bueno
Bhargav Srinivasa Desikan
Brenda Curtis
Brendan O'Connor
Brennan Klein
Brian Uzzi
Briony Swire-Thompson
Brooke Foucau