# Assignment 1
### 02467 Computational Social Science Group 6

## Part 1: Web-scraping
### _Exercise: Web-scraping the list of participants to the International Conference in Computational Social Science_

In [None]:
# Week 1 - Web Scraping Q1

from google.colab import drive
import requests
from bs4 import BeautifulSoup
import re

# Mount Google Drive
drive.mount('/content/drive')

def clean_name(name):
    """Normalise a raw text fragment into a clean person name.

    ``<u>``/``</u>`` markers and commas are removed, every run of whitespace
    (including newlines, tabs and carriage returns) is collapsed to a single
    space, and the result is stripped.

    Args:
        name: Raw text that is expected to hold one person's name.

    Returns:
        The cleaned name, or ``None`` when the cleaned text is shorter than
        two characters or contains a digit.
    """
    # Drop the <u> markers first so whitespace they enclosed is normalised too.
    name = re.sub(r'</?u>', '', name)
    name = name.replace(',', '')

    # Collapse line breaks and tabs into a space instead of deleting them, so
    # "Jane\nDoe" becomes "Jane Doe" rather than "JaneDoe".
    name = re.sub(r'\s+', ' ', name).strip()

    # Exclude names that are too short or contain digits.
    if len(name) < 2 or re.search(r'\d', name):
        return None

    return name

def extract_names_from_text(text):
    """Split a comma-separated author string into cleaned names.

    Each comma-delimited piece is passed through ``clean_name``; pieces for
    which it returns nothing are dropped. A text without commas is treated as
    a single candidate.

    Args:
        text: Author text, e.g. the content of an ``<i>`` tag.

    Returns:
        List of cleaned names in their original order.
    """
    # str.split(',') yields [text] when no comma is present, so a single
    # pass covers both the multi-author and the single-author case.
    cleaned_pieces = (clean_name(piece) for piece in text.split(','))
    return [cleaned for cleaned in cleaned_pieces if cleaned]

def parse_html_for_names(html_content):
    """Collect the unique researcher names found in an HTML document.

    Two sources are scanned:
      * the first ``<i>`` tag inside every ``<li>`` element, whose text is
        split into names with ``extract_names_from_text``;
      * every text node containing ``Chair:``, from which the text after the
        marker is cleaned with ``clean_name``.

    Args:
        html_content: HTML source as a string.

    Returns:
        Alphabetically sorted list of distinct names.
    """
    document = BeautifulSoup(html_content, 'html.parser')
    unique_names = set()  # a set removes duplicates as names are added

    # Presentation entries: authors are taken from the first <i> of each <li>.
    for item in document.find_all('li'):
        if not item.get_text():
            continue
        italic = item.find('i')
        if not italic:
            continue
        unique_names.update(extract_names_from_text(italic.get_text()))

    # Session chairs.
    # NOTE: the search is case-insensitive, but only strings containing the
    # exact spelling 'Chair:' are used below.
    chair_regex = re.compile(r'Chair:', re.IGNORECASE)
    for node in document.find_all(string=chair_regex):
        stripped = node.strip()
        if 'Chair:' not in stripped:
            continue
        chair = clean_name(stripped.split('Chair:')[1])
        if chair:
            unique_names.add(chair)

    return sorted(unique_names)

def main():
    """Extract researcher names from the saved HTML file and store them.

    Reads the HTML file at the hard-coded Google Drive path, extracts the
    names with ``parse_html_for_names`` and writes them, one per line, to a
    text file on Google Drive. Prints the number of names and the output
    location.
    """
    # Set HTML file path (Google Drive path)
    file_path = '/content/drive/MyDrive/IC2S2_2023.html'

    # Read HTML file
    with open(file_path, 'r', encoding='utf-8') as f:
        html_content = f.read()

    # Extract names
    names = parse_html_for_names(html_content)

    # Save results to file (one name per line)
    output_path = '/content/drive/MyDrive/ic2s2_2023_researchers.txt'
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(name + '\n' for name in names)

    print(f"A total of {len(names)} unique researcher names have been extracted.")
    # Only the text file is written, so only that path is reported.
    print(f"Results have been saved to {output_path}.")

if __name__ == "__main__":
    main()

In [None]:
# Q2

!pip install fuzzywuzzy[speedup]

import pandas as pd
import re
from fuzzywuzzy import fuzz
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

def extract_person_from_candidate(candidate):
    """
    Extract the person's name from one entry of the form
    "Person Name, Other Information".

    Parenthesised text (e.g. an affiliation) is removed first, then the text
    before the first comma is returned, stripped of surrounding whitespace.

    Example:
      "Chris Kempes, ..."                      -> "Chris Kempes"
      "Chris Kempes (Santa Fe Institute)"      -> "Chris Kempes"
      "Chris Kempes (Santa Fe Institute, NM)"  -> "Chris Kempes"
    """
    # Remove parentheses before cutting at the comma: a comma inside the
    # parentheses would otherwise split the affiliation in half and leave an
    # unmatched "(" in the name.
    person = re.sub(r'\s*\(.*?\)', '', candidate)
    # partition() returns the whole text when there is no comma.
    return person.partition(',')[0].strip()

def extract_names_from_cell(cell):
    """
    Pull person names out of one cell formatted as
    "Person Name, Other Information; Person Name, Other Information; ...".

    The cell is split on semicolons, blank pieces are skipped, and
    extract_person_from_candidate is applied to each remaining piece.
    Only names made of two or more words are returned.
    """
    pieces = (piece.strip() for piece in cell.split(';'))
    people = (extract_person_from_candidate(piece) for piece in pieces if piece)
    return [person for person in people if len(person.split()) >= 2]

def extract_names_from_df(df, column_name):
    """
    Gather all candidate person names from one column of a DataFrame
    (e.g. 'Poster authors' or 'Presentation authors').

    Every non-missing cell of the column is passed to
    extract_names_from_cell(). If the column is absent, a message listing the
    available columns is printed and an empty list is returned.
    """
    if column_name not in df.columns:
        print(f"Column '{column_name}' does not exist. Available columns: {df.columns.tolist()}")
        return []

    collected = []
    for cell in df[column_name].dropna():
        collected += extract_names_from_cell(cell)
    return collected

def cluster_names(names, threshold=90):
    """
    Greedily group names whose fuzzywuzzy token_sort_ratio against the first
    name of a cluster is at or above the threshold, treating each group as
    one individual. Within each cluster, the shortest name is selected as the
    representative.

    The distinct names are processed in sorted order so that the clusters and
    representatives are identical on every run.

    Args:
      names: Iterable of name strings (duplicates allowed).
      threshold: Minimum token_sort_ratio score (0-100) for merging.

    Returns:
      representative_names: Final list of unique person names
      clusters: List of names for each cluster (for debugging)
    """
    # Iterating a set of strings has no stable order between interpreter
    # runs, and the greedy grouping below depends on that order, so sort.
    names_list = sorted(set(names))
    clusters = []
    used = set()
    for i, name in enumerate(names_list):
        if name in used:
            continue
        cluster = [name]
        used.add(name)
        # Only later names need checking; earlier ones are already assigned.
        for other in names_list[i + 1:]:
            if other in used:
                continue
            if fuzz.token_sort_ratio(name, other) >= threshold:
                cluster.append(other)
                used.add(other)
        clusters.append(cluster)
    # Each cluster is in sorted order and min() keeps the first of equally
    # short names, so ties resolve to the alphabetically first name.
    representative_names = [min(cluster, key=len) for cluster in clusters]
    return representative_names, clusters

def main():
    """Build the de-duplicated author list from the three CSV files.

    Reads the poster, lightning-talk and oral-panel CSVs from Google Drive,
    extracts candidate names from their author columns, merges near-identical
    names with cluster_names (threshold 90) and writes the sorted
    representatives, one per line, to a text file. Counts are printed along
    the way.
    """
    # (CSV path, author column) for each input file.
    sources = [
        ('/content/drive/MyDrive/IC2S2_2024_posters.csv', 'Poster authors'),
        ('/content/drive/MyDrive/IC2S2_2024_lightning_talks.csv', 'Presentation authors'),
        ('/content/drive/MyDrive/IC2S2_2024_oral_panels.csv', 'Presentation authors'),
    ]

    # Load every file before extracting anything.
    frames = [pd.read_csv(csv_path) for csv_path, _ in sources]

    # Candidate names from all three files, in file order.
    all_authors = []
    for frame, (_, author_column) in zip(frames, sources):
        all_authors.extend(extract_names_from_df(frame, author_column))
    print("Total extracted candidate count (including duplicates):", len(all_authors))

    # Exact duplicates are dropped before the fuzzy pass.
    unique_authors = list(set(all_authors))
    print("Unique candidate count after removing duplicates:", len(unique_authors))

    # Fuzzy matching groups slightly variant spellings.
    representatives, _clusters = cluster_names(unique_authors, threshold=90)
    final_names = sorted(representatives)

    output_path = '/content/drive/MyDrive/IC2S2_2024_final_person_names_from_csv.txt'
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(name + "\n" for name in final_names)

    print("Final unique person name count:", len(final_names))
    print("Result file saved at:", output_path)

if __name__ == "__main__":
    main()

In [None]:
# Check Aggregated Names with fuzzywuzzy

import pandas as pd
import re
from fuzzywuzzy import fuzz
from collections import defaultdict
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

def extract_person_from_candidate(candidate):
    """
    Return the person's name from an entry formatted as
    "Person Name, Other Information".

    Any text in parentheses (such as an affiliation) is removed first; the
    part before the first comma is then returned without surrounding
    whitespace. Removing the parentheses first keeps a comma inside them,
    as in "Name (Institute, City)", from cutting the entry mid-parenthesis.
    """
    without_parens = re.sub(r'\s*\(.*?\)', '', candidate)
    # partition() leaves the text intact when it contains no comma.
    name_part, _, _ = without_parens.partition(',')
    return name_part.strip()

def extract_names_from_cell(cell):
    """
    Extract person names from a cell of the form
    "Person Name, Other Information; Person Name, Other Information; ...".

    The text is split on semicolons; each non-blank part goes through
    extract_person_from_candidate, and results with fewer than two words are
    discarded.
    """
    found = []
    for raw_part in cell.split(';'):
        entry = raw_part.strip()
        if entry:
            person = extract_person_from_candidate(entry)
            # A valid person name is assumed to have at least two words.
            if len(person.split()) > 1:
                found.append(person)
    return found

def extract_names_from_df(df, column_name):
    """
    Collect candidate person names from the given column of a DataFrame
    (e.g. 'Poster authors' or 'Presentation authors').

    extract_names_from_cell() is applied to each non-missing cell. When the
    column is not present, the available columns are printed and the result
    is an empty list.
    """
    column_present = column_name in df.columns
    if not column_present:
        print(f"Column '{column_name}' does not exist. Available columns: {df.columns.tolist()}")
        return []

    return [
        person
        for cell in df[column_name].dropna()
        for person in extract_names_from_cell(cell)
    ]

def cluster_names_with_logging(names, threshold=90):
    """
    Greedily group names whose fuzzywuzzy token_sort_ratio against the first
    name of a cluster is at or above the threshold, treating each group as
    one individual. The shortest name of each cluster is its representative,
    and the other names of the cluster are logged under it.

    The distinct names are processed in sorted order so that the clusters,
    representatives and log are identical on every run.

    Args:
      names: Iterable of name strings (duplicates allowed).
      threshold: Minimum token_sort_ratio score (0-100) for merging.

    Returns:
      representative_names: Final list of unique person names.
      clusters: List of names within each cluster (for debugging).
      merge_log: Dictionary mapping representative names to the merged names
        (only representatives that absorbed at least one other name appear).
    """
    # Iterating a set of strings has no stable order between interpreter
    # runs, and the greedy grouping below depends on that order, so sort.
    names_list = sorted(set(names))
    clusters = []
    representative_names = []
    used = set()
    merge_log = defaultdict(list)  # Log for merged names

    for i, name in enumerate(names_list):
        if name in used:
            continue
        cluster = [name]
        used.add(name)
        # Only later names need checking; earlier ones are already assigned.
        for other in names_list[i + 1:]:
            if other in used:
                continue
            if fuzz.token_sort_ratio(name, other) >= threshold:
                cluster.append(other)
                used.add(other)
        clusters.append(cluster)

        # Shortest name represents the cluster; the cluster is in sorted
        # order, so ties resolve to the alphabetically first name.
        representative_name = min(cluster, key=len)
        representative_names.append(representative_name)
        for merged_name in cluster:
            if merged_name != representative_name:
                merge_log[representative_name].append(merged_name)

    return representative_names, clusters, merge_log

def print_merge_results(merge_log):
    """Print each representative name together with the names merged into it.

    Entries whose list of merged names is empty are skipped. The report is
    framed by a header and a footer line.
    """
    print("\n=== Merged Names Results ===")
    non_empty = ((rep, merged) for rep, merged in merge_log.items() if merged)
    for representative, merged_names in non_empty:
        print(f"\nRepresentative Name: {representative}")
        print(f"Merged Names: {', '.join(merged_names)}")
    print("\n============================")

def main():
    """Show which name variants the fuzzy matching merges together.

    Reads the poster, lightning-talk and oral-panel CSVs from Google Drive,
    extracts candidate names from their author columns, clusters them with
    cluster_names_with_logging (threshold 90) and prints the merge log.
    """
    # CSV path -> name of its author column (paths in Google Drive).
    author_columns = {
        '/content/drive/MyDrive/IC2S2_2024_posters.csv': 'Poster authors',
        '/content/drive/MyDrive/IC2S2_2024_lightning_talks.csv': 'Presentation authors',
        '/content/drive/MyDrive/IC2S2_2024_oral_panels.csv': 'Presentation authors',
    }

    # Read all CSV files first, then extract names file by file.
    frames = {csv_path: pd.read_csv(csv_path) for csv_path in author_columns}
    all_authors = [
        author
        for csv_path, column in author_columns.items()
        for author in extract_names_from_df(frames[csv_path], column)
    ]

    # Duplicates are removed inside the clustering step, which also records
    # which names were merged under which representative.
    _final_names, _clusters, merge_log = cluster_names_with_logging(all_authors, threshold=90)

    print_merge_results(merge_log)

if __name__ == "__main__":
    main()

In [None]:
# Q3

import os

# Q3 script: compare the 2023 and 2024 name lists and report the names that
# appear in both. Names are compared as exact strings (set intersection).

# File paths for 2023 and 2024 (using the paths saved from previous code)
file_2023 = '/content/drive/MyDrive/ic2s2_2023_researchers.txt'
file_2024 = '/content/drive/MyDrive/IC2S2_2024_final_person_names_from_csv.txt'

# Check if the files exist
# NOTE: this check only prints a message; the open() calls below run either
# way and will raise if a file is missing.
if os.path.exists(file_2023):
    print("The 2023 file exists:", file_2023)
else:
    print("The 2023 file does not exist. Please check the path:", file_2023)

if os.path.exists(file_2024):
    print("The 2024 file exists:", file_2024)
else:
    print("The 2024 file does not exist. Please check the path:", file_2024)

# Load the researcher list for 2023 (one name per line)
with open(file_2023, 'r', encoding='utf-8') as f:
    names_2023 = f.read().splitlines()
set_2023 = set(names_2023)

# Load the researcher list for 2024 (one name per line)
with open(file_2024, 'r', encoding='utf-8') as f:
    names_2024 = f.read().splitlines()
set_2024 = set(names_2024)

# Calculate the intersection (common names) between the two files
common_names = set_2023.intersection(set_2024)

# Report the count, then the names in alphabetical order
print("Both IC2S2 2023 and 2024 covered", len(common_names), "names.")
print("Common names:")
for name in sorted(common_names):
    print(name)

# Save the results to a text file on Google Drive (one name per line, sorted)
output_txt = '/content/drive/MyDrive/IC2S2_Common_names.txt'
with open(output_txt, 'w', encoding='utf-8') as f:
    for name in sorted(common_names):
        f.write(name + "\n")
print("The results have been saved to a text file:", output_txt)

### _How many unique researchers do you get?_
#### We got 1484 unique researchers for our answer.

### _Explain the process you followed to web-scrape the page. Which choices did you make to accurately retrieve as many names as possible? Which strategies did you use to assess the quality of your final list? Explain your reasoning and your choices (answer in max 150 words)._
#### We parsed the page with BeautifulSoup to locate the author and session-chair entries. We cleaned each name by removing special characters, commas, and unnecessary whitespace, and excluded names that were too short or contained digits. For session chairs, whose names appear as "Chair: (name)", we removed the "Chair:" prefix. Lastly, we collected the cleaned names into a list, removed duplicates, and sorted the list by name to get our final answer.


###[edited_Jiwon Heo]
To extract researcher names, we utilized a systematic process combining HTML parsing and CSV file analysis. Using `BeautifulSoup`, we parsed the HTML to locate relevant tags containing names. A cleaning function removed special characters, unnecessary whitespace, and invalid entries (e.g., names with numbers or too short). For session chairs, we specifically removed the word "Chair" from entries like "Chair: (name)." From CSV files, names were extracted by splitting author columns based on delimiters such as commas and semicolons.

To ensure accuracy, fuzzy matching (`fuzzywuzzy`) was applied to group similar names and resolve variations caused by typos or format differences. The cleaned names were deduplicated and sorted into a final list. Quality was assessed by tracking unique name counts at each stage and manually reviewing clusters of similar names. This approach ensured comprehensive name retrieval while maintaining high data quality.

## Part 2: Ready Made vs Custom Made Data

### Centola
#### Pros:
#### - Flexibility, data collection in real time can be modified
#### - Custom made data can be designed to address specific questions
#### Cons:
#### - Social network is something complex, researchers might accidentally introduce bias
#### - Might not fully represent real world conditions


### Nicolaides
#### Pros:
#### - Saves time and cost
#### - Larger sample size
#### Cons:
#### - Quality of data might not be the best
#### - Researchers do not have control over how the data was made

### _How do you think these differences can influence the interpretation of the results in each study?_
#### Centola's custom-made data allows for controlled experiments with specific manipulations, while Nicolaides' ready-made data provides a real-world view. However, Centola's results might not generalise to the wider population, while Nicolaides' study can only establish correlations, which leaves its causal claims uncertain.

## Part 3: Gathering Research Articles using the OpenAlex API

In [None]:
import requests
import pandas as pd
from joblib import Parallel, delayed

# Author table; the fetch step below reads its 'OpenAlex ID',
# 'Works API URL' and 'Works Count' columns.
df = pd.read_csv('file02.csv')
# OpenAlex API root (not referenced by the code visible in this cell).
base_url = "https://api.openalex.org/"
# Accumulators filled by the export loop: one dict per kept work.
papers = []
abstracts = []

# OpenAlex concept IDs counted as social-science disciplines by filtering().
socialscience = {
    "Sociology": "https://openalex.org/C144024400",
    "Psychology": "https://openalex.org/C15744967",
    "Economics": "https://openalex.org/C162324750",
    "Political Science": "https://openalex.org/C17744445"
}

# OpenAlex concept IDs counted as quantitative disciplines by filtering().
quantitative = {
    "Mathematics": "https://openalex.org/C33923547",
    "Physics": "https://openalex.org/C121332964",
    "Computer Science": "https://openalex.org/C41008148"
}

def getwork(works_api_url, per_page=200, timeout=30):
    """Download all works behind a works URL using cursor pagination.

    Pages are requested until the response carries no ``next_cursor``.
    Fetching is best-effort: if a request fails or returns a non-200 status,
    a message is printed and the works collected so far are returned.

    Args:
        works_api_url: Works endpoint URL. The paging parameters are appended
            with ``&``, so the URL presumably already has a query string.
        per_page: Number of results requested per page.
        timeout: Seconds to wait for each HTTP response, so that a stalled
            connection cannot block the caller indefinitely.

    Returns:
        List of work dicts taken from each page's ``results``.
    """
    works = []
    cursor = "*"  # initial cursor
    while True:
        url = f"{works_api_url}&per-page={per_page}&cursor={cursor}"
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Request failed for {works_api_url}: {exc}; results may be incomplete.")
            break
        if response.status_code != 200:
            # Make the truncation visible instead of stopping silently.
            print(f"HTTP {response.status_code} for {works_api_url}; results may be incomplete.")
            break

        data = response.json()
        works.extend(data.get('results', []))

        # next cursor for pagination
        next_cursor = data.get('meta', {}).get('next_cursor')
        if not next_cursor:
            break
        cursor = next_cursor
    return works

def filtering(works):
    """Keep only the works that pass the citation, author and concept filters.

    A work is kept when it has more than 10 citations, fewer than 10
    authorships, and its concepts include at least one ID from
    ``socialscience`` and at least one ID from ``quantitative``.

    Args:
        works: Iterable of work dicts.

    Returns:
        List of the works that satisfy all conditions, in input order.
    """
    def is_relevant(work):
        # more than 10 citations
        if work.get('cited_by_count', 0) <= 10:
            return False
        # fewer than 10 authors
        if len(work.get('authorships', [])) >= 10:
            return False

        concept_ids = [concept.get('id') for concept in work.get('concepts', [])]
        touches_social = any(cid in socialscience.values() for cid in concept_ids)
        touches_quant = any(cid in quantitative.values() for cid in concept_ids)
        # must intersect a social-science AND a quantitative discipline
        return touches_social and touches_quant

    return [work for work in works if is_relevant(work)]

def filtering2(id, worksurl, count):
    """Fetch and filter one author's works if the work count is in range.

    Args:
        id: Author identifier (not used by this function).
        worksurl: URL passed to ``getwork`` to download the author's works.
        count: The author's number of works.

    Returns:
        The works kept by ``filtering``, or an empty list when ``count`` is
        not between 5 and 5000 inclusive.
    """
    # Only authors with between 5 and 5000 works are processed.
    if not (5 <= count <= 5000):
        return []
    return filtering(getwork(worksurl))

def extraction(work):
    """Reduce a work dict to the fields stored in the output tables.

    Args:
        work: Work dict.

    Returns:
        Dict with the work's id, publication year, citation count, list of
        author ids (one per authorship, ``None`` where missing), title,
        abstract inverted index, referenced works, cited-by URL and related
        works. Missing list fields default to ``[]``, other missing fields to
        ``None``.
    """
    author_ids = []
    for authorship in work.get('authorships', []):
        author = authorship.get('author', {})
        author_ids.append(author.get('id'))

    record = {
        field: work.get(field)
        for field in ('id', 'publication_year', 'cited_by_count')
    }
    record['author_ids'] = author_ids
    record['title'] = work.get('title')
    record['abstract_inverted_index'] = work.get('abstract_inverted_index')
    record['referenced_works'] = work.get('referenced_works', [])
    record['cited_by_api_url'] = work.get('cited_by_api_url')
    record['related_works'] = work.get('related_works', [])
    return record

# Parallelize fetching and filtering works using joblib: one task per author
# row, each returning that author's list of filtered works.
allfiltered = Parallel(n_jobs=-1)(
    delayed(filtering2)(row['OpenAlex ID'], row['Works API URL'], row['Works Count'])
    for _, row in df.iterrows()
)

# A work co-authored by several listed authors is returned once per author,
# so remember the ids already stored and keep each work only once.
seen_work_ids = set()

# The loop variable must not be called `all`: that would shadow the builtin
# for the rest of the notebook session.
for author_works in allfiltered:
    for work in author_works:
        details = extraction(work)

        work_id = details['id']
        if work_id is not None:
            if work_id in seen_work_ids:
                continue
            seen_work_ids.add(work_id)

        # Bibliographic fields go to the papers table ...
        papers.append({
            'id': details['id'],
            'publication_year': details['publication_year'],
            'cited_by_count': details['cited_by_count'],
            'author_ids': details['author_ids'],
            'referenced_works': details['referenced_works'],
            'cited_by_api_url': details['cited_by_api_url'],
            'related_works': details['related_works']
        })

        # ... and the text fields to the abstracts table, linked by id.
        abstracts.append({
            'id': details['id'],
            'title': details['title'],
            'abstract_inverted_index': details['abstract_inverted_index']
        })

papers_df = pd.DataFrame(papers)
abstracts_df = pd.DataFrame(abstracts)

papers_df.to_csv('papers.csv', index=False)
abstracts_df.to_csv('abstracts.csv', index=False)
print("Data saved.")

### Data Overview and Reflection questions:

### _Dataset summary. How many works are listed in your IC2S2 papers dataframe? How many unique researchers have co-authored these works?_
#### Number of works: 11230
#### Number of unique researchers: 15199

### _Efficiency in code. Describe the strategies you implemented to make your code more efficient. How did your approach affect your code's execution time?_
#### As suggested, I used joblib's parallel function to handle multiple requests. I also implemented the filters required (work count between 5-5000, works with more than 10 citations, works authored by fewer than 10 individuals and works related to computational social science). This allowed the code to be executed at a faster rate.

### _Filtering Criteria and Dataset Relevance Reflect on the rationale behind setting specific thresholds. How do these filtering criteria contribute to the relevance of the dataset you compiled?_
#### Work count filter: requiring at least 5 works lets us focus on established authors, while capping the count at 5000 works removes authors with so many works that they would otherwise add unwanted "noise"
#### More than 10 citations: citations allow us to judge the influence of a work, and this filter keeps only works that have had some influence in the academic field
#### Less than 10 authors per work: <10 authors would suggest that the work was collaborative and focused, as too many authors may cause a paper to have too many ideas and may not reflect clear insights
#### Relevance to Computational Social Science: as the course is related to computational social science, it is important that the works we find are related to it

### _Do you believe any aspects of Computational Social Science research might be underrepresented or overrepresented as a result of these choices?_
#### Yes, I believe that there may be some sort of underrepresentation due to such filters.
#### Firstly, there may be newly published works that currently have 10 or fewer citations but are still highly relevant to the field. The filter excludes such works, as well as works on highly niche topics, which tend to have lower citation counts. Secondly, keeping only works with fewer than 10 authors could exclude works that were done by a large team. A complex topic may require a large team to collaborate on it; such works would still be relevant to the field but are excluded by this filter.