# Organising Committee Indentifier

This is a first prototype an identifier for a conference organiser.

Steps:
1. Extracting information from the Call for Papers
2. Identifying organisers in OpenAlex
3. Locating conference or event on the main databases (such as DBLP, AIDA Dashboard, or [ConfIDent](https://www.confident-conference.org/index.php/Category:Event_Series))

In [3]:
from IPython.core.display import JSON
import json
import pandas as pd

In [4]:
import configparser
config = configparser.ConfigParser()
config.read('config.ini')
topsecret = config['DEFAULT']['api_key']

In [5]:
from openai import OpenAI
client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=topsecret,
)

# Reading CfP and creating prompt

In [32]:
SOURCE = 'cfps'
CFP = 'semantics2024.txt'
with open(f'{SOURCE}/{CFP}','r') as fr:
    call_for_papers = fr.read()
# print(call_for_papers)

In [33]:
text_prompt = f"""In this prompt, you will receive a Call for Papers of a scientific event. Your task is to parse it, and identify some crucial elements:

- the event name and its acronym;
- the location of the event
- the organisers of the event

<call_for_papers>
{call_for_papers}
</call_for_papers>"""
# print(text_prompt)

In [34]:
true = True
false = False
extra_headers={
"HTTP-Referer": "https://skm.kmi.open.ac.uk", # Optional. Site URL for rankings on openrouter.ai.
"X-Title": "SKM Team at OU", # Optional. Site title for rankings on openrouter.ai.
}
model="openai/gpt-4o"
messages= [
{ "role": "user", "content": text_prompt }
]
response_format={
"type": "json_schema",
"json_schema": {
  "name": "organising_committe_of_conference",
  "strict": true,
  "schema": {
    "type": "object",
    "properties": {
      "event_name": {
        "type": "string",
        "description": "Name of the workshop or conference. This identifies the extended name of the event."
      },
      "conference_series": {
        "type": "string",
        "description": "This refers to the name of a conference series, which is a collection of events that happen on a regular basis. It's usually similar to the event's name, but without the edition number or the year."
      },
      "event_acronym": {
        "type": "string",
        "description": "Acronym of the workshop or conference. This identifies the acronym name of the event."
      },
      "colocated_with": {
        "type": "string",
        "description": "If the name of the event is co-located with another big event. Otherwise if empty."
      },
      "location": {
        "type": "string",
        "description": "City or location name"
      },
      "organisers": {
        "type": "array",
        "items": {
                "type": "object",
                "properties": {
                    "organiser_name": {
                        "type": "string",
                        "description": "The organiser name."
                    },
                    "organiser_affiliation": {
                        "type": "string",
                        "description": "The institution (affiliation) of the organiser. This can be either a university or a company."
                    },
                    "organiser_country": {
                        "type": "string",
                        "description": "The institution country of the organiser. This information is not always available."
                    },
                    "track_name": {
                        "type": "string",
                        "description": "This identifies the main track in which the organiser is involved. A conference may have several tracks, whereas a workshop may have one single track. As default you shall use 'main'."
                    }
                },
            "required": ["organiser_name", "organiser_affiliation", "organiser_country", "track_name"],
            "additionalProperties": false
            },
        "description": "Identifies the name, affiliation (ideally including country) of the conference organisers and the name of the track they organise."
      }
    },
    "required": ["event_name", "event_acronym", "conference_series", "colocated_with", "location", "organisers"],
    "additionalProperties": false
  }
}
}

# Running the extraction

In [35]:
completion = client.chat.completions.create(extra_headers=extra_headers, 
                                            model=model, 
                                            messages=messages, 
                                            response_format=response_format)

# JSON(completion.choices[0].message.content)

In [36]:
result = json.loads(completion.choices[0].message.content)
JSON(result)

<IPython.core.display.JSON object>

# Find Organisers on OpenAlex

In [38]:
from pyalex import config
from pyalex import Authors, Institutions
import logging
from rapidfuzz import fuzz
from rapidfuzz.distance import Levenshtein

config.email = "angelo.salatino@open.ac.uk"
config.max_retries = 0
config.retry_backoff_factor = 0.1
config.retry_http_codes = [429, 500, 503]

In [47]:
def get_authors_info_from_openalex(organisers:list)->list:
    """
    
    This is a convoluted algorithm. 
    First it attempts a double filtering (institution + authorname), for a more precise outcome.
    However, as in many cases it fails (affiliations in CfP are not similar to the institution name in OpenAlex), 
    we simply retrieve author info based on their name, and try to find the correct authors within the returned pool.
    
    

    Parameters
    ----------
    organisers : list
        list of organisers.

    Returns
    -------
    list
        the same list of organisers with augmented information from OpenAlex.

    """

    ### First we attempt to located the organiser by finding their affiliation, and filtering them by affiliation
    ### Second attempt is made when the affiliation is not clear.
    
    
    DEBUG = False
    for organiser in organisers:
        
        organiser["openalex_name"] = ""
        organiser["openalex_page"] = ""
        organiser["orcid"] = ""
        organiser["affiliation_ror"] = ""
        
        find_author_with_less_info = False
        orga = {}
        if len(organiser["organiser_affiliation"]) > 0:
            # Search for the institution and then filtering
            insts = Institutions().search(organiser["organiser_affiliation"]).get()
            if len(insts) > 0:
                inst_id = insts[0]["id"].replace("https://openalex.org/", "")
        
                if "ror" in insts[0]["ids"]:
                    organiser["affiliation_ror"] = insts[0]["ids"]["ror"]
                
                # Search for the author within the institution
                auths = Authors().search(organiser["organiser_name"]).filter(affiliations={"institution":{"id": inst_id}}).get()
                if len(auths) > 0:        
                    if DEBUG: print(f"{len(auths)} search results found for the author")
                    orga = auths[0]
                else:
                    find_author_with_less_info = True
                    if DEBUG: print(f"For {organiser['organiser_name']} I could not find a record")
                
                    
            else:
                find_author_with_less_info = True
                if DEBUG: print(f"For {organiser['organiser_name']} I could not find a record of their institution")
        else:
            find_author_with_less_info = True
            if DEBUG: print(f"For {organiser['organiser_name']} there is no affiliation")
    
        # Search for authors without institution info
        if find_author_with_less_info:
            auths = Authors().search(organiser['organiser_name']).get()
            if len(auths) == 1:
                orga = auths[0]
            elif len(auths) == 0:
                if DEBUG: print(f"For {organiser['organiser_name']} I could not find a record, AGAIN")
            else:
                if DEBUG: print(f"Found multiple records for {organiser['organiser_name']}")
                new_auths = sorted(auths, key=lambda item: item['works_count'], reverse=True)
    
                # this algorithm makes sure we match the author with the most similar name
                max_similarity = 0
                final_position = -1
                for author_position, new_auth in enumerate(new_auths):
                    all_alternative_names = new_auth["display_name_alternatives"]
                    for alternative_name in all_alternative_names:
                        author_similarity = Levenshtein.normalized_similarity(alternative_name,organiser['organiser_name'])
                        if author_similarity > max_similarity:
                            if DEBUG: print(f"{alternative_name}; {author_position}; {author_similarity}")
                            max_similarity = author_similarity
                            final_position = author_position
    
                orga = new_auths[final_position]
    
        if len(orga) > 0:
            organiser["openalex_name"] = orga["display_name"]
            organiser["openalex_page"] = orga["id"]
            organiser["orcid"] = orga["orcid"]
            if len(organiser["organiser_affiliation"]) > 0:
                if organiser["affiliation_ror"] == "":
                    last_known_institutions = orga["last_known_institutions"]
                    max_similarity = 0
                    final_position = -1
                    for institution_position, last_known_institution in enumerate(last_known_institutions):
                        institute_name = last_known_institution["display_name"]
                        institution_similarity = fuzz.token_set_ratio(institute_name,organiser["organiser_affiliation"])
                        if institution_similarity > max_similarity:
                            if DEBUG: print(f"{institute_name}; {institution_position}; {institution_similarity}")
                            max_similarity = institution_similarity
                            final_position = institution_position
                    if max_similarity >= 40:        
                        organiser_institution_from_OA = last_known_institutions[final_position]
                        organiser["affiliation_ror"] = organiser_institution_from_OA["ror"]
    
    return organisers

result["organisers"] = get_authors_info_from_openalex(result["organisers"])

In [48]:
JSON(result["organisers"])

<IPython.core.display.JSON object>

In [16]:
JSON(result)

<IPython.core.display.JSON object>

In [49]:
organisers = pd.DataFrame.from_dict(result["organisers"])
organisers.rename(columns={"organiser_name": "Name",
                          "organiser_affiliation": "Affiliation",
                          "organiser_country": "Country",
                          "track_name": "Track",
                          "affiliation_ror": "ROR",
                          "openalex_page": "OpenAlex Profile",
                          "orcid": "ORCID",
                          "openalex_name": "OpenAlex Name"
                          })

Unnamed: 0,Name,Affiliation,Country,Track,OpenAlex Name,OpenAlex Profile,ORCID,ROR
0,Mehwish Alam,,,Research and Innovation,Mehwish Alam,https://openalex.org/A5009026163,https://orcid.org/0000-0002-7867-6612,
1,Femke Ongenae,,,Research and Innovation,Femke Ongenae,https://openalex.org/A5008111431,https://orcid.org/0000-0003-2529-5477,
2,Angelo Salatino,,,Research and Innovation,Angelo A. Salatino,https://openalex.org/A5074395069,https://orcid.org/0000-0002-4763-3943,


In [65]:
def check_series(x):
    for i in x:
        if len(i) > 0:
            return True
    return False

final_list_columns = organisers.apply(lambda x: check_series(x), axis=0)
organisers[final_list_columns[final_list_columns==True].index]


Unnamed: 0,organiser_name,track_name,openalex_name,openalex_page,orcid
0,Mehwish Alam,Research and Innovation,Mehwish Alam,https://openalex.org/A5009026163,https://orcid.org/0000-0002-7867-6612
1,Femke Ongenae,Research and Innovation,Femke Ongenae,https://openalex.org/A5008111431,https://orcid.org/0000-0003-2529-5477
2,Angelo Salatino,Research and Innovation,Angelo A. Salatino,https://openalex.org/A5074395069,https://orcid.org/0000-0002-4763-3943


# Find Event on major databases

In [13]:
from sentence_transformers import SentenceTransformer
import faiss  
import pickle
import urllib.parse

# import re
# regex = r'(\d+st)|(\d+nd)|(\d+rd)|(\d+th)'
# result = re.sub(regex, "", result["event_name"], 0, re.MULTILINE | re.IGNORECASE)

# Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode([result["conference_series"]])

  from tqdm.autonotebook import tqdm, trange


In [14]:
with open('DBLP.pickle', 'rb') as handle:
    dblp_confs = pickle.load(handle)

D, I = dblp_confs["index"].search(embeddings, k=1)
if D[0][0] <= 0.4:
    this_conf = dblp_confs["sentences"][I[0][0]]
    this_acronym = dblp_confs["confs"][this_conf]
    print(this_conf)
    print(this_acronym)
    print(f"https://dblp.org/streams/conf/{urllib.parse.quote(this_acronym, safe='')}")

International Semantic Web Conference (ISWC)
semweb
https://dblp.org/streams/conf/semweb


In [15]:
result["DBLP"]=dict()
result["DBLP"]["name"]= this_conf
result["DBLP"]["id"]  = this_acronym
result["DBLP"]["url"] = f"https://dblp.org/streams/conf/{urllib.parse.quote(this_acronym, safe='')}"

In [9]:
with open('AIDA.pickle', 'rb') as handle:
    aida_confs = pickle.load(handle)

D, I = aida_confs["index"].search(embeddings, k=1)
if D[0][0] <= 0.4:
    this_conf_aida = aida_confs["sentences"][I[0][0]]
    this_acronym_aida = aida_confs["confs"][this_conf_aida]
    print(this_conf_aida)
    print(this_acronym_aida)
    print(f"https://w3id.org/aida/dashboard/cs/conference/{urllib.parse.quote(this_conf_aida, safe='')}")

NameError: name 'pickle' is not defined

In [17]:
result["AIDA"]=dict()
result["AIDA"]["name"]= this_conf_aida
result["AIDA"]["id"]  = this_acronym_aida
result["AIDA"]["url"] = f"https://w3id.org/aida/dashboard/cs/conference/{urllib.parse.quote(this_conf_aida, safe='')}"

In [19]:
with open('ConfIDent.pickle', 'rb') as handle:
    confident_confs = pickle.load(handle)

D, I = confident_confs["index"].search(embeddings, k=1)
if D[0][0] <= 0.4:
    this_conf_confident = confident_confs["sentences"][I[0][0]]
    this_id_confident = confident_confs["confs"][this_conf_confident]
    print(this_conf_confident)
    print(this_id_confident)
    print(f"https://www.confident-conference.org/index.php/{urllib.parse.quote(this_id_confident, safe='')}")

International Semantic Web Conference
Event Series:ISWC
https://www.confident-conference.org/index.php/Event%20Series%3AISWC


In [20]:
result["ConfIDent"]=dict()
result["ConfIDent"]["name"]= this_conf_confident
result["ConfIDent"]["id"]  = this_id_confident
result["ConfIDent"]["url"] = f"https://www.confident-conference.org/index.php/{urllib.parse.quote(this_id_confident, safe='')}"

In [21]:
JSON(result)

<IPython.core.display.JSON object>

In [22]:
with open("processed_cfps/iswc2025.json","w") as fw:
    json.dump(result,fw)

In [None]:
json.dumps(result)

In [4]:
with open("processed_cfps/iswc2025.json","r") as fr:
    result = json.load(fr)

In [5]:
JSON(result)

<IPython.core.display.JSON object>