In [None]:
import pandas as pd

# Load the ROR institution table.
# Every column is read as text and empty cells are kept as empty strings, so that
# writing the file back below does not change any value other than the country
# label (no type inference, and literal strings such as "NA" are not turned into
# missing values).
file_path = "matching_data/ror_id.csv"
df = pd.read_csv(file_path, dtype=str, keep_default_na=False)

# Normalise the country label: replace "Russian Federation" with "Russia"
# in the "Country Name" column (exact, whole-cell matches only).
df["Country Name"] = df["Country Name"].replace("Russian Federation", "Russia")

# Overwrite the source CSV in place with the normalised labels
df.to_csv(file_path, index=False)

print("Preprocessing complete: 'Russian Federation' replaced with 'Russia'.")


Preprocessing complete: 'Russia' replaced with 'Russian Federation'.


In [1]:
import pandas as pd

# Load the Scopus institution table.
# Every column is read as text and empty cells are kept as empty strings, so that
# writing the file back below does not change any value other than the country
# label (no type inference on ID columns, and literal strings such as "NA" are
# not turned into missing values).
file_path = "matching_data/scopus_id.csv"
df = pd.read_csv(file_path, dtype=str, keep_default_na=False)

# Normalise the country label: replace "Russian Federation" with "Russia"
# in the "Country Name" column (exact, whole-cell matches only).
df["Country Name"] = df["Country Name"].replace("Russian Federation", "Russia")

# Overwrite the source CSV in place with the normalised labels
df.to_csv(file_path, index=False)

print("Preprocessing complete: 'Russian Federation' replaced with 'Russia'.")


Preprocessing complete: 'Russian Federation' replaced with 'Russia'.


Method 1, exact matching only

In [2]:
import csv

# Step 1: Index the ROR table as {country name: {institution name: ROR ID}}
ror_map = {}

with open('/home/rubiscol/metadata-extraction/matching_data/ror_id.csv', mode='r', encoding='utf-8') as ror_file:
    for row in csv.DictReader(ror_file):
        country = row['Country Name']
        institution = row['Institution Name']
        ror_id = row['ROR ID']

        # setdefault creates the per-country dict the first time a country is seen;
        # a repeated (country, institution) pair keeps the last ROR ID read.
        ror_map.setdefault(country, {})[institution] = ror_id

# Step 2: Walk the Scopus table and attach a ROR ID wherever both the country and
# the primary organisation name match an indexed ROR entry exactly.
fieldnames = ['Scopus ID', 'Primary Org Name', 'Country Name', 'ROR ID']
matched_data = []

with open('/home/rubiscol/metadata-extraction/matching_data/scopus_id.csv', mode='r', encoding='utf-8') as scopus_file:
    for row in csv.DictReader(scopus_file):
        country = row['Country Name']
        primary_org = row['Primary Org Name']
        scopus_id = row['Scopus ID']

        # Unknown country or unknown organisation both yield None
        ror_id = ror_map.get(country, {}).get(primary_org)

        matched_data.append(dict(zip(fieldnames, (scopus_id, primary_org, country, ror_id))))

# Step 3: Write every Scopus row (matched or not) to the results CSV
with open('/home/rubiscol/metadata-extraction/matching_data/matched_results.csv', mode='w', encoding='utf-8', newline='') as output_file:
    writer = csv.DictWriter(output_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(matched_data)

print("Matching completed. Results saved to 'matching_data/matched_results.csv'.")

Matching completed. Results saved to 'matching_data/matched_results.csv'.


Method 2, exact matching combined with an Elasticsearch-backed ROR API search for the rows left unmatched

In [5]:
import csv
from tqdm import tqdm  # Import tqdm for progress bars

# Step 1: Read ror_id.csv and build a lookup of {country name: {institution name: ROR ID}}
ror_map = {}

with open('matching_data/ror_id.csv', mode='r', encoding='utf-8') as ror_file:
    ror_reader = csv.DictReader(ror_file)
    for row in tqdm(ror_reader, desc="Reading ROR data"):  # Add progress bar
        country = row['Country Name']
        institution = row['Institution Name']
        ror_id = row['ROR ID']

        if country not in ror_map:
            ror_map[country] = {}
        # A repeated (country, institution) pair keeps the last ROR ID read
        ror_map[country][institution] = ror_id

# Step 2: Read scopus_id.csv and match ROR IDs
matched_data = []

with open('matching_data/scopus_id.csv', mode='r', encoding='utf-8') as scopus_file:
    scopus_reader = csv.DictReader(scopus_file)
    for row in tqdm(scopus_reader, desc="Matching Scopus data"):  # Add progress bar
        country = row['Country Name']
        primary_org = row['Primary Org Name']
        scopus_id = row['Scopus ID']

        # Only the exact (country, name) lookup is done in this cell. Rows without
        # an exact match keep ROR ID = None, which the CSV writer emits as an empty
        # field; no fallback search is implemented here.
        ror_id = None
        if country in ror_map and primary_org in ror_map[country]:
            ror_id = ror_map[country][primary_org]

        matched_data.append({
            'Scopus ID': scopus_id,
            'Primary Org Name': primary_org,
            'Country Name': country,
            'ROR ID': ror_id
        })

# Step 3: Save the matched data to a new CSV file.
# The path is held in one variable so the completion message below always names
# the file that was actually written.
output_path = 'matching_data/matched_results.csv'

with open(output_path, mode='w', encoding='utf-8', newline='') as output_file:
    fieldnames = ['Scopus ID', 'Primary Org Name', 'Country Name', 'ROR ID']
    writer = csv.DictWriter(output_file, fieldnames=fieldnames)

    writer.writeheader()
    for row in tqdm(matched_data, desc="Writing results"):  # Add progress bar
        writer.writerow(row)

print(f"Matching completed. Results saved to '{output_path}'.")

Reading ROR data: 225636it [00:00, 361624.02it/s]
Matching Scopus data: 13365it [00:00, 274127.46it/s]
Writing results: 100%|██████████| 13365/13365 [00:00<00:00, 281810.37it/s]

Matching completed. Results saved to 'matching_data/matched_results_elastic.csv'.





In [None]:
import pandas as pd
import requests
import urllib.parse
import time
from tqdm import tqdm  # Import progress bar

# Configuration for the ROR lookup pass
CSV_FILE = "/home/rubiscol/metadata-extraction/matching_data/matched_results.csv"
# NOTE(review): this is the "dev" host — confirm whether the production ROR API should be used instead.
ROR_API_ENDPOINT = "https://api.dev.ror.org/organizations"
# Characters that are backslash-escaped in a name before it is sent as a query
ES_RESERVED_CHARS = list('+-&|!(){}[]^"~*?:\\/')
RATE_LIMIT = 1800  # Number of API calls after which the throttle check runs
TIME_WINDOW = 5 * 60  # Throttle window: 5 minutes, in seconds


def search_institution(name, timeout=30):
    """Search the ROR API for an institution by name and return the top hit's ID.

    Args:
        name: Institution name to search for.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error instead of blocking indefinitely.

    Returns:
        The ``id`` field of the first item in the API response, or ``None``
        when ``name`` is missing/blank or the response reports no results.
    """
    # A blank or non-string name (e.g. NaN read from an empty CSV cell) cannot be
    # searched: an empty query does not identify any particular institution, so
    # taking its first hit would record an unrelated ID.
    if not isinstance(name, str) or not name.strip():
        return None

    # Escape every reserved character in a single pass over the name. Escaping one
    # character at a time with str.replace re-escapes the backslashes inserted for
    # earlier characters, because the backslash itself is in the reserved list.
    escaped = "".join("\\" + ch if ch in ES_RESERVED_CHARS else ch for ch in name)

    # requests URL-encodes the query parameters itself
    response = requests.get(
        ROR_API_ENDPOINT, params={'query': escaped}, timeout=timeout
    ).json()

    # NOTE(review): the first search hit is accepted without any further check —
    # confirm this is precise enough for the matching task.
    if response.get('number_of_results', 0) > 0:
        return response['items'][0]['id']
    return None

# Load the exact-match results; rows whose ROR ID is still missing are looked up below
df = pd.read_csv(CSV_FILE)

# Bookkeeping for the request throttle: calls made in the current window and
# the moment that window started
api_calls = 0
start_time = time.time()

# Visit every row (with a progress bar), querying the API only where needed
for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Updating ROR IDs"):
    if not pd.isna(row['ROR ID']):
        continue  # already has a ROR ID, no API call needed

    ror_id = search_institution(row['Primary Org Name'])
    if ror_id:
        df.at[index, 'ROR ID'] = ror_id
        # Persist immediately so an interrupted run keeps the IDs found so far
        df.to_csv(CSV_FILE, index=False)

    # Throttle: nothing to do until RATE_LIMIT calls have been made
    api_calls += 1
    if api_calls < RATE_LIMIT:
        continue

    # Wait out whatever is left of the current window, then start a new one
    sleep_time = TIME_WINDOW - (time.time() - start_time)
    if sleep_time > 0:
        print(f"Rate limit reached. Sleeping for {sleep_time:.2f} seconds...")
        time.sleep(sleep_time)
    api_calls = 0
    start_time = time.time()

print("CSV file updated successfully.")


Updating ROR IDs:   2%|▏         | 284/13365 [01:07<1:01:58,  3.52it/s]

In [None]:
import pandas as pd

# Read the matched results
file_path = "/home/rubiscol/metadata-extraction/matching_data/matched_results.csv"
df = pd.read_csv(file_path)

# A ROR ID counts as empty when it is NaN or, once rendered as text, contains
# nothing but whitespace
ror_ids = df["ROR ID"]
is_missing = ror_ids.isna()
is_blank = ror_ids.astype(str).str.strip().eq("")
empty_mask = is_missing | is_blank
empty_count = empty_mask.sum()

# Total number of rows in the results
total_rows = len(df)

# Share of rows with an empty ROR ID, as a percentage (0 for an empty table)
if total_rows > 0:
    percentage_empty = empty_count / total_rows * 100
else:
    percentage_empty = 0

print("Number of rows with empty ROR ID:", empty_count)
print("Percentage of rows with empty ROR ID: {:.2f}%".format(percentage_empty))


Number of rows with empty ROR ID: 130
Percentage of rows with empty ROR ID: 0.97%
