In [27]:
import pandas as pd

# Read the Scopus export that is to be cleaned.
# NOTE(review): the cleaned frame is written back to this same path at the end
# of the cell, so the unfiltered export is not kept on disk.
df = pd.read_csv("data/2201.00_scopus_931.csv")

# Blacklisted parent organizations: one name per line, blank lines skipped.
with open("data/blacklist_parent_organizations.txt", "r", encoding="utf-8") as f:
    blacklist = [name for name in map(str.strip, f) if name]

# Lowercased copy so the exact-name comparison below ignores case.
blacklist_lower = [org.lower() for org in blacklist]

# Column holding the organization name that every rule below inspects.
institution_col = 'Primary Org Name'

# Lowercase the names once and reuse them for all three rules.
org_names_lower = df[institution_col].str.lower()

# Rule 1: the whole name equals a blacklisted name.
mask_blacklist = org_names_lower.isin(blacklist_lower)

# Rule 2: the name mentions both "university" and "system"
# (missing names count as no match).
mentions_university = org_names_lower.str.contains('university', na=False)
mentions_system = org_names_lower.str.contains('system', na=False)
mask_university_system = mentions_university & mentions_system

# Rule 3: the name mentions "government of india".
mask_govt_india = org_names_lower.str.contains('government of india', na=False)

# A row is dropped as soon as any one rule flags it.
mask_remove = mask_blacklist | mask_university_system | mask_govt_india

# Keep the unflagged rows as an independent frame with a fresh 0..n-1 index.
df_filtered = df[~mask_remove].copy()
df_filtered.reset_index(drop=True, inplace=True)

# Persist the cleaned rows (without the index column).
df_filtered.to_csv("data/2201.00_scopus_931.csv", index=False)

print("Filtered DataFrame:")
print(df_filtered)


Filtered DataFrame:
      Primary Org Id                              Primary Org Name  \
0           60030612           University of California, San Diego   
1           60017604  Siberian Branch, Russian Academy of Sciences   
2           60023932               University of Technology Sydney   
3           60016849                         University of Toronto   
4           60024542          South China University of Technology   
...              ...                                           ...   
2896        60017246           Eberhard Karls Universität Tübingen   
2897        60123796                         Université Paris Cité   
2898        60123796                         Université Paris Cité   
2899        60025578                             Xidian University   
2900        60005816                 South China Normal University   

     Primary Org City   Primary Org State Primary Org Country      ArXiv Id  \
0            La Jolla                  CA       United State

In [28]:
import os
import pandas as pd
import json

# Inputs: the Scopus rows and the table providing a 'ROR ID' per 'Primary Org Id'.
scopus_data = pd.read_csv('data/2201.00_scopus_931.csv')
ror_mapping = pd.read_csv('matching_data/matched_results_ror_api.csv')

# Left join keeps every Scopus row; rows whose org has no mapping get a null 'ROR ID'.
merged_data = scopus_data.merge(ror_mapping, how='left', on='Primary Org Id')

# Report how many merged rows ended up without a ROR ID, and the share that did match.
unmatched_count = merged_data['ROR ID'].isna().sum()
match_rate = 1 - unmatched_count / len(merged_data)
print(f"Number of unmatched 'Primary Org Id' values: {unmatched_count}")
print("Match rate:", match_rate)


def _non_null_ror_ids(ror_ids):
    """Return the non-null entries of one paper's 'ROR ID' values as a list."""
    return ror_ids.dropna().tolist()


# One entry per 'ArXiv Id': the list of ROR IDs found on that paper's rows.
result = merged_data.groupby('ArXiv Id')['ROR ID'].apply(_non_null_ror_ids).to_dict()

# Make sure the target folder is there before writing into it.
output_dir = 'data/2201.00_scopus_931'
os.makedirs(output_dir, exist_ok=True)

# Write the per-paper mapping as indented JSON.
output_path = os.path.join(output_dir, 'groundTruth.json')
with open(output_path, 'w') as f:
    json.dump(result, f, indent=4)

print(f"JSON export completed. The output is saved as '{output_path}'.")


Number of unmatched 'Primary Org Id' values: 27
Match rate: 0.9906928645294726
JSON export completed. The output is saved as 'data/2201.00_scopus_931/groundTruth.json'.


In [None]:
import pandas as pd
from extractors.trie_extractor import TrieExtractor
from utils.file_reader import read_file
from tqdm import tqdm  # Import tqdm for progress bar

# Shared extractor instance; later cells call extractor.extract_affiliations(text) on it.
# NOTE(review): judging by the argument names, data_path is the ROR reference table and
# common_words_path a list of common English words -- confirm against TrieExtractor.
extractor = TrieExtractor(
    data_path="data/1.34_extracted_ror_data.csv",
    common_words_path="data/common_english_words.txt",
)

In [None]:
def extract_ror_id(paper_id):
    """Run the affiliation extractor on the text file belonging to one paper.

    Args:
        paper_id: Identifier used to build the path
            ``data/2201_00_text/<paper_id>.txt``.

    Returns:
        Whatever ``extractor.extract_affiliations`` yields for the file's text,
        with a ``set`` converted to a ``list``. ``None`` is returned when the
        extractor yields a falsy value or when the text file does not exist
        (a "File not found" line is printed in the latter case).
    """
    file_path = f"data/2201_00_text/{paper_id}.txt"
    try:
        found = extractor.extract_affiliations(read_file(file_path))
        # Nothing extracted: report it as None rather than an empty container.
        if not found:
            return None
        # Sets are turned into lists; any other container is passed through untouched.
        if isinstance(found, set):
            return list(found)
        return found
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None

# This cell uses os and json (and now functools); import them here so it does not
# depend on an earlier, unrelated cell having been executed in the same kernel.
import functools
import json
import os

# Path to the CSV file containing the "ArXiv Id" column
csv_path = "data/2201.00_scopus_931.csv"
df = pd.read_csv(csv_path)

# The CSV can list the same paper on several rows (the ground-truth cell groups
# rows by 'ArXiv Id'), while the result below keeps a single entry per paper.
# Memoizing by ArXiv Id reads and scans each paper's text file only once and
# prints a missing-file message once per paper instead of once per row.
# The cache is rebuilt on every run of this cell, so re-running picks up new files.
cached_extract_ror_id = functools.lru_cache(maxsize=None)(extract_ror_id)

# Apply the (memoized) extraction to each row's ArXiv Id with a progress bar
tqdm.pandas()
df['Extracted ROR ID'] = df['ArXiv Id'].progress_apply(cached_extract_ror_id)

# Map each ArXiv Id to its extracted ROR ID(s); for a repeated ArXiv Id the
# last row wins, and all of its rows carry the same extraction anyway.
result = df.set_index('ArXiv Id')['Extracted ROR ID'].to_dict()

# Ensure the output directory exists; create it if not
output_dir = 'data/2201.00_scopus_931'
os.makedirs(output_dir, exist_ok=True)

# Save the dictionary to a JSON file with keys ordered
output_json_path = os.path.join(output_dir, 'result.json')
with open(output_json_path, 'w') as f:
    json.dump(result, f, indent=4, sort_keys=True)

print(f"Extraction complete. JSON saved as {output_json_path}.")


In [None]:
import os
import json

# Where the extractor output and the reference labels live.
folder = 'data/2201.00_scopus_931'
result_path = os.path.join(folder, 'result.json')
ground_truth_path = os.path.join(folder, 'groundTruth.json')

# Read both JSON mappings (ArXiv Id -> ROR ID value).
with open(result_path, 'r') as f:
    result_data = json.load(f)

with open(ground_truth_path, 'r') as f:
    ground_truth_data = json.load(f)


def _as_id_set(value):
    """Turn a JSON value into a set: None -> empty, list -> its items, anything else -> singleton."""
    if value is None:
        return set()
    if isinstance(value, list):
        return set(value)
    return {value}


# Running totals over all papers present in the ground truth.
total_ror_ids = 0
correct_extractions = 0
wrong_extractions = 0

for arxiv_id, true_val in ground_truth_data.items():
    # Papers whose ground truth is None contribute nothing to any counter.
    if true_val is None:
        continue

    gt_set = _as_id_set(true_val)
    res_val = result_data.get(arxiv_id)  # None when the paper is absent from the results
    extracted_set = _as_id_set(res_val)

    # Expected IDs, expected IDs that were found, and found IDs that were not expected.
    total_ror_ids += len(gt_set)
    correct_extractions += len(gt_set.intersection(extracted_set))
    wrong_extractions += len(extracted_set.difference(gt_set))

# Both rates are relative to the number of ground-truth IDs; 0 when there are none.
if total_ror_ids > 0:
    accuracy = correct_extractions / total_ror_ids
    wrong_extraction_rate = wrong_extractions / total_ror_ids
else:
    accuracy = 0
    wrong_extraction_rate = 0

print("Total ground truth ROR IDs:", total_ror_ids)
print("Correct extractions:", correct_extractions)
print("Wrong extractions:", wrong_extractions)
print("Accuracy: {:.2%}".format(accuracy))
print("Wrong extraction rate: {:.2%}".format(wrong_extraction_rate))


In [None]:
import os
import json
import pandas as pd

# Folder holding both JSON mappings and receiving the mismatch report.
folder = 'data/2201.00_scopus_931'
result_path = os.path.join(folder, 'result.json')
ground_truth_path = os.path.join(folder, 'groundTruth.json')

# Read the extractor output and the reference labels.
with open(result_path, 'r') as f:
    result_data = json.load(f)

with open(ground_truth_path, 'r') as f:
    ground_truth_data = json.load(f)

# One dict per paper whose extraction differs from the ground truth.
rows = []

for arxiv_id, gt_val in ground_truth_data.items():
    # Normalize the reference value: None -> empty set, list -> its items, scalar -> singleton.
    gt_set = (
        set() if gt_val is None
        else set(gt_val) if isinstance(gt_val, list)
        else {gt_val}
    )

    # Same normalization for the extracted value (None also covers "paper not in results").
    res_val = result_data.get(arxiv_id)
    extracted_set = (
        set() if res_val is None
        else set(res_val) if isinstance(res_val, list)
        else {res_val}
    )

    missing = gt_set - extracted_set  # expected but not extracted
    extra = extracted_set - gt_set    # extracted but not expected

    # Perfect matches are not reported.
    if not (missing or extra):
        continue

    rows.append({
        'ArXiv Id': arxiv_id,
        'Ground Truth ROR IDs': sorted(gt_set),
        'Extracted ROR IDs': sorted(extracted_set),
        'Missing ROR IDs': sorted(missing),
        'Extra ROR IDs': sorted(extra)
    })

# Tabulate the mismatches and write them next to the JSON files.
df = pd.DataFrame(rows)

output_csv_path = os.path.join(folder, 'mismatched_cases.csv')
df.to_csv(output_csv_path, index=False)

print(f"Mismatched cases CSV saved as {output_csv_path}")


In [None]:
from collections import Counter

# Relies on `rows`, the list of mismatch dicts built by the mismatch-export cell.

# Tally how often each ROR ID shows up as missing / as extra across all mismatched papers.
missing_counter = Counter(
    ror_id for row in rows for ror_id in row['Missing ROR IDs']
)
extra_counter = Counter(
    ror_id for row in rows for ror_id in row['Extra ROR IDs']
)

# Five most frequent offenders of each kind.
most_common_missing = missing_counter.most_common(5)
most_common_extra = extra_counter.most_common(5)

# Print both rankings with the same layout; the second heading starts with a blank line.
for heading, ranking in (
    ("Most frequent missing ROR IDs (Top 5):", most_common_missing),
    ("\nMost frequent extra ROR IDs (Top 5):", most_common_extra),
):
    print(heading)
    for ror_id, count in ranking:
        print(f"ROR ID: {ror_id}, Count: {count}")
