In [13]:
import pandas as pd

# Read the Scopus export and the fuzzy-matched organisation -> ROR lookup table
scopus_data = pd.read_csv('data/2201.00_scopus_931.csv')
ror_mapping = pd.read_csv('matching_data/matched_results_fuzzy.csv')

# Left join on 'Primary Org Id' so every Scopus row survives, matched or not.
# NOTE(review): duplicate 'Primary Org Id' values in ror_mapping would duplicate
# Scopus rows in the join — confirm the lookup table is unique per organisation.
merged_data = scopus_data.merge(ror_mapping, how='left', on='Primary Org Id')

# Rows (not distinct organisations) that received no ROR ID from the lookup
unmatched_count = merged_data['ROR ID'].isna().sum()
row_count = len(merged_data)

print(f"Number of unmatched 'Primary Org Id' values: {unmatched_count}")
# Fraction of rows that did receive a ROR ID
print(1-unmatched_count/row_count)

# Persist the joined table; the later cells read it back from this path
merged_data.to_csv('data/2201.00_scopus_931_with_ror.csv', index=False)

print("Matching completed. The output is saved as 'data/2201.00_scopus_931_with_ror.csv'.")

Number of unmatched 'Primary Org Id' values: 24
0.9930090300029129
Matching completed. The output is saved as 'data/2201.00_scopus_931_with_ror.csv'.


In [3]:
import os
import pandas as pd

# Paths
csv_path = 'data/2201.00_scopus_931_with_ror.csv'
text_folder = 'data/2201_00_text'

# Read the CSV file
df = pd.read_csv(csv_path)

# Distinct, non-null ArXiv IDs; each one is expected to have '<id>.txt' in text_folder.
# NOTE(review): if the IDs are purely numeric (e.g. 2201.00010) pandas may parse the
# column as float and drop trailing zeros from the file name — confirm the column dtype.
arxiv_ids = df['ArXiv Id'].dropna().unique()

# IDs whose text file is absent from text_folder
missing_files = [
    arxiv_id
    for arxiv_id in arxiv_ids
    if not os.path.isfile(os.path.join(text_folder, f"{arxiv_id}.txt"))
]

total_count = len(arxiv_ids)
found_count = total_count - len(missing_files)

# Guard the division: an empty or all-NaN ID column would otherwise raise ZeroDivisionError
percentage = (found_count / total_count) * 100 if total_count else 0.0

# Display results
print(f"Total ArXiv IDs: {total_count}")
print(f"Found text files: {found_count}")
print(f"Percentage: {percentage:.2f}%")
print("\nMissing files:")
for missing_id in missing_files:
    print(missing_id)

Total ArXiv IDs: 931
Found text files: 931
Percentage: 100.00%

Missing files:


In [1]:
import pandas as pd

# Input table, output table, and the subset of columns carried over
file_path = 'data/2201.00_scopus_931_with_ror.csv'
output_file_path = 'data/preprocessed_scopus_data.csv'
columns_to_keep = ['Primary Org Id', 'Primary Org Name_x', 'ArXiv Id', 'ROR ID']

# Read the full table, then keep only the listed columns (KeyError if one is absent)
df = pd.read_csv(file_path)
df_selected = df.loc[:, columns_to_keep]

# Write the trimmed table without the index column
df_selected.to_csv(output_file_path, index=False)

print(f"Preprocessed data saved to {output_file_path}")

Preprocessed data saved to data/preprocessed_scopus_data.csv


In [4]:
import pandas as pd
from extractors.trie_extractor import TrieExtractor
from utils.file_reader import read_file
from tqdm import tqdm  # Import tqdm for progress bar

# Read the ROR-annotated Scopus table; csv_path and df are reused by the extraction cell
csv_path = "data/2201.00_scopus_931_with_ror.csv"
df = pd.read_csv(csv_path)

# Build the TrieExtractor from the ROR data file and the common-English-words file
extractor = TrieExtractor(
    data_path="data/1.34_extracted_ror_data.csv",
    common_words_path="data/common_english_words.txt",
)

In [7]:
# Run the affiliation extractor over the text file of a single paper
def extract_ror_id(paper_id):
    """Return the extractor's result for one paper's text file, or None.

    Reads ``data/2201_00_text/<paper_id>.txt`` via ``read_file`` and passes the
    text to ``extractor.extract_affiliations``. The complete result is returned
    unchanged (not just a first element); an empty/falsy result yields None.
    A missing file is reported on stdout and also yields None.
    """
    file_path = f"data/2201_00_text/{paper_id}.txt"
    try:
        affiliations = extractor.extract_affiliations(read_file(file_path))
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    # Normalise "nothing found" to None so the column holds a missing value
    if not affiliations:
        return None
    return affiliations

# Register tqdm's progress_apply on pandas objects
tqdm.pandas()

# Run the extractor once per row (with a progress bar) and attach the results as a new column
extraction_results = df['ArXiv Id'].progress_apply(extract_ror_id)
df['Extracted ROR ID'] = extraction_results

# Write the augmented table back to csv_path, overwriting the file it was read from
df.to_csv(csv_path, index=False)

print("Extraction complete. Updated CSV saved.")

100%|██████████| 3433/3433 [00:09<00:00, 354.55it/s]

Extraction complete. Updated CSV saved.





In [9]:
import pandas as pd

# Step 1: Load the ROR-annotated table (including the 'Extracted ROR ID' column) from disk
annotated_csv = 'data/2201.00_scopus_931_with_ror.csv'
df = pd.read_csv(annotated_csv)

# Function to preprocess Extracted ROR ID
def preprocess_extracted_ror_id(extracted_ror_id):
    """Parse one 'Extracted ROR ID' cell into a set of ID strings.

    The cell is expected to hold the text form of a collection as it was
    written to CSV, e.g. ``"{'id1', 'id2'}"``.

    Args:
        extracted_ror_id: The raw cell value: a string, or a missing value
            (NaN/None).

    Returns:
        set: The individual IDs with braces, surrounding whitespace and
        single or double quotes removed. Missing values, ``"{}"`` and the
        literal ``"set()"`` all give an empty set.
    """
    if pd.isna(extracted_ror_id):  # Missing cell: nothing was extracted
        return set()
    text = extracted_ror_id.strip()
    # repr() of an empty set is "set()", which contains no braces to strip
    if text == "set()":
        return set()
    # Drop the enclosing braces, split on commas, then unquote each token
    tokens = (tok.strip().strip("'\"") for tok in text.strip("{}").split(","))
    # Filter after unquoting so blank or empty-quoted tokens never become IDs
    return {tok for tok in tokens if tok}

# Step 2: Collapse to one row per ArXiv ID, pooling both ID columns into sets
set_builders = {
    # Ground-truth IDs: every non-null 'ROR ID' seen for the paper
    'ROR ID': lambda col: set(col.dropna()),
    # Extracted IDs: parse each cell, then union the per-row sets
    'Extracted ROR ID': lambda col: set().union(*col.apply(preprocess_extracted_ror_id)),
}
grouped = df.groupby('ArXiv Id').agg(set_builders).reset_index()

# Step 3: Tally recovered and spurious IDs over papers that have ground truth
total_ror_ids = 0
correct_extractions = 0
wrong_extractions = 0

for truth, found in zip(grouped['ROR ID'], grouped['Extracted ROR ID']):
    if not truth:  # Papers without any ground-truth ROR ID are not scored
        continue
    total_ror_ids += len(truth)
    correct_extractions += len(truth & found)
    wrong_extractions += len(found - truth)  # Extracted but absent from ground truth

# Both rates are normalised by the number of ground-truth IDs:
# "accuracy" is the share of ground-truth IDs that were recovered,
# "wrong extraction rate" is spurious IDs per ground-truth ID.
if total_ror_ids > 0:
    accuracy = correct_extractions / total_ror_ids
    wrong_extraction_rate = wrong_extractions / total_ror_ids
else:
    accuracy = 0
    wrong_extraction_rate = 0

print(f"Accuracy: {accuracy:.2f}")
print(f"Wrong Extraction Rate: {wrong_extraction_rate:.2f}")

Accuracy: 0.41
Wrong Extraction Rate: 0.17


In [10]:
import pandas as pd

# Step 1: Load the ROR-annotated table (including the 'Extracted ROR ID' column) from disk
annotated_csv = 'data/2201.00_scopus_931_with_ror.csv'
df = pd.read_csv(annotated_csv)

# Function to preprocess Extracted ROR ID
def preprocess_extracted_ror_id(extracted_ror_id):
    """Parse one 'Extracted ROR ID' cell into a set of ID strings.

    The cell is expected to hold the text form of a collection as it was
    written to CSV, e.g. ``"{'id1', 'id2'}"``.

    Args:
        extracted_ror_id: The raw cell value: a string, or a missing value
            (NaN/None).

    Returns:
        set: The individual IDs with braces, surrounding whitespace and
        single or double quotes removed. Missing values, ``"{}"`` and the
        literal ``"set()"`` all give an empty set.
    """
    if pd.isna(extracted_ror_id):  # Missing cell: nothing was extracted
        return set()
    text = extracted_ror_id.strip()
    # repr() of an empty set is "set()", which contains no braces to strip
    if text == "set()":
        return set()
    # Drop the enclosing braces, split on commas, then unquote each token
    tokens = (tok.strip().strip("'\"") for tok in text.strip("{}").split(","))
    # Filter after unquoting so blank or empty-quoted tokens never become IDs
    return {tok for tok in tokens if tok}

# Step 2: Collapse to one row per ArXiv ID, pooling both ID columns into sets
grouped = (
    df.groupby('ArXiv Id')
    .agg({
        # Ground-truth IDs: every non-null 'ROR ID' seen for the paper
        'ROR ID': lambda col: set(col.dropna()),
        # Extracted IDs: parse each cell, then union the per-row sets
        'Extracted ROR ID': lambda col: set().union(*col.apply(preprocess_extracted_ror_id)),
    })
    .reset_index()
)

# Step 3: Keep every paper whose extracted set differs from its ground-truth set.
# Papers with an empty ground-truth set are included when anything was extracted for them.
mismatched_cases = [
    {
        'ArXiv Id': arxiv_id,
        'Ground Truth ROR IDs': truth,
        'Extracted ROR IDs': found,
        'Missing ROR IDs': truth - found,  # In ground truth, not extracted
        'Extra ROR IDs': found - truth,  # Extracted, not in ground truth
    }
    for arxiv_id, truth, found in zip(
        grouped['ArXiv Id'], grouped['ROR ID'], grouped['Extracted ROR ID']
    )
    if truth != found
]

# One row per mismatching paper, written next to the notebook
mismatched_df = pd.DataFrame(mismatched_cases)
mismatched_df.to_csv('mismatched_cases.csv', index=False)

print(f"Total mismatched cases: {len(mismatched_df)}")
print("Mismatched cases saved to 'mismatched_cases.csv'")

Total mismatched cases: 693
Mismatched cases saved to 'mismatched_cases.csv'


In [11]:
# Step 4: Rank the ROR IDs that are most often missed or spuriously extracted
from collections import Counter

# Pool the per-paper difference sets from mismatched_cases into two flat lists
missing_ror_ids = []
extra_ror_ids = []
for case in mismatched_cases:
    missing_ror_ids.extend(case['Missing ROR IDs'])
    extra_ror_ids.extend(case['Extra ROR IDs'])

# Occurrence count per ROR ID
missing_ror_id_counts = Counter(missing_ror_ids)
extra_ror_id_counts = Counter(extra_ror_ids)

# How many entries to show in each ranking; adjust as needed
top_n = 10
most_common_missing = missing_ror_id_counts.most_common(top_n)
most_common_extra = extra_ror_id_counts.most_common(top_n)

# Print both rankings in the same format, missing first
for heading, ranking in (
    ("Most frequent missing ROR IDs:", most_common_missing),
    ("\nMost frequent extra ROR IDs:", most_common_extra),
):
    print(heading)
    for ror_id, count in ranking:
        print(f"ROR ID: {ror_id}, Count: {count}")

Most frequent missing ROR IDs:
ROR ID: https://ror.org/01bj3aw27, Count: 23
ROR ID: https://ror.org/030sjb889, Count: 22
ROR ID: https://ror.org/01hhn8329, Count: 20
ROR ID: https://ror.org/005ta0471, Count: 18
ROR ID: https://ror.org/04kdfz702, Count: 18
ROR ID: https://ror.org/00mmn6b08, Count: 17
ROR ID: https://ror.org/00z54nq84, Count: 17
ROR ID: https://ror.org/01rvn4p91, Count: 17
ROR ID: https://ror.org/03qbxj466, Count: 16
ROR ID: https://ror.org/00s19x989, Count: 15

Most frequent extra ROR IDs:
ROR ID: https://ror.org/013meh722, Count: 37
ROR ID: https://ror.org/00hx57361, Count: 14
ROR ID: https://ror.org/052gg0110, Count: 9
ROR ID: https://ror.org/02eb0rk31, Count: 9
ROR ID: https://ror.org/0257kt353, Count: 9
ROR ID: https://ror.org/01t466c14, Count: 8
ROR ID: https://ror.org/02dqehb95, Count: 7
ROR ID: https://ror.org/04ynn1b95, Count: 6
ROR ID: https://ror.org/04t4pcw17, Count: 6
ROR ID: https://ror.org/05k9pq902, Count: 6
