In [6]:
from importlib import reload
from openalex_matching import person_match
from collections import defaultdict
from openalex_matching import csv_handler
# Re-execute the csv_handler module in this kernel (presumably so that edits
# made to the installed package are picked up without a kernel restart).
# NOTE: reload() rebinds the module's own definitions, so any attribute that
# was monkey patched onto csv_handler earlier in the session (such as
# name_csv_reader) is replaced by the module's version and must be re-patched.
reload(csv_handler)


<module 'openalex_matching.csv_handler' from '/Users/bryanyuk/anaconda3/lib/python3.11/site-packages/openalex_matching/csv_handler.py'>

In [12]:
# Step 2: Define your updated version of name_csv_reader
import pandas as pd
def updated_name_csv_reader(inputFileName, columnName):
    """Read a single column from a CSV file and return it as a NumPy array.

    This is a best-effort reader: problems are reported with ``print`` and
    signalled to the caller by a ``None`` return value rather than by an
    exception, so callers must check for ``None`` before iterating.

    Parameters
    ----------
    inputFileName : str or os.PathLike
        Path of the CSV file. The name must end in ``.csv`` (case-insensitive).
    columnName : str
        Header of the column whose values should be returned.

    Returns
    -------
    numpy.ndarray or None
        The values of ``columnName`` in file order, or ``None`` when the file
        is not a ``.csv``, cannot be found or parsed, or lacks the column.
    """
    import os  # local import keeps this cell self-contained

    try:
        # Normalise path-like objects (e.g. pathlib.Path) to a plain string so
        # the extension check below works for them as well as for str input.
        file_path = os.fspath(inputFileName)

        # Check if file has a .csv extension
        if not file_path.lower().endswith(".csv"):
            raise ValueError("Invalid input file format, only CSV files are acceptable")

        # Read the CSV file
        df = pd.read_csv(file_path)

        # Check if the specified column exists in the CSV
        if columnName not in df.columns:
            raise ValueError(f"'{columnName}' column not found in input CSV file")

        return df[columnName].to_numpy()

    except FileNotFoundError:
        print(f"File '{inputFileName}' not found")
    except Exception as e:
        # Deliberately broad: the ValueErrors raised above and any pandas
        # parsing error are all reported the same way.
        print(f"An error occurred: {e}")

    # Every failure path ends here; make the "no data" result explicit.
    return None

# Step 3: Monkey patch the function in the module so that later calls to
# csv_handler.name_csv_reader(...) in this notebook use the updated reader.
# The patch only lives in the current kernel session; calling
# reload(csv_handler) again discards it, so re-run this cell after any reload.
csv_handler.name_csv_reader = updated_name_csv_reader

# OpenAlex API Search Strategy

# Strategy: 
1. Find the OpenAlex Institution ID:
    - First, search for the OpenAlex ID of the desired institution using the institution's name.

2. Search for Authors by Name and Institution:
    - Next, search for authors by their display name, filtering results by the institution's OpenAlex ID. If no results are found:
        - Search through nicknames associated with the author's first name.
        - If the input name consists of two capital letters (indicating initials), attempt an initial-based search.
        - If that still returns no results, try searching by first initial and last name.
    - The search will return one of three enumerated types:
        - EXACT_NAME: Indicates an exact match with the full name.
        - FIRST_MIDDLE_INITIAL: Indicates a match using first and middle initials.
        - FIRST_INITIAL: Indicates a match using only the first initial.
    - The filtering method will adjust based on the type of search used.
3. Filter and Select the Best Matched Author:
    - Depending on the type of search used, filter the list of matched OpenAlex IDs:
        - Exact Name Search (EXACT_NAME): Use fuzzy string matching to eliminate names that deviate too much from the input author name. Select the author with the highest fuzzy match score. In case of a tie in fuzzy match scores, choose the author with the higher citation count.
        - Initial Search (FIRST_MIDDLE_INITIAL or FIRST_INITIAL): Ensure that the first initial of the input author name matches the first initial of each returned author name, and filter out any mismatches. Then, select the most highly cited author.
        - For searches using only one initial (FIRST_INITIAL), apply a stricter fuzzy matching threshold since these searches are more prone to errors. In case of a tie, select the author with more citations.


In [2]:
# Step One: look up the OpenAlex institution ID from the institution's name.
university_name = 'University of Virginia'
# university_id is reused by the author search and selection cells below.
university_id = person_match.institution_id_openalex(university_name)
print(university_id)

I51556381


In [3]:
# Step Two: search for candidate authors by name, restricted to the institution.
person_name = "Ray Balkrishnan"
# Returns the candidate OpenAlex author IDs, the kind of search that produced
# them (EXACT_NAME / FIRST_MIDDLE_INITIAL / FIRST_INITIAL per the strategy
# notes above), and match_person_name -- presumably the name variant that
# yielded the matches (TODO confirm against person_match).
list_of_ids, type_search_conducted, match_person_name = person_match.list_person_ids_openalex(person_name, university_id)
print(list_of_ids)




['A5015605411', 'A5111842581', 'A5029765901']


In [4]:
# Step Three: filter the candidates and pick the single best-matching author.
# Note that the name passed here is match_person_name from Step Two, not the
# raw input name; the search type tells choose_person which filter to apply.
openalex_id = person_match.choose_person(list_of_ids, match_person_name, university_id, type_search_conducted)
print(openalex_id)

A5029765901


## Example inputting and outputting CSV files

In [None]:
# Batch example: read author names from a CSV file, resolve each one to an
# OpenAlex author ID at the given institution, and write the pairs back out.
university_name='university of virginia'
university_id = person_match.institution_id_openalex(university_name)
fileName = 'OpenAlex Names.csv'
namesArray = csv_handler.name_csv_reader(fileName, 'Names') # Reading all names from CSV file into an array

# The reader patched in above prints a message and returns None on failure;
# stop here with a clear error instead of failing later when iterating None.
if namesArray is None:
    raise ValueError(f"Could not read the 'Names' column from '{fileName}'")

print(namesArray)
IDArray = [] # Array that will contain ids of corresponding authors
dataDict = defaultdict(list) # Dictionary that will be written to output CSV file

# Iterating through names array, finding corresponding OpenAlex ID for each author
for name in namesArray:
    person_ids, type_search_conducted, match_person_name = person_match.list_person_ids_openalex(name, university_id)
    # Pass the name returned by the search (as in Step Three above), not the raw
    # input name, so selection is scored against the variant that was searched.
    selectID = person_match.choose_person(person_ids, match_person_name, university_id, type_search_conducted)
    IDArray.append(selectID)

dataDict["Names"] = namesArray # Creating Names column for CSV file
dataDict["OpenAlexID"] = IDArray # Creating corresponding IDs column for CSV file
csv_handler.name_csv_writer("output.csv", dataDict)

['Gabrielle Adams' 'Nafisa Ahmed' 'Sonia Alconini' 'Negin Alemazkoor'
 'Harsh Anand' 'Elizabeth Andrews' 'Anthony Artuso' 'Mary Asare-Ado'
 'Ehsan Baharlou' 'Teagan Baiotto' 'Cora Baird' 'Prasanna Balachandran'
 'Ray Balkrishnan' 'Lawrence Band' 'Ellen Bassett' 'Tim Beatley'
 'Jeffrey Bennett' 'Peter Berg' 'Emily Bernhardt' 'Alice Besterman'
 'Lori Bird' 'Shannon Blevins' 'Swatah Borkotoky' 'Hanne Borstlap'
 'Allison Bradshaw' 'Jeanine Braithwaite' 'JD Brown' 'Kelly Bulin'
 'Mark Buntaine' 'Matthew Burtner' 'Dawn Byrd' 'Liheng Cai'
 'Brad Campbell' 'Jonathan Cannon' 'Brad Cantrell' 'Kerrie Carfagno'
 'Ann Carlton' 'Kathy Carmody' 'Sergio Casas' 'Max Castorani' 'Dong Chen'
 'Donna Chen' 'Leena Cho' 'Michele Claibourn' 'Andres Clarens'
 'Tanya Cobb' 'Ronald Cohen' 'Jonathan Colmer' 'Lisa Colosi'
 'Josh Colston' 'Ben Converse' 'Nina Copeland' 'Sheila Crane'
 'Phoebe Crisman' 'Teresa Culver' "Paolo D'Odorico" 'Robert Davis'
 'Brian Davis' 'Peter Debaere' 'Cahterine Debbas' 'Pam DeGuzman'
 