# OpenAlex Data Pipeline

Author: Alex Davis

Date: 08/26/2025

The purpose of this script is to ingest OSINT data from the OpenAlex API (https://docs.openalex.org/) and save it in a vector database in order to build an AI Research Chatbot.

## Load Packages

In [0]:
%pip install faiss-cpu
%pip install -qU langchain-community faiss-cpu
%pip install --upgrade --quiet  langchain langchain-huggingface sentence_transformers

In [0]:
%restart_python

In [0]:
#import packages
import pandas as pd
import requests
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document

## Import Data

In [0]:
def import_data(pages, start_year, end_year, search_terms):
    """
    Search OpenAlex works of type 'article' and return the matches as a dataframe.

    Inputs:
        - pages: int, number of result pages to request (200 works per page, starting at page 1)
        - start_year and end_year: int, bounds used in the 'publication_year:{start_year}-{end_year}' filter
        - search_terms: str, keywords to search for (must be formatted according to OpenAlex standards)

    Returns:
        A dataframe with one row per work, restricted to the columns listed in
        `columns` below. Row labels restart on every page because the per-page
        frames are concatenated as-is; reset the index if unique labels are needed.
        When no works are returned, the dataframe is empty but still has those columns.

    Raises:
        requests.HTTPError if the API answers with an error status.
    """

    #features kept from each work record
    columns = ["id", "title", "display_name", "publication_year", "publication_date",
               "type", "countries_distinct_count", "institutions_distinct_count",
               "has_fulltext", "cited_by_count", "keywords", "referenced_works_count",
               "abstract_inverted_index"]

    #query parameters shared by every page; requests URL-encodes them, so quotes
    #and spaces in search_terms cannot corrupt the query string
    query = {
        "per-page": 200,
        "filter": f"publication_year:{start_year}-{end_year},type:article",
        "search": search_terms,
    }

    page_frames = []

    #pages are 1-indexed, so the upper bound is pages + 1 to request exactly `pages` pages
    for page in range(1, pages + 1):

        response = requests.get("https://api.openalex.org/works",
                                params={**query, "page": page},
                                timeout=30)

        #fail loudly on an HTTP error instead of a confusing KeyError on 'results'
        response.raise_for_status()

        results = response.json()['results']

        #an empty page means the result set is exhausted
        if not results:
            break

        page_frames.append(pd.DataFrame(results))

    #nothing matched (or pages < 1): return an empty frame with the expected columns
    if not page_frames:
        return pd.DataFrame(columns=columns)

    #concatenate once at the end rather than growing a dataframe inside the loop
    return pd.concat(page_frames)[columns]

In [0]:
#query OpenAlex for AI-related articles published between 2018 and 2025
ai_search = import_data(
    pages=30,
    start_year=2018,
    end_year=2025,
    search_terms=(
        "'artificial intelligence' OR 'deep learn' OR 'neural net' "
        "OR 'natural language processing' OR 'machine learn' "
        "OR 'large language models' OR 'small language models'"
    ),
)

## Preprocess Data

In [0]:
#drop missing abstracts
#rows with no 'abstract_inverted_index' value are removed because the next
#step rebuilds each abstract's text from that column
ai_search = ai_search.dropna(subset=['abstract_inverted_index'])

In [0]:
def undo_inverted_index(inverted_index):
    """
    Rebuild the original text from an inverted index.

    Inputs:
        - inverted_index: dict mapping each word to the list of positions at
          which that word occurs in the text

    Returns:
        str, the words joined by single spaces in ascending position order.
        An empty dict, or a value that is not a dict (e.g. the None/NaN that
        marks a missing abstract), yields an empty string.
    """

    #missing or empty indexes have no words to restore
    if not isinstance(inverted_index, dict) or not inverted_index:
        return ""

    #flatten to one (position, word) pair per occurrence of each word
    occurrences = [(position, word)
                   for word, positions in inverted_index.items()
                   for position in positions]

    #sort on position only; the sort is stable, so words sharing a position keep dict order
    occurrences.sort(key=lambda occurrence: occurrence[0])

    #keep only the words, now in reading order
    return ' '.join(word for _, word in occurrences)

In [0]:
#rebuild each abstract's plain text from its inverted index and store it as 'original_abstract'
ai_search['original_abstract'] = [
    undo_inverted_index(abstract_index)
    for abstract_index in ai_search['abstract_inverted_index']
]

In [0]:
#reset index
#import_data concatenates the per-page frames without renumbering and the dropna
#above removes rows, so the row labels are replaced with a fresh 0..n-1 range
#(drop=True discards the old labels instead of keeping them as a column)
ai_search = ai_search.reset_index(drop=True)

## Load Embeddings

In [0]:
#load the "thenlper/gte-small" embedding model through langchain's HuggingFaceEmbeddings wrapper;
#this object is used below both to size the FAISS index and as the vector store's embedding function
embeddings = HuggingFaceEmbeddings(model_name="thenlper/gte-small")

## Create Vector Database

In [0]:
#measure the embedding dimension by embedding a sample query,
#then create a flat L2 FAISS index of that dimension
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

In [0]:
#wrap the FAISS index in a langchain vector store, starting from an empty
#in-memory docstore and an empty index-to-docstore-id mapping
docstore = InMemoryDocstore()
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    docstore=docstore,
    index_to_docstore_id={},
)

In [0]:
#format abstracts as documents, carrying each work's title and publication year as metadata
#the three columns are walked in lockstep, so the pairing does not depend on the
#dataframe's row labels; .tolist() yields plain Python values rather than numpy scalars
documents = [
    Document(page_content=abstract, metadata={"title": title, "year": year})
    for abstract, title, year in zip(
        ai_search['original_abstract'].tolist(),
        ai_search['title'].tolist(),
        ai_search['publication_year'].tolist(),
    )
]

In [0]:
#create one string id per row of ai_search: "1", "2", ..., str(n)
#(the original referenced an undefined name, my_list, and raised NameError)
n = len(ai_search)
ids = [str(doc_number) for doc_number in range(1, n + 1)]

In [0]:
#add documents to vector store
#the documents are passed together with the parallel list of string ids built above
vector_store.add_documents(documents=documents, ids=ids)

## Test and Save Vector Database

In [0]:
#test that vector database is working
#smoke test: request the 3 stored documents most similar to the query "computer vision"
vector_store.similarity_search("computer vision", k=3)

In [0]:
#write the vector store to the relative path "Data/faiss_index"
#(presumably so the chatbot described at the top can load it later; confirm with the consumer)
vector_store.save_local("Data/faiss_index")