#### 1. Get the Title - Keyword positive pairs

In [None]:
import requests
import json
import time

def fetch_and_store_data(url, output_file):
    # Fetch data from the URL
    response = requests.get(url)
    data = response.json()

    # Extract relevant information and store in the desired format
    results = data.get("results", [])
    formatted_data = []

    for result in results: 
        title = result.get("title", "")
        keywords = result.get("keywords", [])

        if keywords:
            # Extracting the top-scored keyword text
            top_keyword = keywords[0].get("keyword", "")
            # Creating the structure {"texts": ["title", "key"]}
            formatted_entry = {"texts": [title, top_keyword]}
            formatted_data.append(formatted_entry)

    # Write to a .jsonl file
    with open(output_file, "a") as jsonl_file:
        for entry in formatted_data:
            jsonl_file.write(json.dumps(entry) + "\n")

        # Delay for 1 second before the next API call due to rate limit
        # time.sleep(delay) 

if __name__ == "__main__":
    # Specify the URL and output file
    # api_url = "https://api.openalex.org/works?mailto=brookshum24@gmail.com&sample=10000&seed=3&select=title,keywords"
    # Initialize counters
    current_page = 1
    total_entries_count = 0
    target_count = 10000
    # Specify the URL and output file
    output_jsonl_file = "title_key.jsonl"

    # Loop until the target count is reached
    while total_entries_count < target_count: 
        api_url = f"https://api.openalex.org/works?mailto=brookshum24@gmail.com&page={current_page}&per_page=25&select=title,keywords"
        # Fetch and store the data
        fetch_and_store_data(api_url, output_jsonl_file)
        current_page +=1
        total_entries_count += 25
    print(f"Total entries written to {output_jsonl_file}: {total_entries_count}")

#### 2. Get the Title - Abstract positive pairs

In [7]:
import requests
import json
import time

def fetch_and_store_data(url, output_file):
    # Fetch data from the URL
    response = requests.get(url)
    data = response.json()

    # Extract relevant information and store in the desired format
    results = data.get("results", [])
    formatted_data = []
    
    for result in results:
        title = result.get("title", "")
        abstract_index = result.get("abstract_inverted_index")
        
        # Skip processing if abstract_inverted_index is None
        if abstract_index is None:
            continue
       
        # Concatenate all keys from the abstract_inverted_index using spaces
        abstract_text = " ".join(key for key in abstract_index.keys())

        # Creating the structure {"texts": ["title", "abstract"]}
        formatted_entry = {"texts": [title, abstract_text]}
        formatted_data.append(formatted_entry)

    # Write to a .jsonl file
    with open(output_file, "a") as jsonl_file:
        for entry in formatted_data:
            jsonl_file.write(json.dumps(entry) + "\n") 

if __name__ == "__main__":
    # Initialize counters
    current_page = 1
    total_entries_count = 0
    target_count = 10000
    # Specify the URL and output file
    output_jsonl_file = "title_abstract.jsonl"

    # Loop until the target count is reached
    while total_entries_count < target_count: 
        api_url = f"https://api.openalex.org/works?mailto=brookshum24@gmail.com&page={current_page}&per_page=25&select=title,abstract_inverted_index"
        # Fetch and store the data
        fetch_and_store_data(api_url, output_jsonl_file)
        current_page +=1
        total_entries_count += 25
    print(f"Total entries written to {output_jsonl_file}: {total_entries_count}")

Total entries written to title_abstract.jsonl: 10000


#### 3. Get the expanded Referenced, Related and Ngrams values from the Work object and save them as a dataframe [max 3 values each]

##### Import necessary packages

##### SQLite for in-browser Database

In [1]:
! pip install db-sqlite3



##### [*Voyage AI*](https://docs.voyageai.com/docs/introduction) provides cutting-edge embedding/vectorization models. It currently ranks first on the clustering task of the MTEB leaderboard.

In [2]:
!pip install voyageai



#### Fetch a 20k sample and extract the expanded form of the Referenced, Related and Ngrams fields of the Work object

In [3]:
import requests
import json
import pandas as pd

# Function to fetch and process referenced works
def fetch_extract_works(work_id):
    # Initialize an empty list to store referenced work titles
    referenced_work_titles = []
    related_work_titles = []
    ngrams_list = []

    # Fetch the work details
    work_url = f"https://api.openalex.org/works/{work_id}"
    response = requests.get(work_url)

    try:
        response.raise_for_status()
        work_details = response.json()
    except Exception as e:
        return None, None, None, None

    work_title = work_details.get("title")

    # Extract and process referenced works
    referenced_works = work_details.get("referenced_works", [])[:3]  # Limit to the first 3 references
    for referenced_work_url in referenced_works:
        # Extract the last section of the work ID from the URL
        last_section = referenced_work_url.rsplit("/", 1)[-1]

        # Construct the referenced_work_api_url
        referenced_work_api_url = f"https://api.openalex.org/works/{last_section}"

        # Fetch details of the referenced work
        referenced_work_response = requests.get(referenced_work_api_url)

        try:
            referenced_work_response.raise_for_status()
            referenced_work_details = referenced_work_response.json()
        except Exception as e:
            referenced_work_titles.append("")
            continue

        # Extract title as a string and add to the list
        referenced_work_title = str(referenced_work_details.get("title", ""))
        referenced_work_titles.append(referenced_work_title)

    # Extract and process related works
    related_works = work_details.get("related_works", [])[:3]  # Limit to the first 3 related works
    for related_work_url in related_works:
        # Extract the last section of the work ID from the URL
        last_section = related_work_url.rsplit("/", 1)[-1]

        # Construct the related_work_api_url
        related_work_api_url = f"https://api.openalex.org/works/{last_section}"

        # Fetch details of the related work
        related_work_response = requests.get(related_work_api_url)

        try:
            related_work_response.raise_for_status()
            related_work_details = related_work_response.json()
        except Exception as e:
            related_work_titles.append("")
            continue

        # Extract title as a string and add to the list
        related_work_title = str(related_work_details.get("title", ""))
        related_work_titles.append(related_work_title)

    # Fetch ngrams data
    ngrams_url = f"https://api.openalex.org/works/{work_id}/ngrams"
    ngrams_response = requests.get(ngrams_url)

    try:
        ngrams_response.raise_for_status()
        ngrams_data = ngrams_response.json().get("ngrams", [])
    except Exception as e:
        ngrams_list.extend([""] * 3)
        ngrams_data = []

    # Check if ngrams_data is not empty before sorting
    if ngrams_data:
        # Sort ngrams_data by term_frequency in descending order
        ngrams_data.sort(key=lambda x: x.get("term_frequency", 0), reverse=True)

        # Select only the top 3 ngrams
        selected_ngrams = [ngram.get("ngram", "") for ngram in ngrams_data[:3]]
        ngrams_list.extend(selected_ngrams)

    return work_title, referenced_work_titles, related_work_titles, ngrams_list

import time

# Column layout of the final table. The three nearest_* columns stay empty
# here and are filled in by the later nearest-neighbour step.
columns = ["work_id", "title", "referenced_works", "related_work_titles", "ngrams", "nearest_related_work", "nearest_referenced_work", "nearest_ngrams_work"]

# The cursor is never advanced: every request draws a fresh unseeded random
# sample, so the same work can be returned more than once across requests.
cursor = "*"

max_data_cap = 20000
max_consecutive_failures = 5

collected_rows = []    # rows are gathered here and turned into a DataFrame once
seen_work_ids = set()  # guards against duplicates across random samples
consecutive_failures = 0

# Fetch random batches until the cap is reached or the API keeps failing.
while len(collected_rows) < max_data_cap:
    works_url = f"https://api.openalex.org/works?mailto=brookshum24@gmail.com&sample=10000&select=id&cursor={cursor}"

    try:
        works_response = requests.get(works_url, timeout=30)
        works_response.raise_for_status()
        works_data = works_response.json()
    except (requests.RequestException, ValueError) as e:
        print(e)
        works_data = None

    # Extract IDs from the results list
    work_ids = []
    if works_data is not None:
        work_ids = [work.get("id", "").rsplit("/", 1)[-1] for work in works_data.get("results", [])]

    if not work_ids:
        # Back off instead of retrying in a tight loop, and give up after
        # repeated failures so a persistent outage cannot hang the cell.
        consecutive_failures += 1
        if consecutive_failures >= max_consecutive_failures:
            print(f"Stopping after {consecutive_failures} consecutive failed requests")
            break
        time.sleep(2 ** consecutive_failures)
        continue
    consecutive_failures = 0

    print(f"Number of Work Objects to use: {len(work_ids)}")

    # Fetch and process referenced, related works, and ngrams for each work ID
    for work_id in work_ids:
        if len(collected_rows) >= max_data_cap:
            break
        if not work_id or work_id in seen_work_ids:
            continue
        seen_work_ids.add(work_id)

        work_title, referenced_work_titles, related_work_titles, ngrams_list = fetch_extract_works(work_id)
        if work_title is None:
            # Failed lookup or untitled work: there is no anchor text to pair.
            continue

        collected_rows.append({
            "work_id": work_id,
            "title": work_title,
            "referenced_works": referenced_work_titles,
            "related_work_titles": related_work_titles,
            "ngrams": ngrams_list,
        })

        # Progress indicator
        print(f"Dataframe Progress: Added {len(collected_rows)} out of {max_data_cap}")

# Build the frame once; growing it row by row with pd.concat copies all
# existing rows on every append.
wrangled_work_df = pd.DataFrame(collected_rows, columns=columns)

Number of Work Objects to use: 25
Dataframe Progress: Added 1 out of 20000
Dataframe Progress: Added 2 out of 20000
Dataframe Progress: Added 3 out of 20000
Dataframe Progress: Added 4 out of 20000
Dataframe Progress: Added 5 out of 20000
Dataframe Progress: Added 6 out of 20000
Dataframe Progress: Added 7 out of 20000
Dataframe Progress: Added 8 out of 20000
Dataframe Progress: Added 9 out of 20000
Dataframe Progress: Added 10 out of 20000
Dataframe Progress: Added 11 out of 20000
Dataframe Progress: Added 12 out of 20000
Dataframe Progress: Added 13 out of 20000
Dataframe Progress: Added 14 out of 20000
Dataframe Progress: Added 15 out of 20000
Dataframe Progress: Added 16 out of 20000
Dataframe Progress: Added 17 out of 20000
Dataframe Progress: Added 18 out of 20000
Dataframe Progress: Added 19 out of 20000
Dataframe Progress: Added 20 out of 20000
Dataframe Progress: Added 21 out of 20000
Dataframe Progress: Added 22 out of 20000
Dataframe Progress: Added 23 out of 20000
Dataframe

#### Check that the dataframe has the expected number of rows

In [4]:
# Display the number of rows collected in wrangled_work_df
len(wrangled_work_df)

20000

#### Store the values in Sqlite3 Database (to avoid hitting the Rate limit and redundant API calls)

In [5]:
import sqlite3

# Serialise the list-valued columns to JSON text so they can be stored in SQLite
for list_column in ("referenced_works", "related_work_titles", "ngrams"):
    wrangled_work_df[list_column] = wrangled_work_df[list_column].apply(json.dumps)

# The nearest_* columns have not been computed yet and hold NaN at this point.
# json.dumps(NaN) would store the literal text 'NaN', which later passes the
# `length(...) > 0` filters and is treated as a real value. Store an empty
# string for anything that is not already text.
for nearest_column in ("nearest_related_work", "nearest_referenced_work", "nearest_ngrams_work"):
    wrangled_work_df[nearest_column] = wrangled_work_df[nearest_column].apply(
        lambda value: value if isinstance(value, str) else ""
    )

# Connect to SQLite
works_db = sqlite3.connect("works.db")
try:
    # Replace any existing table with the current DataFrame contents
    wrangled_work_df.to_sql('works_table', works_db, index=False, if_exists='replace')

    # Save (commit) the changes
    works_db.commit()
finally:
    # Close the connection even if the write fails
    works_db.close()

### Use the Voyage API to Cluster and find Top-k values

##### Set the environment

In [6]:
import os
import voyageai
# The API key must come from the environment and never be committed with the
# notebook. A key that has appeared in source should be treated as leaked and
# revoked.
voyage_api_key = os.environ.get("VOYAGE_API_KEY")
if not voyage_api_key:
    raise RuntimeError("VOYAGE_API_KEY is not set; export it before running this cell.")
vo = voyageai.Client(api_key=voyage_api_key)

**Nearest neighbor Search:**   
##### We can find the few document embeddings closest to a query embedding, ranked by cosine similarity,
##### and retrieve the corresponding documents using the `k_nearest_neighbors` function.

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def k_nearest_neighbors(query_embedding, documents_embeddings, k=5):
    """Rank document embeddings by cosine similarity to a query embedding.

    Returns a pair ``(embeddings, indices)``: the ``k`` most similar document
    embeddings as plain lists, and their row indices in
    ``documents_embeddings``, both ordered from most to least similar.
    """
    doc_matrix = np.array(documents_embeddings)
    # cosine_similarity expects 2-D inputs, so the query becomes a 1 x n row.
    query_row = np.array(query_embedding).reshape(1, -1)

    similarities = cosine_similarity(query_row, doc_matrix)[0]

    # argsort is ascending; reverse it to get the best matches first.
    ranked_indices = np.argsort(similarities)[::-1]
    top_k_related_indices = ranked_indices[:k]

    top_k_related_embeddings = [
        list(vector) for vector in doc_matrix[top_k_related_indices]
    ]
    return top_k_related_embeddings, top_k_related_indices

##### Perform a **$k$-nearest neighbors search ($k$-NN)** to find, for each query (title), its nearest (closest) document.
##### I will be using the **voyage-lite-02-instruct** model since this version is the best performing model for clustering tasks.

In [17]:
import sqlite3
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
import time

# Voyage model used for both the title (query) and candidate (document) embeddings
embedding_model = "voyage-lite-02-instruct"

# Pause after every `rate_limit_pause_every` rows as a rate-limit precaution.
# Set rate_limit_pause_seconds to 0 for quick test runs.
rate_limit_pause_every = 300
rate_limit_pause_seconds = 60


def _nearest_document(title_embedding, documents):
    """Return the entry of ``documents`` most similar to the title, or ``""``.

    Blank placeholder strings are dropped before embedding so they are neither
    sent to the embedding API nor returned as the nearest match.
    """
    candidates = [doc for doc in (documents or []) if isinstance(doc, str) and doc.strip()]
    if not candidates:
        return ""
    candidate_embeddings = vo.embed(candidates, model=embedding_model, input_type="document").embeddings
    _, nearest_indices = k_nearest_neighbors(title_embedding, candidate_embeddings, k=1)
    return candidates[nearest_indices[0]]


# Connect to the SQLite database
works_db = sqlite3.connect("works.db")
work_cursor = works_db.cursor()

try:
    # Name the columns explicitly so the unpacking below cannot drift out of
    # step with the table's column order.
    query = "SELECT work_id, title, referenced_works, related_work_titles, ngrams FROM works_table"
    work_cursor.execute(query)
    results = work_cursor.fetchall()

    update_query = "UPDATE works_table SET nearest_related_work = ?, nearest_referenced_work = ?, nearest_ngrams_work = ? WHERE work_id = ?"

    for index, (work_id, title, referenced_works_json, related_work_titles_json, ngrams_json) in enumerate(results):
        try:
            nearest_referenced_work = ""
            nearest_related_work = ""
            nearest_ngrams_work = ""

            # Without a title there is no query to embed; the row is written
            # back with empty values so the later clean-up can remove it.
            if title:
                title_embedding = vo.embed([title], model=embedding_model, input_type="query").embeddings[0]

                nearest_referenced_work = _nearest_document(title_embedding, json.loads(referenced_works_json))
                nearest_related_work = _nearest_document(title_embedding, json.loads(related_work_titles_json))
                nearest_ngrams_work = _nearest_document(title_embedding, json.loads(ngrams_json))

            # Commit per row so completed work survives an interrupted run
            work_cursor.execute(update_query, (nearest_related_work, nearest_referenced_work, nearest_ngrams_work, work_id))
            works_db.commit()

        except Exception as e:
            # Best effort: log the failure and move on to the next row.
            print(f"Error processing work_id {work_id}: {e}")

        # Show the progress of the nearest-neighbour extraction
        print(f"Extraction progress at {index + 1} out of {len(results)}")

        # Only announce a pause when one actually happens
        if rate_limit_pause_seconds and (index + 1) % rate_limit_pause_every == 0:
            print(f"Sleeping for {rate_limit_pause_seconds} seconds at index {index + 1}...")
            time.sleep(rate_limit_pause_seconds)
finally:
    # Close the cursor and connection even if the run is interrupted
    work_cursor.close()
    works_db.close()

Extraction progress at 1 out of 20000
Extraction progress at 2 out of 20000
Extraction progress at 3 out of 20000
Extraction progress at 4 out of 20000
Extraction progress at 5 out of 20000
Extraction progress at 6 out of 20000
Extraction progress at 7 out of 20000
Extraction progress at 8 out of 20000
Extraction progress at 9 out of 20000
Extraction progress at 10 out of 20000
Extraction progress at 11 out of 20000
Extraction progress at 12 out of 20000
Extraction progress at 13 out of 20000
Extraction progress at 14 out of 20000
Extraction progress at 15 out of 20000
Extraction progress at 16 out of 20000
Extraction progress at 17 out of 20000
Extraction progress at 18 out of 20000
Extraction progress at 19 out of 20000
Extraction progress at 20 out of 20000
Extraction progress at 21 out of 20000
Extraction progress at 22 out of 20000
Extraction progress at 23 out of 20000
Extraction progress at 24 out of 20000
Extraction progress at 25 out of 20000
Extraction progress at 26 out of 2

#### Count the rows with empty nearest neighbors, since no nearest-neighbor values could be calculated for these rows

In [8]:
import sqlite3

# Open the works database and get a cursor on it
works_db = sqlite3.connect("works.db")
work_cursor = works_db.cursor()

# Count the rows whose three nearest_* fields are all empty strings
query = "SELECT COUNT(*) FROM works_table WHERE nearest_related_work = '' AND nearest_referenced_work = '' AND nearest_ngrams_work = ''"
result = work_cursor.execute(query).fetchone()

# Report the count (first and only column of the single result row)
print(f"Count of rows where all three fields are empty: {result[0]}")

# Release the cursor and the connection
work_cursor.close()
works_db.close()

Count of rows where all three fields are empty: 4052


#### Delete the empty rows since they are not useful for our use case

In [10]:
import sqlite3

# Open the works database and get a cursor on it
works_db = sqlite3.connect("works.db")
work_cursor = works_db.cursor()

# Remove the rows whose three nearest_* fields are all empty strings
delete_query = "DELETE FROM works_table WHERE nearest_related_work = '' AND nearest_referenced_work = '' AND nearest_ngrams_work = ''"
deleted_row_count = work_cursor.execute(delete_query).rowcount

# Persist the deletion
works_db.commit()

# Report how many rows the DELETE statement removed
print(f"Number of rows deleted: {deleted_row_count}")

# Release the cursor and the connection
work_cursor.close()
works_db.close()

Number of rows deleted: 4052


#### **Make a training pair** using the column values from the table. Using the calculated 15948 rows.

#### Title Referenced Work JSON

In [13]:
import json
import sqlite3

# Connect to the SQLite database
works_db = sqlite3.connect("works.db")
work_cursor = works_db.cursor()

# Select only the two columns that form a pair, by name, so the result does
# not depend on the table's column order. Rows without a title are skipped,
# as are rows whose nearest value is empty or still the 'NaN' placeholder
# left behind when a row was never processed.
query = (
    "SELECT title, nearest_referenced_work FROM works_table "
    "WHERE title IS NOT NULL AND length(title) > 0 "
    "AND nearest_referenced_work IS NOT NULL "
    "AND nearest_referenced_work NOT IN ('', 'NaN')"
)

# Define the path for the JSONL file
jsonl_file_path = "title_referenced_work.jsonl"

try:
    work_cursor.execute(query)

    # Overwrite the JSONL file with one {"texts": [anchor, positive]} line per row
    with open(jsonl_file_path, "w") as jsonl_file:
        for anchor_text, positive_text in work_cursor:
            positive_pairs = {"texts": [anchor_text, positive_text]}
            jsonl_file.write(json.dumps(positive_pairs) + "\n")
finally:
    # Close the cursor and connection even if the export fails
    work_cursor.close()
    works_db.close()

#### Title Related Work JSON

In [15]:
import json
import sqlite3

# Connect to the SQLite database
works_db = sqlite3.connect("works.db")
work_cursor = works_db.cursor()

# Select only the two columns that form a pair, by name, so the result does
# not depend on the table's column order. Rows without a title are skipped,
# as are rows whose nearest value is empty or still the 'NaN' placeholder
# left behind when a row was never processed. The export is capped at 10k
# rows because related-work data is plentiful.
query = (
    "SELECT title, nearest_related_work FROM works_table "
    "WHERE title IS NOT NULL AND length(title) > 0 "
    "AND nearest_related_work IS NOT NULL "
    "AND nearest_related_work NOT IN ('', 'NaN') "
    "LIMIT 10000"
)

# Define the path for the JSONL file
jsonl_file_path = "title_related_work.jsonl"

try:
    work_cursor.execute(query)

    # Overwrite the JSONL file with one {"texts": [anchor, positive]} line per row
    with open(jsonl_file_path, "w") as jsonl_file:
        for anchor_text, positive_text in work_cursor:
            positive_pairs = {"texts": [anchor_text, positive_text]}
            jsonl_file.write(json.dumps(positive_pairs) + "\n")
finally:
    # Close the cursor and connection even if the export fails
    work_cursor.close()
    works_db.close()

#### Title Ngrams Work JSON

In [16]:
import json
import sqlite3

# Connect to the SQLite database
works_db = sqlite3.connect("works.db")
work_cursor = works_db.cursor()

# Select only the two columns that form a pair, by name, so the result does
# not depend on the table's column order. Rows without a title are skipped,
# as are rows whose nearest value is empty or still the 'NaN' placeholder
# left behind when a row was never processed.
query = (
    "SELECT title, nearest_ngrams_work FROM works_table "
    "WHERE title IS NOT NULL AND length(title) > 0 "
    "AND nearest_ngrams_work IS NOT NULL "
    "AND nearest_ngrams_work NOT IN ('', 'NaN')"
)

# Define the path for the JSONL file
jsonl_file_path = "title_ngrams_work.jsonl"

try:
    work_cursor.execute(query)

    # Overwrite the JSONL file with one {"texts": [anchor, positive]} line per row
    with open(jsonl_file_path, "w") as jsonl_file:
        for anchor_text, positive_text in work_cursor:
            positive_pairs = {"texts": [anchor_text, positive_text]}
            jsonl_file.write(json.dumps(positive_pairs) + "\n")
finally:
    # Close the cursor and connection even if the export fails
    work_cursor.close()
    works_db.close()

### Topics

#### 4. Training pairs for Topic display name and keywords

In [9]:
import requests
import json
import time

def fetch_and_store_data(url, output_file): 
    # Fetch data from the URL
    response = requests.get(url)
    data = response.json()

    # Extract relevant information and store in the desired format
    results = data.get("results", [])
    formatted_data = []

    for result in results:
        display_name = result.get("display_name", "")
        keywords = result.get("keywords", [])
        
        if keywords:
            # Extracting the top-scored keyword text
            top_keyword = keywords[0]        
            # Creating the structure {"texts": ["display_name", "key"]}
            formatted_entry = {"texts": [display_name, top_keyword]}
            formatted_data.append(formatted_entry)

    # Write to a .jsonl file
    with open(output_file, "a") as jsonl_file:
        for entry in formatted_data:
            jsonl_file.write(json.dumps(entry) + "\n")

if __name__ == "__main__":
    # Initialize counters
    current_page = 1
    total_entries_count = 0
    target_count = 10000
    # Specify the URL and output file
    output_jsonl_file = "topic_name_key.jsonl"

    # Loop until the target count is reached
    while total_entries_count < target_count: 
        api_url = f"https://api.openalex.org/topics?mailto=brookshum24@gmail.com&page={current_page}&per_page=25&select=display_name,keywords"
        # Fetch and store the data
        fetch_and_store_data(api_url, output_jsonl_file)
        current_page +=1
        total_entries_count += 25
    print(f"Total entries written to {output_jsonl_file}: {total_entries_count}")

Total entries with keywords: 10000
Total entries written to topic_name_key.jsonl: 10000


#### 5. Triplet pairs on Topics

#### Triplet **display name , domain , field**  *anticipated* = [Soft-negative]

In [10]:
import requests
import json
import time

def fetch_and_store_topics(url, output_file): 
    # Fetch data from the URL
    response = requests.get(url)
    data = response.json()

    # Extract relevant information and store in the desired format
    results = data.get("results", [])
    formatted_data = []

    for result in results:
        display_name = result.get("display_name", "")
        domain = result.get("domain", {}).get("display_name", "")
        field = result.get("field", {}).get("display_name", "")
        
        # Create the triplet {"texts": ["display_name", "domain", "field"]}
        formatted_entry = {"texts": [display_name, domain, field]}
        formatted_data.append(formatted_entry)

    # Write to a .jsonl file
    with open(output_file, "a") as jsonl_file:
        for entry in formatted_data:
            jsonl_file.write(json.dumps(entry) + "\n")

if __name__ == "__main__":    
    # Initialize counters
    current_page = 1
    total_entries_count = 0
    target_count = 10000
    # Specify the URL and output file
    output_jsonl_file = "name_domain_field.jsonl"

    # Loop until the target count is reached
    while total_entries_count < target_count: 
        api_url = f"https://api.openalex.org/topics?mailto=brookshum24@gmail.com&page={current_page}&per_page=25&select=display_name,domain,field"
        
        # Fetch and store the data
        fetch_and_store_data(api_url, output_jsonl_file)
        current_page +=1
        total_entries_count += 25
    print(f"Total entries written to {output_jsonl_file}: {total_entries_count}")

Total entries written to name_domain_field.jsonl: 10000


#### Triplet **display name , domain , sub-field**  *anticipated* = [Hard-negative]

In [11]:
def fetch_and_store_topics(url, output_file): 
    # Fetch data from the URL
    response = requests.get(url)
    data = response.json()

    # Extract relevant information and store in the desired format
    results = data.get("results", [])
    formatted_data = []

    for result in results:
        display_name = result.get("display_name", "")
        domain = result.get("domain", {}).get("display_name", "")
        subfield = result.get("subfield", {}).get("display_name", "")
        
        # Create the triplet {"texts": ["display_name", "domain", "subfield"]}
        formatted_entry = {"texts": [display_name, domain, subfield]}
        formatted_data.append(formatted_entry)

     # Write to a .jsonl file
     with open(output_file, "a") as jsonl_file:
            for entry in formatted_data:
                jsonl_file.write(json.dumps(entry) + "\n") 

if __name__ == "__main__":
    # Initialize counters
    current_page = 1
    total_entries_count = 0
    target_count = 10000
    # Specify the URL and output file
    output_jsonl_file = "name_domain_subfield.jsonl"


    # Loop until the target count is reached
    while total_entries_count < target_count: 
        api_url = f"https://api.openalex.org/topics?mailto=brookshum24@gmail.com&page={current_page}&per_page=25&select=display_name,domain,subfield"
        
        # Fetch and store the data
        fetch_and_store_data(api_url, output_jsonl_file)
        current_page +=1
        total_entries_count += 25
    print(f"Total entries written to {output_jsonl_file}: {total_entries_count}")

Total entries written to name_domain_subfield.jsonl: 10000


#### Triplet **display name , field, sub-field**  *anticipated* = [Hard-negative]

In [2]:
def fetch_and_store_topics(url, output_file): 
    # Fetch data from the URL
    response = requests.get(url)
    data = response.json()
    # Extract relevant information and store in the desired format
    results = data.get("results", [])
    formatted_data = []

    for result in results:
        display_name = result.get("display_name", "")
        field = result.get("field", {}).get("display_name", "")
        subfield = result.get("subfield", {}).get("display_name", "")

        # Create the triplet {"texts": ["display_name", "field", "subfield"]}
        formatted_entry = {"texts": [display_name, field, subfield]}
        formatted_data.append(formatted_entry)

    # Write to a .jsonl file
    with open(output_file, "a") as jsonl_file:
        for entry in formatted_data:
            jsonl_file.write(json.dumps(entry) + "\n") 

if __name__ == "__main__":
    # Initialize counters
    current_page = 1
    total_entries_count = 0
    target_count = 10000
    # Specify the URL and output file
    output_jsonl_file = "name_field_subfield.jsonl"



    # Loop until the target count is reached
    while total_entries_count < target_count: 
        api_url = f"https://api.openalex.org/topics?mailto=brookshum24@gmail.com&page={current_page}&per_page=25&select=display_name,field, subfield"
        
        # Fetch and store the data
        fetch_and_store_data(api_url, output_jsonl_file)
        current_page +=1
        total_entries_count += 25
    print(f"Total entries written to {output_jsonl_file}: {total_entries_count}")

Total entries written to name_field_subfield.jsonl: 10000
