#### 1. Get the Title - Key positive pairs

In [None]:
import requests
import json
import time

def fetch_and_store_data(url, output_file, target_count=10, delay=1, session=None):
    """Append title / first-keyword positive pairs from a paged API to a JSONL file.

    Pages are requested as ``url + "&page=<n>"`` starting at page 1. Every
    result with a non-empty ``keywords`` list produces one line of the form
    ``{"texts": [title, keyword]}``, where ``keyword`` is the ``"keyword"``
    field of the first list item (presumably the top-scored one; confirm
    against the API's ordering).

    Args:
        url: Base query URL. It must already contain a query string, because
            the page number is appended with ``&page=``.
        output_file: Path of the JSONL file to append to.
        target_count: Maximum number of entries to write.
        delay: Seconds to wait between two consecutive requests.
        session: Optional object exposing ``get(url)`` (for example a
            ``requests.Session``). The ``requests`` module is used when omitted.

    Returns:
        The number of entries written to ``output_file``.

    Raises:
        Whatever ``raise_for_status()`` raises for an unsuccessful response.
    """
    http = requests if session is None else session

    total_keywords_count = 0
    total_entries_count = 0
    page_number = 1

    with open(output_file, "a", encoding="utf-8") as jsonl_file:
        while total_entries_count < target_count:
            response = http.get(url + f"&page={page_number}")
            response.raise_for_status()
            results = response.json().get("results", [])

            # An empty page means the API has nothing more to give; without
            # this check the loop would keep requesting pages forever.
            if not results:
                break

            for result in results:
                # Never write more than the caller asked for.
                if total_entries_count >= target_count:
                    break

                keywords = result.get("keywords") or []
                if not keywords:
                    continue
                total_keywords_count += 1

                title = result.get("title", "")
                top_keyword = keywords[0].get("keyword", "")
                jsonl_file.write(json.dumps({"texts": [title, top_keyword]}) + "\n")
                total_entries_count += 1

            page_number += 1

            # Pause only when another request will follow (API rate limit).
            if total_entries_count < target_count:
                time.sleep(delay)

    print(f"Total entries with keywords: {total_keywords_count}")
    print(f"Total entries written to {output_file}: {total_entries_count}")
    return total_entries_count

if __name__ == "__main__":
    # Destination file for the title/keyword pairs
    output_jsonl_file = "title_key.jsonl"

    # Sampled works query that selects only the title and keywords fields
    api_url = "https://api.openalex.org/works?mailto=brookshum24@gmail.com&sample=2&per-page=2&select=title,keywords"

    # Run the download with the function's default target count and delay
    fetch_and_store_data(url=api_url, output_file=output_jsonl_file)

#### 2. Get the Title - Abstract positive pairs

In [None]:
import requests
import json
import time

def fetch_and_store_data(url, output_file, target_count=3, delay=0, session=None,
                         max_stale_fetches=3):
    """Append title / abstract positive pairs from an API query to a JSONL file.

    The same ``url`` is fetched repeatedly. Each result that carries an
    ``abstract_inverted_index`` produces one line of the form
    ``{"texts": [title, abstract]}``. The abstract is rebuilt from the
    inverted index in word-position order, so repeated words are kept.

    A pair already written during this call is not written again, and the
    loop gives up after ``max_stale_fetches`` consecutive fetches that add
    nothing new. This matters for seeded queries, which return the same
    results on every request.

    Args:
        url: Query URL, fetched as-is on every iteration.
        output_file: Path of the JSONL file to append to.
        target_count: Maximum number of entries to write.
        delay: Seconds to wait after each request.
        session: Optional object exposing ``get(url)`` (for example a
            ``requests.Session``). The ``requests`` module is used when omitted.
        max_stale_fetches: Number of consecutive fetches without a new entry
            after which the loop stops.

    Returns:
        The number of entries written to ``output_file``.

    Raises:
        Whatever ``raise_for_status()`` raises for an unsuccessful response.
    """
    http = requests if session is None else session

    total_entries_count = 0
    seen_pairs = set()
    stale_fetches = 0

    with open(output_file, "a", encoding="utf-8") as jsonl_file:
        while total_entries_count < target_count and stale_fetches < max_stale_fetches:
            response = http.get(url)
            response.raise_for_status()
            data = response.json()
            print("data ->", data)

            written_before = total_entries_count
            for result in data.get("results", []):
                if total_entries_count >= target_count:
                    break

                abstract_index = result.get("abstract_inverted_index")
                # Works without an abstract cannot form a pair.
                if abstract_index is None:
                    continue

                title = result.get("title", "")
                abstract_text = _abstract_from_inverted_index(abstract_index)

                pair = (title, abstract_text)
                if pair in seen_pairs:
                    continue
                seen_pairs.add(pair)

                jsonl_file.write(json.dumps({"texts": [title, abstract_text]}) + "\n")
                total_entries_count += 1

            # Count fetches that contributed nothing so a repeating sample
            # cannot keep the loop alive forever.
            if total_entries_count == written_before:
                stale_fetches += 1
            else:
                stale_fetches = 0

            time.sleep(delay)

    print(f"Total entries written to {output_file}: {total_entries_count}")
    return total_entries_count


def _abstract_from_inverted_index(inverted_index):
    """Rebuild the text from a ``{word: [positions]}`` inverted index."""
    positioned_words = sorted(
        (position, word)
        for word, positions in inverted_index.items()
        for position in positions
    )
    return " ".join(word for _, word in positioned_words)

if __name__ == "__main__":
    # Destination file for the title/abstract pairs
    output_jsonl_file = "title_abstract.jsonl"

    # Seeded sample query that selects only the title and the abstract inverted index
    api_url = "https://api.openalex.org/works?seed=3&sample=3&select=title,abstract_inverted_index"

    # Run the download with the function's default target count and delay
    fetch_and_store_data(url=api_url, output_file=output_jsonl_file)

data -> {'meta': {'count': 3, 'db_response_time_ms': 1025, 'page': 1, 'per_page': 25, 'groups_count': None}, 'results': [{'title': 'Minimum Dissipated Power for Linear and Nonlinear Electric Circuits', 'abstract_inverted_index': None}, {'title': 'SunTrust Banks, Inc.', 'abstract_inverted_index': {'Abstract': [0], 'Commercial': [1], 'Banking': [2], '(MIC:': [3], '8.1': [4], 'SIC:': [5], '6021': [6], 'NAIC:': [7], '522110)': [8], 'SunTrust': [9, 19], 'Banks': [10], 'is': [11], 'a': [12], 'financial': [13], 'services': [14, 44], 'holding': [15], 'company.': [16], 'Through': [17, 31], 'its': [18, 32, 68], 'Bank': [20], 'subsidiary,': [21], 'Co.': [22, 35, 48, 66, 90], 'provides': [23, 36], 'deposit,': [24], 'credit,': [25], 'and': [26, 28, 45, 61, 76, 80, 82, 97], 'trust': [27], 'investment': [29], 'services.': [30], 'other': [33], 'subsidiaries,': [34], 'banking,': [37], 'asset': [38], 'management,': [39], "securities'": [40], 'brokerage,': [41], 'capital': [42], 'market': [43], 'credit‐r

#### 3. Get the expanded referenced works, related works and n-grams from a Work object and save them as a DataFrame [max 3 n-grams]

In [None]:
import requests
import pandas as pd

# Function to fetch and process referenced works
def fetch_referenced_works(work_id, session=None):
    """Fetch a work's title, its referenced/related work titles and its top n-grams.

    Args:
        work_id: OpenAlex work identifier such as ``"W2961049121"``.
        session: Optional object exposing ``get(url)`` (for example a
            ``requests.Session``). The ``requests`` module is used when omitted.

    Returns:
        A tuple ``(work_title, referenced_work_titles, related_work_titles,
        ngrams)``. The two title lists contain one string per linked work,
        with ``""`` standing in for a missing title. ``ngrams`` holds the
        ``"ngram"`` text of the three entries with the highest
        ``term_frequency``.
    """
    http = requests if session is None else session

    work_details = _get_json(http, f"https://api.openalex.org/works/{work_id}")
    work_title = work_details.get("title")

    referenced_work_titles = _fetch_work_titles(http, work_details.get("referenced_works") or [])
    related_work_titles = _fetch_work_titles(http, work_details.get("related_works") or [])

    ngrams_data = _get_json(http, f"https://api.openalex.org/works/{work_id}/ngrams").get("ngrams") or []

    # Keep only the three most frequent n-grams.
    top_ngrams = sorted(ngrams_data, key=lambda x: x.get("term_frequency", 0), reverse=True)[:3]
    ngrams = [ngram.get("ngram", "") for ngram in top_ngrams]

    return work_title, referenced_work_titles, related_work_titles, ngrams


def _get_json(http, url):
    """GET ``url`` and return the decoded JSON object, or ``{}`` if the body is not a JSON object."""
    response = http.get(url)
    try:
        payload = response.json()
    except ValueError:
        # A non-JSON body (for example an HTML error page) is treated as "no data".
        return {}
    return payload if isinstance(payload, dict) else {}


def _fetch_work_titles(http, work_urls):
    """Return the title of every work in ``work_urls`` (``""`` when a title is missing)."""
    titles = []
    for work_url in work_urls:
        # The work id is the last path segment of the work URL.
        linked_work_id = work_url.rsplit("/", 1)[-1]
        details = _get_json(http, f"https://api.openalex.org/works/{linked_work_id}")
        # `or ""` keeps a null title from being stored as the string "None".
        titles.append(str(details.get("title") or ""))
    return titles

# Columns of the wrangled table. The three "nearest_*" columns are not
# populated in this cell and therefore start out as missing values.
columns = ["work_id", "title", "referenced_works", "related_work_titles", "ngrams", "nearest_related_work", "nearest_referenced_work", "nearest_ngrams_work"]

# Sample Work ID
work_id = "W2961049121"

# Fetch and process referenced, related works, and ngrams
work_title, referenced_work_titles, related_work_titles, ngrams = fetch_referenced_works(work_id)

# Build the one-row DataFrame directly instead of concatenating onto an empty
# frame. Passing `columns` adds the not-yet-filled columns as missing values.
wrangled_work_df = pd.DataFrame(
    [{
        "work_id": work_id,
        "title": work_title,
        "referenced_works": referenced_work_titles,
        "related_work_titles": related_work_titles,
        "ngrams": ngrams,
    }],
    columns=columns,
)

# Show how many rows were collected
print(len(wrangled_work_df))

1


In [None]:
# Display the wrangled DataFrame as the cell output
wrangled_work_df

Unnamed: 0,work_id,title,referenced_works,related_work_titles,ngrams,nearest_related_work,nearest_referenced_work,nearest_ngrams_work
0,W2961049121,Shape memory nanocomposite fibers for untether...,[Hierarchically buckled sheath-core fibers for...,[Shape Memory and Superelastic Alloys : Techno...,"[fiber, high, temperature]",,,


In [None]:
# Show the ngrams cell of the first row
wrangled_work_df["ngrams"][0]

['fiber', 'high', 'temperature']



```
# Ngrams with their full representations look like these values.

[{'ngram': 'fiber',
  'ngram_count': 80,
  'ngram_tokens': 1,
  'term_frequency': 0.025502072043353523},
 {'ngram': 'high',
  'ngram_count': 32,
  'ngram_tokens': 1,
  'term_frequency': 0.010200828817341408},
 {'ngram': 'temperature',
  'ngram_count': 31,
  'ngram_tokens': 1,
  'term_frequency': 0.00988205291679949}
]
```



In [None]:
# NOTE(review): the `sqlite3` module imported by the following cells is part of
# the Python standard library; this third-party package is presumably not
# required for it - confirm before keeping the install.
! pip install db-sqlite3

Collecting db-sqlite3
  Downloading db-sqlite3-0.0.1.tar.gz (1.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting db (from db-sqlite3)
  Downloading db-0.1.1.tar.gz (3.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting antiorm (from db->db-sqlite3)
  Downloading antiorm-1.2.1.tar.gz (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: db-sqlite3, db, antiorm
  Building wheel for db-sqlite3 (setup.py) ... [?25l[?25hdone
  Created wheel for db-sqlite3: filename=db_sqlite3-0.0.1-py3-none-any.whl size=1770 sha256=da8fdd8c2bea1084076bf72ee09ac2405153c099c63fce37d69401526e1d81b8
  Stored in directory: /root/.cache/pip/wheels/a6/b7/83/e941e0a0e04f417982e718ae7295d1e82b5f2863a1c51edd71
  Building wheel for db (setup.py) ... [?25l[?25hdone
  Created wheel for db: filename=db-0.1.

In [None]:
# Inspect the dtype of every column of the wrangled DataFrame.
# NOTE(review): the recorded output is a NameError, so the cell that builds
# `wrangled_work_df` had presumably not been run in that session - run it first.
wrangled_work_df.dtypes

NameError: name 'wrangled_work_df' is not defined

#### Fetch the stored rows and decode the list columns with `json.loads`

In [None]:
import sqlite3
import json
from contextlib import closing

# Name the columns explicitly so the unpacking below cannot drift from the
# table layout.
query = "SELECT work_id, title, referenced_works, related_work_titles, ngrams FROM works_table"

# `closing` releases the cursor and the connection even if the query fails.
with closing(sqlite3.connect("works.db")) as works_db, closing(works_db.cursor()) as work_cursor:
    work_cursor.execute(query)
    results = work_cursor.fetchall()

# Iterate over the results
for result in results:
    (work_id, title,
     referenced_works_json,
     related_work_titles_json,
     ngrams_json) = result

    # Parse JSON strings back to Python objects
    referenced_works = json.loads(referenced_works_json)
    related_work_titles = json.loads(related_work_titles_json)
    ngrams = json.loads(ngrams_json)

    # The element type is only defined when the list has at least one item.
    related_title_type = type(related_work_titles[0]) if related_work_titles else None

    # Now you have the data as lists of strings
    print("Work ID:", work_id)
    print("Title:", title)
    print("Referenced Works:", referenced_works, type(referenced_works))
    print("Related Work Titles:", related_work_titles, related_title_type)
    print("Ngrams:", ngrams)
    print("\n")

Work ID: W2961049121
Title: Shape memory nanocomposite fibers for untethered high-energy microengines
Referenced Works: ['Hierarchically buckled sheath-core fibers for superelastic electronics, sensors, and muscles', '8—THE MEASUREMENT OF TORSIONAL RELAXATION IN TEXTILE FIBRES', 'Quantifying the Shape-Memory Effect of Polymers by Cyclic Thermomechanical Tests', 'Fibers Do the Twist', 'A micro rotary actuator using shape memory alloys', 'Scalable process for the spinning of PVA–carbon nanotube composite fibers', 'Shape and Temperature Memory of Nanocomposites with Broadened Glass Transition', 'Bacterial flagellar motor', 'Performance test and improvement of piezoelectric torsional actuators', 'Temperature‐Memory Polymer Networks with Crystallizable Controlling Units', 'Hybrid carbon nanotube yarn artificial muscle inspired by spider dragline silk', 'Differential scanning calorimetry of crystallized PVA hydrogels', 'How polymers lose memory with age', 'Alginate/graphene oxide fibers with

#### **Now** expand the above code to iterate over the batch of sampled data

In [None]:
import requests
import json
import pandas as pd

# Function to fetch and process referenced works
def fetch_referenced_works(work_id, max_items=3, session=None):
    """Fetch a work's title plus a capped number of linked titles and n-grams.

    Args:
        work_id: OpenAlex work identifier such as ``"W4205227400"``.
        max_items: Upper bound applied separately to the referenced works,
            the related works and the n-grams that are returned.
        session: Optional object exposing ``get(url)`` (for example a
            ``requests.Session``). The ``requests`` module is used when omitted.

    Returns:
        A tuple ``(work_title, referenced_work_titles, related_work_titles,
        ngrams_list)``. The title lists cover the first ``max_items`` linked
        works, with ``""`` standing in for a missing title. ``ngrams_list``
        holds the ``"ngram"`` text of the ``max_items`` entries with the
        highest ``term_frequency``.
    """
    http = requests if session is None else session

    work_details = _get_json(http, f"https://api.openalex.org/works/{work_id}")
    work_title = work_details.get("title")

    # Slice before fetching so only `max_items` requests are made per list.
    referenced_works = (work_details.get("referenced_works") or [])[:max_items]
    related_works = (work_details.get("related_works") or [])[:max_items]

    referenced_work_titles = _fetch_work_titles(http, referenced_works)
    related_work_titles = _fetch_work_titles(http, related_works)

    ngrams_data = _get_json(http, f"https://api.openalex.org/works/{work_id}/ngrams").get("ngrams") or []

    # Most frequent n-grams first, then keep the top `max_items`.
    ranked_ngrams = sorted(ngrams_data, key=lambda x: x.get("term_frequency", 0), reverse=True)
    ngrams_list = [ngram.get("ngram", "") for ngram in ranked_ngrams[:max_items]]

    return work_title, referenced_work_titles, related_work_titles, ngrams_list


def _get_json(http, url):
    """GET ``url`` and return the decoded JSON object, or ``{}`` if the body is not a JSON object."""
    response = http.get(url)
    try:
        payload = response.json()
    except ValueError:
        # A non-JSON body (for example an HTML error page) is treated as "no data".
        return {}
    return payload if isinstance(payload, dict) else {}


def _fetch_work_titles(http, work_urls):
    """Return the title of every work in ``work_urls`` (``""`` when a title is missing)."""
    titles = []
    for work_url in work_urls:
        # The work id is the last path segment of the work URL.
        linked_work_id = work_url.rsplit("/", 1)[-1]
        details = _get_json(http, f"https://api.openalex.org/works/{linked_work_id}")
        # `or ""` keeps a null title from being stored as the string "None".
        titles.append(str(details.get("title") or ""))
    return titles

# Columns of the wrangled table. The three "nearest_*" columns are not
# populated in this cell and therefore start out as missing values.
columns = ["work_id", "title", "referenced_works", "related_work_titles", "ngrams", "nearest_related_work", "nearest_referenced_work", "nearest_ngrams_work"]

# Fetch the sample work IDs
works_url = "https://api.openalex.org/works?sample=10&seed=3&select=id"
works_response = requests.get(works_url)
works_data = works_response.json()

# Extract IDs from the results list (the ID is the last path segment of the
# work URL). Results without an id are skipped, because an empty id would
# request the works listing instead of a single work.
work_ids = [
    work["id"].rsplit("/", 1)[-1]
    for work in works_data.get("results", [])
    if work.get("id")
]

# work_ids has the form ['W4205227400', 'W2015602200'...]

# Collect one record per work and build the DataFrame once at the end.
# Concatenating inside the loop would copy the whole frame on every iteration.
work_rows = []
for work_id in work_ids:
    work_title, referenced_work_titles, related_work_titles, ngrams_list = fetch_referenced_works(work_id)
    work_rows.append({
        "work_id": work_id,
        "title": work_title,
        "referenced_works": referenced_work_titles,
        "related_work_titles": related_work_titles,
        "ngrams": ngrams_list,
    })

# Passing `columns` adds the not-yet-filled columns as missing values.
wrangled_work_df = pd.DataFrame(work_rows, columns=columns)

# Display the DataFrame
print(wrangled_work_df)

       work_id                                              title  \
0  W4205227400                               SunTrust Banks, Inc.   
1  W2015602200  A constant rounds group key agreement protocol...   
2  W2952852381  Equity investigation of attitudinal shifts in ...   
3  W2904069587                                Chiffres et lettres   
4  W2345552705  [Chemical control of rust (Uromyces phaseoli v...   
5  W3000345438        Modeling text embedded information cascades   
6  W2008986850  Estimation of mortality coefficients and survi...   
7  W4253524524  Universal Screening of SARS-CoV-2 of Oncology ...   
8  W4310782751  THE RELATIONSHIP OF CLINICAL AND MORPHOLOGICAL...   
9  W1592839950  The health of the school child? An historical ...   

                                    referenced_works  \
0                                                 []   
1  [Scalable Protocols for Authenticated Group Ke...   
2  [Modeling theory applied: Modeling Instruction...   
3               

#### Check that the DataFrame is in the proper form

In [None]:
# Preview the first rows of the wrangled DataFrame
wrangled_work_df.head()

Unnamed: 0,work_id,title,referenced_works,related_work_titles,ngrams,nearest_related_work,nearest_referenced_work,nearest_ngrams_work
0,W4205227400,"SunTrust Banks, Inc.",[],"[Regulation, Deregulation, Reregulation: The F...",[],,,
1,W2015602200,A constant rounds group key agreement protocol...,[Scalable Protocols for Authenticated Group Ke...,"[Authentication for distributed systems, Resea...","[1, key, ]",,,
2,W2952852381,Equity investigation of attitudinal shifts in ...,[Modeling theory applied: Modeling Instruction...,[Anticipating the inevitable: When leader and ...,[],,,
3,W2904069587,Chiffres et lettres,[],[],[],,,
4,W2345552705,[Chemical control of rust (Uromyces phaseoli v...,[],[],[],,,


In [None]:
# Number of referenced-work titles stored for the row labelled 1
len(wrangled_work_df["referenced_works"][1])

3

In [None]:
# Number of related-work titles stored for the row labelled 1
len(wrangled_work_df["related_work_titles"][1])

3

In [None]:
# Number of ngrams stored for the row labelled 1
len(wrangled_work_df["ngrams"][1])

3

#### Store the values in Sqlite3 Database

In [None]:
import json
import sqlite3
from contextlib import closing

# SQLite has no list type, so each of these columns is stored as a JSON string.
# The not-yet-filled "nearest_*" columns hold missing values, which
# `json.dumps` writes as the token NaN (`json.loads` reads it back).
for json_column in (
    'referenced_works',
    'related_work_titles',
    'ngrams',
    'nearest_related_work',
    'nearest_referenced_work',
    'nearest_ngrams_work',
):
    wrangled_work_df[json_column] = wrangled_work_df[json_column].apply(json.dumps)

# `closing` guarantees the connection is released even if the write fails.
with closing(sqlite3.connect("works.db")) as works_db:
    # Replace any existing table so that re-running the cell starts clean
    wrangled_work_df.to_sql('works_table', works_db, index=False, if_exists='replace')

    # Save (commit) the changes
    works_db.commit()

#### Fetch again

In [None]:
import sqlite3
import json
from contextlib import closing


def _decode_cell(value):
    """Decode a stored cell as JSON, falling back to the raw value.

    The "nearest_*" columns hold JSON right after the table is stored, but
    plain text once the nearest-neighbour step has updated them.
    """
    try:
        return json.loads(value)
    except (TypeError, ValueError):
        return value


# Select the columns by name. Unpacking `SELECT *` by position had the
# referenced/related "nearest" values swapped relative to the table layout.
query = (
    "SELECT work_id, title, referenced_works, related_work_titles, ngrams, "
    "nearest_referenced_work, nearest_related_work, nearest_ngrams_work "
    "FROM works_table"
)

# `closing` releases the cursor and the connection even if the query fails.
with closing(sqlite3.connect("works.db")) as works_db, closing(works_db.cursor()) as work_cursor:
    work_cursor.execute(query)
    results = work_cursor.fetchall()

# Iterate over the results
for result in results:
    (work_id,
     title,
     referenced_works_json,
     related_work_titles_json,
     ngrams_json,
     nearest_referenced_work_json,
     nearest_related_work_json,
     nearest_ngrams_work_json) = result

    # Parse JSON strings back to Python objects
    referenced_works = json.loads(referenced_works_json)
    related_work_titles = json.loads(related_work_titles_json)
    ngrams = json.loads(ngrams_json)
    nearest_referenced_work = _decode_cell(nearest_referenced_work_json)
    nearest_related_work = _decode_cell(nearest_related_work_json)
    nearest_ngrams_work = _decode_cell(nearest_ngrams_work_json)

    # Now you have the data as lists of strings
    print("Work ID:", work_id)
    print("Title:", title)
    print("Referenced Works:", referenced_works)
    print("Related Work Titles:", related_work_titles)
    print("Ngrams:", ngrams)
    print("Nearest Referenced Work Titles:", nearest_referenced_work)
    print("Nearest Related Work Titles:", nearest_related_work)
    print("Nearest Ngrams:", nearest_ngrams_work)
    print("\n")

Work ID: W4205227400
Title: SunTrust Banks, Inc.
Referenced Works: []
Related Work Titles: ['Regulation, Deregulation, Reregulation: The Future of the Banking, Insurance, and Securities Industries', 'Participation of Investment Banks and Non-Bank Financial Institutions in Syndicated Loans', 'Privacy Notices under the Gramm—Leach—Bliley Act']
Ngrams: []
Nearest Referenced Work Titles: nan
Nearest Related Work Titles: nan
Nearest Ngrams: nan


Work ID: W2015602200
Title: A constant rounds group key agreement protocol without using hash functions
Referenced Works: ['Scalable Protocols for Authenticated Group Key Exchange', 'Authenticated key agreement without using one-way hash functions', 'Remarks on unknown key-share attack on authenticated multiple-key agreement protocol']
Related Work Titles: ['Authentication for distributed systems', 'Research of AAA messages Based on 802.1x authentication', 'A Sidechain-Based Decentralized Authentication Scheme via Optimized Two-Way Peg Protocol for

### Use the Voyage API to cluster and find Top-k values

In [None]:
# Install the Voyage AI client library used by the embedding cells below
!pip install voyageai

Collecting voyageai
  Downloading voyageai-0.1.7-py3-none-any.whl (25 kB)
Collecting aiolimiter<2.0.0,>=1.1.0 (from voyageai)
  Downloading aiolimiter-1.1.0-py3-none-any.whl (7.2 kB)
Installing collected packages: aiolimiter, voyageai
Successfully installed aiolimiter-1.1.0 voyageai-0.1.7


#### Set the environment

In [None]:
import os
import getpass
import voyageai

# Never commit the API key to the notebook. Take it from the environment,
# or prompt for it when it has not been set.
# NOTE(review): the key that was previously hard-coded here should be treated
# as leaked and revoked.
if not os.environ.get("VOYAGE_API_KEY"):
    os.environ["VOYAGE_API_KEY"] = getpass.getpass("Voyage API key: ")

vo = voyageai.Client(api_key=os.environ["VOYAGE_API_KEY"])

#### Set the Document values [This is an example usage]

In [None]:
import sqlite3
import json
from contextlib import closing

# Fetch the sample documents
related_documents = []

# Select exactly the five columns unpacked below. `SELECT *` returns every
# column of the table, and unpacking more values than names raises ValueError.
query = "SELECT work_id, title, referenced_works, related_work_titles, ngrams FROM works_table LIMIT 1"

# `closing` releases the cursor and the connection even if the query fails.
with closing(sqlite3.connect("works.db")) as works_db, closing(works_db.cursor()) as work_cursor:
    work_cursor.execute(query)
    results = work_cursor.fetchall()

# Iterate over the results
for result in results:
    work_id, title, referenced_works_json, related_work_titles_json, ngrams_json = result

    # Parse JSON strings back to Python objects
    referenced_works = json.loads(referenced_works_json)
    related_work_titles = json.loads(related_work_titles_json)
    ngrams = json.loads(ngrams_json)

    # The related-work titles are the documents embedded in the next cells
    related_documents = related_work_titles

    # Now you have the data as lists of strings
    print("Work ID:", work_id)
    print("Title:", title)
    print("Referenced Works:", referenced_works)
    print("Related Work Titles:", related_work_titles)
    print("Ngrams:", ngrams)
    print("\n")

Work ID: W4205227400
Title: SunTrust Banks, Inc.
Referenced Works: []
Related Work Titles: ['Regulation, Deregulation, Reregulation: The Future of the Banking, Insurance, and Securities Industries', 'Participation of Investment Banks and Non-Bank Financial Institutions in Syndicated Loans', 'Privacy Notices under the Gramm—Leach—Bliley Act']
Ngrams: []




#### This model version performs better than other model versions specifically fine-tuned for clustering problems

In [None]:
# Embed the related-work titles as documents; the result holds one embedding per title.
# NOTE(review): `related_documents` is empty when the sampled row has no related works -
# confirm how the client handles an empty list.
related_documents_embeddings = vo.embed(related_documents, model="voyage-lite-02-instruct", input_type="document").embeddings

#### To find out the document that is most similar to the query among the existing data, we can first embed/vectorize the query:

In [None]:
# Embed the work title as the query; `[0]` takes the single embedding returned for the one-item list
title_embedding = vo.embed([title], model="voyage-lite-02-instruct", input_type="query").embeddings[0]

**Nearest neighbor search:** We can find the closest embeddings among the document embeddings based on cosine similarity, and retrieve the corresponding documents using the `k_nearest_neighbors` function defined below.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def k_nearest_neighbors(query_embedding, documents_embeddings, k=5):
    """Return the ``k`` document embeddings most similar to the query.

    Similarity is the cosine similarity, computed with NumPy. A zero vector
    is treated as having similarity 0 to everything.

    Args:
        query_embedding: 1-D sequence of floats.
        documents_embeddings: Sequence of embeddings, one per document, each
            of the same length as ``query_embedding``.
        k: Number of neighbours to return. Fewer are returned when there are
            fewer documents.

    Returns:
        A tuple ``(top_k_related_embeddings, top_k_related_indices)``: the
        selected embeddings as a list of lists, and a NumPy array with their
        positions in ``documents_embeddings``, most similar first. Both are
        empty when ``documents_embeddings`` is empty.
    """
    # Shape the query as a (1, n) matrix so it can be multiplied with the documents.
    query = np.asarray(query_embedding, dtype=float).reshape(1, -1)
    documents = np.asarray(documents_embeddings, dtype=float)

    # Nothing to rank: return empty results instead of failing.
    if documents.size == 0:
        return [], np.array([], dtype=int)

    def _unit_rows(matrix):
        """Scale every row to unit length, leaving all-zero rows untouched."""
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return matrix / norms

    # Cosine similarity of the query against every document.
    cosine_sim = (_unit_rows(query) @ _unit_rows(documents).T)[0]

    # Indices ordered from most to least similar.
    sorted_indices = np.argsort(cosine_sim)[::-1]

    top_k_related_indices = sorted_indices[:k]
    top_k_related_embeddings = [list(row) for row in documents[top_k_related_indices]]

    return top_k_related_embeddings, top_k_related_indices

#### **$k$-nearest neighbors search ($k$-NN):** It is often useful to retrieve not only the closest document but the $k$ closest documents. The `k_nearest_neighbors` function does this. Note that a plain nearest-neighbor search is the special case of `k_nearest_neighbors` with $k=1$.

In [None]:
# Rank the related documents against the title embedding and keep only the best match (k=1)
retrieved_embds, retrieved_embd_indices = k_nearest_neighbors(
    title_embedding,
    related_documents_embeddings,
    k=1,
)

# Map the returned positions back to the original document texts
retrieved_docs = list(map(related_documents.__getitem__, retrieved_embd_indices))

print(retrieved_docs)

['Participation of Investment Banks and Non-Bank Financial Institutions in Syndicated Loans']


#### **Expand** this code to allow multiple values

In [None]:
import sqlite3
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
from contextlib import closing

# Embedding model shared by the title (query) and candidate (document) requests
EMBEDDING_MODEL = "voyage-lite-02-instruct"


def _nearest_candidate(title_embedding, candidates):
    """Return the candidate text closest to the title embedding, or "" when there are no candidates."""
    if not candidates:
        return ""
    candidate_embeddings = vo.embed(candidates, model=EMBEDDING_MODEL, input_type="document").embeddings
    _, nearest_indices = k_nearest_neighbors(title_embedding, candidate_embeddings, k=1)
    return candidates[nearest_indices[0]]


# Select only the columns this step reads, by name
select_query = "SELECT work_id, title, referenced_works, related_work_titles, ngrams FROM works_table"
update_query = "UPDATE works_table SET nearest_related_work = ?, nearest_referenced_work = ?, nearest_ngrams_work = ? WHERE work_id = ?"

# `closing` releases the cursor and the connection even if an embedding call fails midway.
with closing(sqlite3.connect("works.db")) as works_db, closing(works_db.cursor()) as work_cursor:
    work_cursor.execute(select_query)

    # All rows are read up front, so the same cursor can be reused for the updates below.
    results = work_cursor.fetchall()

    for work_id, title, referenced_works_json, related_work_titles_json, ngrams_json in results:
        # Parse JSON strings back to Python objects
        referenced_works = json.loads(referenced_works_json)
        related_work_titles = json.loads(related_work_titles_json)
        ngrams = json.loads(ngrams_json)

        # Defaults written when the work has no title or no candidates
        nearest_referenced_work = ""
        nearest_related_work = ""
        nearest_ngrams_work = ""

        # A work without a title has no query to embed; it keeps the empty defaults.
        if title:
            title_embedding = vo.embed([title], model=EMBEDDING_MODEL, input_type="query").embeddings[0]

            nearest_referenced_work = _nearest_candidate(title_embedding, referenced_works)
            nearest_related_work = _nearest_candidate(title_embedding, related_work_titles)
            nearest_ngrams_work = _nearest_candidate(title_embedding, ngrams)

        # Commit per row so finished work survives a failure on a later row.
        work_cursor.execute(update_query, (nearest_related_work, nearest_referenced_work, nearest_ngrams_work, work_id))
        works_db.commit()

In [None]:
# Connect to the SQLite database
works_db = sqlite3.connect("works.db")
try:
    # Create a cursor
    work_cursor = works_db.cursor()

    # Look up one work by id (used as an example because it has all the values).
    # The id is passed as a bound parameter rather than embedded in the SQL text.
    query = "SELECT * FROM works_table WHERE work_id = ?"
    work_cursor.execute(query, ("W2015602200",))

    # Fetch all results
    results = work_cursor.fetchall()
    work_cursor.close()
finally:
    # Release the connection even when the query fails
    works_db.close()

# Display the fetched rows as the cell output
results

[('W2015602200',
  'A constant rounds group key agreement protocol without using hash functions',
  '["Scalable Protocols for Authenticated Group Key Exchange", "Authenticated key agreement without using one-way hash functions", "Remarks on unknown key-share attack on authenticated multiple-key agreement protocol"]',
  '["Authentication for distributed systems", "Research of AAA messages Based on 802.1x authentication", "A Sidechain-Based Decentralized Authentication Scheme via Optimized Two-Way Peg Protocol for Smart Community"]',
  '["1", "key", "\\uf8ef"]',
  'A Sidechain-Based Decentralized Authentication Scheme via Optimized Two-Way Peg Protocol for Smart Community',
  'Scalable Protocols for Authenticated Group Key Exchange',
  'key')]

#### **Make a training pair** using the column values from the table

#### Title Referenced Work JSON

In [None]:
import json
import sqlite3
from contextlib import closing

# Define the path for the JSONL file
jsonl_file_path = "title_referenced_work.jsonl"

# Only rows that have a nearest referenced work, selecting just the two
# columns that are written out. The LIMIT value caps how many pairs are
# exported (5 here); raise it for a larger export.
query = "SELECT title, nearest_referenced_work FROM works_table WHERE length(nearest_referenced_work) > 0 LIMIT 5"

# `closing` releases the cursor and the connection even if the query fails.
with closing(sqlite3.connect("works.db")) as works_db, closing(works_db.cursor()) as work_cursor:
    work_cursor.execute(query)
    results = work_cursor.fetchall()

# Write one {"texts": [anchor, positive]} object per line
with open(jsonl_file_path, "w", encoding="utf-8") as jsonl_file:
    for title, nearest_referenced_work in results:
        # The title is the anchor, the nearest referenced work is its positive
        positive_pairs = {"texts": [title, nearest_referenced_work]}
        jsonl_file.write(json.dumps(positive_pairs) + "\n")

#### Title Related Work JSON

In [None]:
import json
import sqlite3

# Export (title, nearest related work) positive pairs from works.db to JSONL.

# Connect to the SQLite database
works_db = sqlite3.connect("works.db")

# Create a cursor
work_cursor = works_db.cursor()

try:
    # Select only the rows that have a nearest related work.
    # NOTE: the query is capped at 5 rows; raise the LIMIT for a full export.
    query = "SELECT * FROM works_table WHERE length(nearest_related_work) > 0 LIMIT 5"
    work_cursor.execute(query)

    # Fetch all results
    results = work_cursor.fetchall()

    # Define the path for the JSONL file
    jsonl_file_path = "title_related_work.jsonl"

    # Open the JSONL file in write mode (any previous export is overwritten)
    with open(jsonl_file_path, "w", encoding="utf-8") as jsonl_file:
        # Iterate over the results
        for result in results:
            # Positional unpack: relies on works_table having exactly these
            # eight columns in this order (SELECT * is used above).
            work_id, title, referenced_works_json, related_work_titles_json, ngrams_json, \
            nearest_related_work, nearest_referenced_work, nearest_ngrams_work = result

            # Anchor is the work's title, positive is its nearest related work
            anchor_text = title
            positive_text = nearest_related_work

            # Create a dictionary for positive pairs
            positive_pairs = {"texts": [anchor_text, positive_text]}

            # Write the positive pairs dictionary as one JSON line
            jsonl_file.write(json.dumps(positive_pairs) + "\n")
finally:
    # Always release the cursor and connection, even if the query or the
    # file write above raised.
    work_cursor.close()
    works_db.close()

#### Title Ngrams Work JSON

In [None]:
import json
import sqlite3

# Export (title, nearest n-gram) positive pairs from works.db to JSONL.

# Connect to the SQLite database
works_db = sqlite3.connect("works.db")

# Create a cursor
work_cursor = works_db.cursor()

try:
    # Select only the rows that have a nearest n-gram value.
    # NOTE: the query is capped at 5 rows; raise the LIMIT for a full export.
    query = "SELECT * FROM works_table WHERE length(nearest_ngrams_work) > 0 LIMIT 5"
    work_cursor.execute(query)

    # Fetch all results
    results = work_cursor.fetchall()

    # Define the path for the JSONL file
    jsonl_file_path = "title_ngrams_work.jsonl"

    # Open the JSONL file in write mode (any previous export is overwritten)
    with open(jsonl_file_path, "w", encoding="utf-8") as jsonl_file:
        # Iterate over the results
        for result in results:
            # Positional unpack: relies on works_table having exactly these
            # eight columns in this order (SELECT * is used above).
            work_id, title, referenced_works_json, related_work_titles_json, ngrams_json, \
            nearest_related_work, nearest_referenced_work, nearest_ngrams_work = result

            # Anchor is the work's title, positive is its nearest n-gram
            anchor_text = title
            positive_text = nearest_ngrams_work

            # Create a dictionary for positive pairs
            positive_pairs = {"texts": [anchor_text, positive_text]}

            # Write the positive pairs dictionary as one JSON line
            jsonl_file.write(json.dumps(positive_pairs) + "\n")
finally:
    # Always release the cursor and connection, even if the query or the
    # file write above raised.
    work_cursor.close()
    works_db.close()

#### 4. Triplets from Topics

#### Keywords Topic

In [24]:
import requests
import json
import time

def fetch_and_store_data(url, output_file, target_count=1, delay=1):
    """Fetch topics from ``url`` and append (display_name, keyword) pairs to a JSONL file.

    Each result that has a non-empty ``keywords`` list contributes one line of
    the form ``{"texts": [display_name, first_keyword]}``.

    Args:
        url: API endpoint, including its query string. A ``page`` query
            parameter is added to each request.
        output_file: Path of the JSONL file; entries are appended.
        target_count: Stop once at least this many entries have been written.
        delay: Seconds to sleep between consecutive requests (rate limiting).

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    # Initialize counters
    total_keywords_count = 0
    total_entries_count = 0
    page_number = 1

    # Loop until the target count is reached or the API runs out of results
    while total_entries_count < target_count:
        # Request the next page: re-requesting the same seeded URL would only
        # return the same records again (duplicates, or an endless loop when
        # none of them has keywords).
        response = requests.get(url, params={"page": page_number}, timeout=30)
        response.raise_for_status()
        data = response.json()

        results = data.get("results", [])
        if not results:
            # Nothing (more) to read; stop instead of polling forever.
            break

        # Build {"texts": [display_name, first keyword]} for results with keywords
        formatted_data = [
            {"texts": [result.get("display_name", ""), result["keywords"][0]]}
            for result in results
            if result.get("keywords")
        ]
        total_keywords_count += len(formatted_data)

        # Append to the .jsonl file
        with open(output_file, "a") as jsonl_file:
            for entry in formatted_data:
                jsonl_file.write(json.dumps(entry) + "\n")
        total_entries_count += len(formatted_data)

        page_number += 1

        # Respect the rate limit, but only when another call will follow
        if total_entries_count < target_count:
            time.sleep(delay)

    print(f"Total entries with keywords: {total_keywords_count}")
    print(f"Total entries written to {output_file}: {total_entries_count}")

if __name__ == "__main__":
    # Where the (topic name, keyword) pairs are appended
    output_jsonl_file = "topic_name_key.jsonl"
    # OpenAlex topics query: seeded sample, display_name and keywords only
    api_url = "https://api.openalex.org/topics?mailto=brookshum24@gmail.com&sample=1&seed=3&select=display_name,keywords"

    # Run the download with the default target count and delay
    fetch_and_store_data(url=api_url, output_file=output_jsonl_file)

Total entries with keywords: 1
Total entries written to topic_name_key.jsonl: 1


#### 5. Triplet: display name, domain, field [Hard-negative]
#### This also applies to: display name, domain, sub-field [soft-negative]
#### This also applies to: display name, field, sub-field [Hard-negative]

In [31]:
import requests
import json
import time

def fetch_and_store_topics(url, output_file, target_count=1, delay=1):
    """Fetch topics from ``url`` and append (display_name, domain, field) triplets to a JSONL file.

    Each result contributes one line of the form
    ``{"texts": [display_name, domain_display_name, field_display_name]}``;
    missing or null values become empty strings.

    Args:
        url: API endpoint, including its query string. A ``page`` query
            parameter is added to each request.
        output_file: Path of the JSONL file; entries are appended.
        target_count: Stop once at least this many entries have been written.
        delay: Seconds to sleep between consecutive requests (rate limiting).

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    # Initialize counters
    total_entries_count = 0
    page_number = 1

    # Loop until the target count is reached or the API runs out of results
    while total_entries_count < target_count:
        # Request the next page: re-requesting the same seeded URL would only
        # return the same records again.
        response = requests.get(url, params={"page": page_number}, timeout=30)
        response.raise_for_status()
        data = response.json()

        results = data.get("results", [])
        if not results:
            # Nothing (more) to read; stop instead of polling forever.
            break

        formatted_data = []
        for result in results:
            display_name = result.get("display_name", "")
            # "or {}" guards against an explicit JSON null, which
            # .get(key, {}) would pass through as None.
            domain = (result.get("domain") or {}).get("display_name", "")
            field = (result.get("field") or {}).get("display_name", "")

            # Create the triplet {"texts": ["display_name", "domain", "field"]}
            formatted_data.append({"texts": [display_name, domain, field]})

        # Append to the .jsonl file
        with open(output_file, "a") as jsonl_file:
            for entry in formatted_data:
                jsonl_file.write(json.dumps(entry) + "\n")
        total_entries_count += len(formatted_data)

        page_number += 1

        # Respect the rate limit, but only when another call will follow
        if total_entries_count < target_count:
            time.sleep(delay)

    print(f"Total entries written to {output_file}: {total_entries_count}")

if __name__ == "__main__":
    # Where the (display name, domain, field) triplets are appended
    output_jsonl_file = "topics_triplets.jsonl"
    # OpenAlex topics query: seeded sample, display_name, domain and field only
    topics_url = "https://api.openalex.org/topics?mailto=brookshum24@gmail.com&sample=2&seed=3&select=display_name,domain,field"

    # Run the download with the default target count and delay
    fetch_and_store_topics(url=topics_url, output_file=output_jsonl_file)

Total entries written to topics_triplets.jsonl: 2
