### Purpose: Test parts of the pipeline (load extracted paper text, generate embeddings, embed a search query)

In [1]:
import json
import sys
import os

# Resolve the directory one level above the kernel's current working directory
# and make it importable; the `data_pipeline` import that follows is presumably
# resolved from there (assumes the notebook runs from a direct sub-folder).
root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if root_path not in sys.path:
    sys.path.append(root_path)

from data_pipeline.generate_embeddings import E5Embedder

In [None]:
# Load text (extracted from Deepseek's recent paper)
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# on a machine where this exact file exists. Consider deriving it from a
# configurable project/data directory.
deepseek_text_fp = "/Users/blake/Documents/Projects/summarize_research_papers/papers/arxiv/images/extracted_data.json"

# Decode explicitly as UTF-8: the extracted text contains non-ASCII characters
# (e.g. the "ﬁ" ligature), and open() otherwise falls back to the platform's
# locale encoding, which is not UTF-8 everywhere.
with open(deepseek_text_fp, "r", encoding="utf-8") as f:
    data = json.load(f)

# Display the parsed JSON object as the cell output.
data

{'pdf_path': '/Users/blake/Documents/Projects/summarize_research_papers/papers/arxiv/2501.12948v1.pdf',
 'text_data': {'0': 'DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via\nReinforcement Learning\nDeepSeek-AI\nresearch@deepseek.com\nAbstract\nWe introduce our ﬁrst-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1.\nDeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without super-\nvised ﬁne-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities.\nThrough RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing\nreasoning behaviors. However, it encounters challenges such as poor readability, and language\nmixing. To address these issues and further enhance reasoning performance, we introduce\nDeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeek-\nR1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the\nresearch commun

In [25]:
# Embed every text entry stored under data["text_data"] with E5Embedder.
# The values are collected into a plain list first, then embedded in one call.
page_texts = [text for text in data["text_data"].values()]

e5 = E5Embedder()
embeddings = e5.generate_embeddings(page_texts)

# Print the result so the cell shows the embedding matrix.
print(embeddings)
    

[[ 0.28550515  0.01040937 -0.21170326 ...  0.18029709  0.07658855
   0.13866262]
 [ 0.2935927  -0.05024432 -0.21362959 ...  0.2454749   0.06477783
   0.15788874]
 [ 0.2417541   0.03685652 -0.17801604 ...  0.27879405  0.02790905
   0.15195541]
 ...
 [ 0.15512833 -0.07143532 -0.30900222 ...  0.22385035  0.08397662
   0.2106431 ]
 [ 0.11701664 -0.05376775 -0.24720721 ...  0.2236444   0.17184582
   0.20218661]
 [ 0.1606614  -0.02519011 -0.24392593 ...  0.166118    0.18222454
   0.1462169 ]]


In [None]:
# Embed a free-text search query with the same embedder used for the paper text.
query = "Reinforcement Learning Chain of Thought Deepseek"

# generate_embeddings is called with a list, so wrap the single query in one.
query_batch = [query]
query_embedding = e5.generate_embeddings(query_batch)
print(query_embedding)

# TODO: Search in FAISS using query_embedding (not implemented in this cell yet)

[[ 0.2928947   0.0078266  -0.17392892 -0.33199435  0.28465515 -0.15076882
   0.11382081  0.0579555   0.23287156 -0.05521818  0.14373419 -0.03033742
   0.28324676 -0.21402188 -0.394824   -0.11848263  0.17341025 -0.20846198
  -0.25597605 -0.23322757  0.04407349 -0.07481042 -0.10101436  0.206763
   0.18820584  0.10913341 -0.08912351 -0.01380628  0.2906241  -0.24987999
  -0.21810275 -0.2302648   0.2016752  -0.20395006  0.17988122  1.0022283
  -0.18501106 -0.26269692  0.16973107 -0.21673301 -0.03534617  0.09275564
   0.14980984  0.27024847  0.17692377  0.12005933 -0.07979742  0.19265644
  -0.1915885  -0.05561496 -0.19166857  0.21874554  0.06757709  0.23238523
   0.13711219 -0.29882234 -0.1325091  -0.23949689 -0.21135034  0.26059073
   0.28246844 -0.08815054  0.16450952 -0.07922731  0.36415103  0.22903472
   0.14558928  0.08793404 -0.09135187 -0.09756848 -0.09408723  0.12249765
  -0.0520447  -0.16006279 -0.00378343  0.12734152  0.13695796 -0.08629481
   0.09294841 -0.13068916 -0.26015753 -0.