## Map Markdown file paths to their documentation URIs

In [1]:
# Each Markdown file path paired with its documentation URI. Keeping the two
# values side by side makes the positional correspondence explicit.
_DOC_PAGES = [
    ("docs/README.md", "https://docs.lp.finance/"),
    ("docs/lp-finance-protocol/quick-user-guide.md", "https://docs.lp.finance/lp-finance-protocol/quick-user-guide"),
    ("docs/lp-finance-protocol/view-protocol-data.md", "https://docs.lp.finance/lp-finance-protocol/view-protocol-data"),
    ("docs/lp-finance-protocol/user-faq.md", "https://docs.lp.finance/lp-finance-protocol/user-faq"),
    ("docs/protocol/minting-borrowing-zsol.md", "https://docs.lp.finance/protocol/minting-borrowing-zsol"),
    ("docs/protocol/zsol-sol-liquidity-providers.md", "https://docs.lp.finance/protocol/zsol-sol-liquidity-providers"),
    ("docs/protocol/peg-stability-module.md", "https://docs.lp.finance/protocol/peg-stability-module"),
    ("docs/protocol/liquidation.md", "https://docs.lp.finance/protocol/liquidation"),
    ("docs/protocol/oracles.md", "https://docs.lp.finance/protocol/oracles"),
    ("docs/lpfi-staking/lpfi-staking.md", "https://docs.lp.finance/lpfi-staking/lpfi-staking"),
    ("docs/governance/governance.md", "https://docs.lp.finance/governance/governance"),
    ("docs/addresses/programs.md", "https://docs.lp.finance/addresses/programs"),
    ("docs/addresses/tokens.md", "https://docs.lp.finance/addresses/tokens"),
    ("docs/links/links.md", "https://docs.lp.finance/links/links"),
]

# Same shape as before: two parallel lists, where entry k of "dir" belongs to
# entry k of "uri".
data = {
    "dir": [md_path for md_path, _ in _DOC_PAGES],
    "uri": [page_uri for _, page_uri in _DOC_PAGES],
}

## Get Embeddings

In [13]:
import openai
import pandas as pd
import os
from dotenv import load_dotenv
# Let python-dotenv populate the process environment, then hand the key to
# the openai module. The key ends up as None when OPENAI_KEY is not set.
load_dotenv()
openai.api_key = os.getenv("OPENAI_KEY")

In [14]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: Value forwarded to the API as ``input``.
        model: Name of the embedding model. Defaults to
            ``"text-embedding-ada-002"``, the value that used to be
            hard-coded, so existing one-argument calls behave as before.

    Returns:
        The ``"embedding"`` field of the first entry in the response's
        ``"data"`` list.
    """
    response = openai.Embedding.create(input=text, model=model)
    # Only the first result entry is read.
    return response["data"][0]["embedding"]

In [15]:
# Read every Markdown file listed in data["dir"], preserving order so that
# text_array[k] lines up with data["dir"][k] and data["uri"][k].
text_array = []
for md_path in data["dir"]:
    # Explicit encoding so decoding does not depend on the platform's locale
    # default (assumes the docs are UTF-8 -- confirm if any file fails to load).
    # The with-block closes the file; no manual close() is needed.
    with open(md_path, "r", encoding="utf-8") as f:
        text_array.append(f.read())

# One row per document: source path, documentation URL, raw Markdown text.
df = pd.DataFrame.from_dict({"name": data["dir"], "url": data["uri"], "text": text_array})

In [16]:
# Run get_embedding once per row of the "text" column and store the results
# in a new "embedding" column of the same frame.
df["embedding"] = df["text"].apply(get_embedding)
df

Unnamed: 0,name,url,text,embedding
0,docs/README.md,https://docs.lp.finance/,"---\ndescription: >-\n In this document, the ...","[0.0030690261628478765, 0.014892998151481152, ..."
1,docs/lp-finance-protocol/quick-user-guide.md,https://docs.lp.finance/lp-finance-protocol/qu...,# Quick User Guide\n\n## Deposit Collateral\n\...,"[0.009227563627064228, 0.02060684561729431, 0...."
2,docs/lp-finance-protocol/view-protocol-data.md,https://docs.lp.finance/lp-finance-protocol/vi...,"# View Protocol Data\n\n<figure><img src=""../....","[0.006939202547073364, 0.016189193353056908, 0..."
3,docs/lp-finance-protocol/user-faq.md,https://docs.lp.finance/lp-finance-protocol/us...,"# User FAQ\n\n### Why is ""Your Account"" page n...","[-0.009891600348055363, 0.009116998873651028, ..."
4,docs/protocol/minting-borrowing-zsol.md,https://docs.lp.finance/protocol/minting-borro...,# Minting/Borrowing zSOL\n\nzSOL is a over col...,"[0.002876314101740718, 0.02002980187535286, 0...."
5,docs/protocol/zsol-sol-liquidity-providers.md,https://docs.lp.finance/protocol/zsol-sol-liqu...,# zSOL-SOL Liquidity Providers\n\nzSOL-SOL liq...,"[0.01576395332813263, 0.019564801827073097, 0...."
6,docs/protocol/peg-stability-module.md,https://docs.lp.finance/protocol/peg-stability...,# Peg-stability Module\n\nPeg-stability module...,"[0.009073340333998203, 0.014598997309803963, -..."
7,docs/protocol/liquidation.md,https://docs.lp.finance/protocol/liquidation,"# Liquidation\n\nOn LP Finance, there are two ...","[0.006852426566183567, 0.0092744966968894, 0.0..."
8,docs/protocol/oracles.md,https://docs.lp.finance/protocol/oracles,# Oracles\n\n* **SOL**: [https://switchboard.x...,"[0.015092517249286175, -0.009464324451982975, ..."
9,docs/lpfi-staking/lpfi-staking.md,https://docs.lp.finance/lpfi-staking/lpfi-staking,# LPFi Staking\n\nRevenue genereated by protoc...,"[0.009969319216907024, 0.0034121794160455465, ..."


## Check Similarity

In [17]:
import torch

def get_similarity(question, url_array, embedding_array):
    """Return the URL whose embedding is most cosine-similar to ``question``.

    The question is embedded with ``text-embedding-ada-002`` and compared
    against every vector in ``embedding_array``. The list of similarity
    scores is printed before returning.

    Args:
        question: Text to embed and compare.
        url_array: URLs, positionally aligned with ``embedding_array``.
        embedding_array: Sequence of embedding vectors (each convertible by
            ``torch.FloatTensor``).

    Returns:
        The element of ``url_array`` at the position of the highest score
        (the first such position if several scores tie).

    Raises:
        ValueError: If ``embedding_array`` is empty.
    """
    # Fail before spending an API call when there is nothing to compare.
    if len(embedding_array) == 0:
        raise ValueError("embedding_array is empty; nothing to compare against")

    # The question embedding is loop-invariant: request it once instead of
    # once per document.
    question_embedding = openai.Embedding.create(
        input=question, model="text-embedding-ada-002"
    )["data"][0]["embedding"]
    question_tensor = torch.FloatTensor(question_embedding)
    cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

    similarity_array = [
        cos(question_tensor, torch.FloatTensor(embedding))
        for embedding in embedding_array
    ]
    print(similarity_array)

    # max() keeps the first of several equal maxima.
    best_idx = max(
        range(len(similarity_array)),
        key=lambda k: similarity_array[k].item(),
    )
    return url_array[best_idx]

In [18]:
# Look up the documentation page that best matches a sample question.
get_similarity(
    "How does PSM work?",
    url_array=df["url"].tolist(),
    embedding_array=df["embedding"].tolist(),
)

[tensor(0.6961), tensor(0.6848), tensor(0.6789), tensor(0.6718), tensor(0.6759), tensor(0.7024), tensor(0.8024), tensor(0.6765), tensor(0.6769), tensor(0.6867), tensor(0.6725), tensor(0.6815), tensor(0.6704), tensor(0.6571)]


'https://docs.lp.finance/protocol/peg-stability-module'