In [1]:
%pip install langchain deeplake openai tiktoken

Collecting deeplake
  Using cached deeplake-3.6.12.tar.gz (527 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-win_amd64.whl (635 kB)
     -------------------------------------- 635.3/635.3 kB 2.9 MB/s eta 0:00:00
Collecting boto3
  Downloading boto3-1.28.9-py3-none-any.whl (135 kB)
     -------------------------------------- 135.7/135.7 kB 4.0 MB/s eta 0:00:00
Collecting pathos
  Downloading pathos-0.3.1-py3-none-any.whl (82 kB)
     ---------------------------------------- 82.1/82.1 kB 4.8 MB/s eta 0:00:00
Collecting humbug>=0.3.1
  Using cached humbug-0.3.2-py3-none-any.whl (15 kB)
Collecting numcodecs
  Downloading numcodecs-0.11.0-cp310-cp310-win_amd64.whl (604 kB)
     -------------------------------------- 604.5/604.5 kB 2.9 MB/s eta 0:00:00
Collecting s3transfer<0.7.0,>=0.6.0
  Using cached s3transfer-0.6.1-py3-none-any.whl (79 kB)
Collecting jmespath<2.0.0,>=0

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 2.7.1 requires dill<0.3.7, but you have dill 0.3.7 which is incompatible.

[notice] A new release of pip available: 22.3.1 -> 23.2
[notice] To update, run: pythonw.exe -m pip install --upgrade pip


In [1]:
!pip install langchain==0.0.229

Collecting langchain==0.0.229
  Using cached langchain-0.0.229-py3-none-any.whl (1.3 MB)
Installing collected packages: langchain
  Attempting uninstall: langchain
    Found existing installation: langchain 0.0.240
    Uninstalling langchain-0.0.240:
      Successfully uninstalled langchain-0.0.240
Successfully installed langchain-0.0.229


In [7]:
import os
from getpass import getpass
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

In [8]:

root_dir = "../datasets/docs/"


def load_text_documents(root, skip_dirs=(".venv",)):
    """Load and split every ``.txt`` file found under ``root``.

    Args:
        root: Directory to walk recursively.
        skip_dirs: Directory names that are never descended into. Matching is
            done on the directory name itself, so it works with both ``/`` and
            ``\\`` path separators (the previous ``"/.venv/" in dirpath`` test
            could never match a Windows path).

    Returns:
        A ``(documents, failures)`` tuple. ``documents`` is the flat list of
        chunks returned by ``TextLoader(...).load_and_split()`` for each file.
        ``failures`` is a list of ``(path, exception)`` pairs for files that
        could not be loaded, so they are reported instead of silently dropped.
    """
    loaded, failures = [], []
    for dirpath, dirnames, filenames in os.walk(root):
        # Prune in place: os.walk (top-down) only recurses into the names
        # that remain in `dirnames`, so skipped trees are never traversed.
        dirnames[:] = [name for name in dirnames if name not in skip_dirs]
        for file in filenames:
            if not file.endswith(".txt"):
                continue
            path = os.path.join(dirpath, file)
            try:
                loaded.extend(TextLoader(path, encoding="utf-8").load_and_split())
            except Exception as exc:
                # Best effort: keep going, but remember what was skipped and why.
                failures.append((path, exc))
    return loaded, failures


docs, failed_files = load_text_documents(root_dir)
for path, exc in failed_files:
    print(f"Skipped {path}: {exc!r}")
print(f"{len(docs)}")

754


In [9]:
# Second chunking pass with chunk_size=1000 and no overlap between chunks.
# `docs` already went through `load_and_split()` when it was loaded; in the
# recorded run the count is the same before and after this step.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(docs)
print(len(texts))

754


In [10]:
# Embedding client shared by the cells that build and reopen the Deep Lake vector store.
# NOTE(review): no API key is passed here; presumably it is read from the
# environment (OPENAI_API_KEY) — confirm it is set before a fresh-kernel run.
embeddings = OpenAIEmbeddings()


In [11]:
# Embed every chunk in `texts` and store it in the hosted Deep Lake dataset.
# `DeepLake` is already imported in the notebook's import cell, so it is not
# re-imported here; the path is a plain string (it has no placeholders).
# NOTE(review): this writes to a remote `hub://` dataset — re-running the cell
# may add the same chunks again; confirm before re-executing.
db = DeepLake.from_documents(
    texts,
    embedding=embeddings,
    dataset_path="hub://commanderastern/polka-docs",
)
db

Your Deep Lake dataset has been successfully created!


/

Dataset(path='hub://commanderastern/polka-docs', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype       shape      dtype  compression
  -------    -------     -------    -------  ------- 
 embedding  embedding  (754, 1536)  float32   None   
    id        text      (754, 1)      str     None   
 metadata     json      (754, 1)      str     None   
   text       text      (754, 1)      str     None   


 

<langchain.vectorstores.deeplake.DeepLake at 0x26f75cf9b40>

In [12]:
# Reopen the existing dataset in read-only mode for querying. This rebinds
# `db`, so later cells use this read-only handle. The path is a plain string
# because it contains no placeholders.
db = DeepLake(
    dataset_path="hub://commanderastern/polka-docs",
    read_only=True,
    embedding_function=embeddings,
)

Deep Lake Dataset in hub://commanderastern/polka-docs already exists, loading from the storage


In [7]:
retriever = db.as_retriever()
# Apply all search settings in one call; keys and values are unchanged.
# NOTE(review): `fetch_k` equals `k`, so the MMR re-ranking has no larger
# candidate pool to pick from — confirm this is intended.
retriever.search_kwargs.update(
    {
        "distance_metric": "cos",
        "fetch_k": 20,
        "maximal_marginal_relevance": True,
        "k": 20,
    }
)

In [8]:
def filter(x):
    """Return True when sample ``x`` should be kept, False otherwise.

    ``x`` is indexed by ``"text"`` and ``"metadata"``; each entry exposes
    ``.data()["value"]``. A sample is rejected when its text contains
    ``"something"``. Otherwise it is kept only if the metadata ``"source"``
    contains ``"only_this"`` or ``"also_that"``.

    NOTE(review): the three marker strings look like template placeholders —
    replace them with real criteria before wiring this into the retriever.
    """
    # Content-based rejection comes first; metadata is only read if it passes.
    text = x["text"].data()["value"]
    if "something" in text:
        return False

    # Path-based selection on the document's source (e.g. by extension).
    source = x["metadata"].data()["value"]["source"]
    return any(marker in source for marker in ("only_this", "also_that"))

In [9]:
# `ChatOpenAI` and `ConversationalRetrievalChain` are already imported in the
# notebook's import cell, so the duplicate imports are dropped here.
model = ChatOpenAI(model_name="gpt-3.5-turbo-16k")  # 'ada' 'gpt-3.5-turbo' 'gpt-4',
# Conversational chain that answers questions using chunks from `retriever`.
qa = ConversationalRetrievalChain.from_llm(model, retriever=retriever)

In [10]:
# Questions are asked in order; each (question, answer) pair is appended to
# `chat_history` so later questions are sent together with the earlier turns.
questions = ["A code that store value 1 and 2 and return the sum of the two values"]
chat_history = []

for question in questions:
    result = qa({"question": question, "chat_history": chat_history})
    answer = result["answer"]
    chat_history.append((question, answer))
    print(f"-> **Question**: {question} \n")
    print(f"**Answer**: {answer} \n")

-> **Question**: A code that store value 1 and 2 and return the sum of the two values 

**Answer**: Here's an example code that stores the values 1 and 2 and returns their sum:

```rust
use ink_lang as ink;

#[ink::contract]
mod value_store {
    #[ink(storage)]
    pub struct ValueStore {
        value1: i32,
        value2: i32,
    }

    impl ValueStore {
        #[ink(constructor)]
        pub fn new(value1: i32, value2: i32) -> Self {
            Self { value1, value2 }
        }

        #[ink(message)]
        pub fn get_sum(&self) -> i32 {
            self.value1 + self.value2
        }
    }
}
```

In this code, the `ValueStore` contract has two storage variables `value1` and `value2` of type `i32`. The constructor `new` is used to initialize these values. The `get_sum` message returns the sum of `value1` and `value2`. 

