In [16]:
%pip install --upgrade langchain deeplake openai tiktoken

Collecting tiktoken
  Downloading tiktoken-0.4.0-cp38-cp38-win_amd64.whl (635 kB)
     -------------------------------------- 635.3/635.3 kB 1.3 MB/s eta 0:00:00
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2023.6.3-cp38-cp38-win_amd64.whl (268 kB)
     -------------------------------------- 268.1/268.1 kB 1.5 MB/s eta 0:00:00
Installing collected packages: regex, tiktoken
Successfully installed regex-2023.6.3 tiktoken-0.4.0
Note: you may need to restart the kernel to use updated packages.


In [24]:
import os
from getpass import getpass
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

In [25]:

root_dir = "../datasets/contracts"

docs = []
for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        if file.endswith(".txt") and "/.venv/" not in dirpath:
            try:
                loader = TextLoader(os.path.join(dirpath, file), encoding="utf-8")
                docs.extend(loader.load_and_split())
            except Exception as e:
                pass
print(f"{len(docs)}")

1853


In [26]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(docs)
print(f"{len(texts)}")

Created a chunk of size 1453, which is longer than the specified 1000
Created a chunk of size 1212, which is longer than the specified 1000
Created a chunk of size 1947, which is longer than the specified 1000
Created a chunk of size 1550, which is longer than the specified 1000
Created a chunk of size 3175, which is longer than the specified 1000
Created a chunk of size 1029, which is longer than the specified 1000
Created a chunk of size 1062, which is longer than the specified 1000
Created a chunk of size 1025, which is longer than the specified 1000
Created a chunk of size 1078, which is longer than the specified 1000
Created a chunk of size 1045, which is longer than the specified 1000
Created a chunk of size 3175, which is longer than the specified 1000
Created a chunk of size 1029, which is longer than the specified 1000
Created a chunk of size 1062, which is longer than the specified 1000
Created a chunk of size 1025, which is longer than the specified 1000
Created a chunk of s

6628


In [27]:
embeddings = OpenAIEmbeddings()
embeddings

OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base='', openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization='', allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=6, request_timeout=None, headers=None, tiktoken_model_name=None, show_progress_bar=False)

In [28]:
from langchain.vectorstores import DeepLake

db = DeepLake.from_documents(
    texts, embeddings, dataset_path=f"hub://commanderastern/polka-code-2"
)
db

Your Deep Lake dataset has been successfully created!


 

Dataset(path='hub://commanderastern/polka-code-2', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype       shape       dtype  compression
  -------    -------     -------     -------  ------- 
 embedding  embedding  (6628, 1536)  float32   None   
    id        text      (6628, 1)      str     None   
 metadata     json      (6628, 1)      str     None   
   text       text      (6628, 1)      str     None   


<langchain.vectorstores.deeplake.DeepLake at 0x22d87b004f0>

In [29]:
db = DeepLake(
    dataset_path=f"hub://commanderastern/polka-code-2",
    read_only=True,
    embedding_function=embeddings,
)

Deep Lake Dataset in hub://commanderastern/polka-code-2 already exists, loading from the storage


In [32]:
retriever = db.as_retriever()
retriever.search_kwargs["distance_metric"] = "cos"
retriever.search_kwargs["fetch_k"] = 20
retriever.search_kwargs["maximal_marginal_relevance"] = True
retriever.search_kwargs["k"] = 20

In [30]:
def filter(x):
    # filter based on source code
    if "something" in x["text"].data()["value"]:
        return False

    # filter based on path e.g. extension
    metadata = x["metadata"].data()["value"]
    return "only_this" in metadata["source"] or "also_that" in metadata["source"]

In [35]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

model = ChatOpenAI(model_name="gpt-3.5-turbo-16k")  # 'ada' 'gpt-3.5-turbo' 'gpt-4',
qa = ConversationalRetrievalChain.from_llm(model, retriever=retriever)

In [36]:
questions = [
    "Give me a basic ink contract code",
]
chat_history = []

for question in questions:
    result = qa({"question": question, "chat_history": chat_history})
    chat_history.append((question, result["answer"]))
    print(f"-> **Question**: {question} \n")
    print(f"**Answer**: {result['answer']} \n")

-> **Question**: Give me a basic ink contract code 

**Answer**: Certainly! Here's a basic example of an ink contract code:

```rust
#![cfg_attr(not(feature = "std"), no_std)]

use ink_lang as ink;

#[ink::contract]
mod basic_contract {
    use ink_storage::collections::HashMap;
    use ink_storage::traits::PackedLayout;

    #[ink(storage)]
    struct BasicContract {
        values: HashMap<AccountId, u32>,
    }

    impl BasicContract {
        #[ink(constructor)]
        fn new() -> Self {
            Self {
                values: HashMap::new(),
            }
        }

        #[ink(message)]
        fn set_value(&mut self, value: u32) {
            let caller = self.env().caller();
            self.values.insert(caller, value);
        }

        #[ink(message)]
        fn get_value(&self) -> Option<u32> {
            let caller = self.env().caller();
            self.values.get(&caller).copied()
        }
    }
}
```

In this example, we have a basic contract called `BasicCont