In [16]:
%pip install --upgrade langchain deeplake openai tiktoken

Collecting tiktoken
  Downloading tiktoken-0.4.0-cp38-cp38-win_amd64.whl (635 kB)
     -------------------------------------- 635.3/635.3 kB 1.3 MB/s eta 0:00:00
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2023.6.3-cp38-cp38-win_amd64.whl (268 kB)
     -------------------------------------- 268.1/268.1 kB 1.5 MB/s eta 0:00:00
Installing collected packages: regex, tiktoken
Successfully installed regex-2023.6.3 tiktoken-0.4.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
from getpass import getpass
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain



In [2]:

root_dir = "../datasets/contracts2"

docs = []
for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        if file.endswith(".txt") and "/.venv/" not in dirpath:
            try:
                loader = TextLoader(os.path.join(dirpath, file), encoding="utf-8")
                docs.extend(loader.load_and_split())
            except Exception as e:
                pass
print(f"{len(docs)}")

20


In [3]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(docs)
print(f"{len(texts)}")

Created a chunk of size 1212, which is longer than the specified 1000
Created a chunk of size 1551, which is longer than the specified 1000
Created a chunk of size 1001, which is longer than the specified 1000
Created a chunk of size 1260, which is longer than the specified 1000
Created a chunk of size 1233, which is longer than the specified 1000
Created a chunk of size 1285, which is longer than the specified 1000


82


In [4]:
embeddings = OpenAIEmbeddings()
embeddings

OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base='', openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization='', allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=6, request_timeout=None, headers=None, tiktoken_model_name=None, show_progress_bar=False)

In [5]:
from langchain.vectorstores import DeepLake

db = DeepLake.from_documents(
    texts, embeddings, dataset_path=f"hub://commanderastern/polka-code-3"
)
db

Your Deep Lake dataset has been successfully created!


 

Dataset(path='hub://commanderastern/polka-code-3', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
 embedding  embedding  (82, 1536)  float32   None   
    id        text      (82, 1)      str     None   
 metadata     json      (82, 1)      str     None   
   text       text      (82, 1)      str     None   


<langchain.vectorstores.deeplake.DeepLake at 0x28abd012e20>

In [6]:
db = DeepLake(
    dataset_path=f"hub://commanderastern/polka-code-3",
    read_only=True,
    embedding_function=embeddings,
)

Deep Lake Dataset in hub://commanderastern/polka-code-3 already exists, loading from the storage


In [7]:
retriever = db.as_retriever()
retriever.search_kwargs["distance_metric"] = "cos"
retriever.search_kwargs["fetch_k"] = 20
retriever.search_kwargs["maximal_marginal_relevance"] = True
retriever.search_kwargs["k"] = 20

In [8]:
def filter(x):
    # filter based on source code
    if "something" in x["text"].data()["value"]:
        return False

    # filter based on path e.g. extension
    metadata = x["metadata"].data()["value"]
    return "only_this" in metadata["source"] or "also_that" in metadata["source"]

In [9]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

model = ChatOpenAI(model_name="gpt-3.5-turbo-16k")  # 'ada' 'gpt-3.5-turbo' 'gpt-4',
qa = ConversationalRetrievalChain.from_llm(model, retriever=retriever)

In [10]:
questions = [
    "A code that store value 1 and 2 and return the sum of the two values",
]
chat_history = []

for question in questions:
    result = qa({"question": question, "chat_history": chat_history})
    chat_history.append((question, result["answer"]))
    print(f"-> **Question**: {question} \n")
    print(f"**Answer**: {result['answer']} \n")

-> **Question**: A code that store value 1 and 2 and return the sum of the two values 

**Answer**: Here's an example code that stores the values 1 and 2 and returns their sum:

```rust
use ink_lang as ink;

#[ink::contract]
mod value_store {
    #[ink(storage)]
    pub struct ValueStore {
        value1: i32,
        value2: i32,
    }

    impl ValueStore {
        #[ink(constructor)]
        pub fn new(value1: i32, value2: i32) -> Self {
            Self { value1, value2 }
        }

        #[ink(message)]
        pub fn get_sum(&self) -> i32 {
            self.value1 + self.value2
        }
    }
}
```

In this code, the `ValueStore` contract has two storage variables `value1` and `value2` of type `i32`. The constructor `new` is used to initialize these values. The `get_sum` message returns the sum of `value1` and `value2`. 

