In [1]:
# from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
# from langchain_core.pydantic_v1 import BaseModel, Field

from bs4 import BeautifulSoup
import requests
import os
import tempfile
import streamlit as st
import pandas as pd
import dotenv

In [2]:
from bs4 import BeautifulSoup
import requests
from langchain_ollama import OllamaEmbeddings, ChatOllama

# EUR-Lex HTML page for CELEX document 32001L0029.
url = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32001L0029'
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)

# Fail loudly and early: everything below needs `full_text`, so silently
# skipping the parse would only surface later as a confusing NameError.
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch {url}: HTTP {response.status_code}")
print('Internet connected')
soup = BeautifulSoup(response.text, 'html.parser')

main_content = soup.find('div', {'id': 'TexteOnly'})
if main_content is None:
    raise RuntimeError("Main content not found.")

# Collect the text of every <p> except the first one, which the original
# loop also skipped (presumably a title/header line -- TODO confirm).
paragraphs = main_content.find_all('p')
all_text = [p.get_text(strip=True) for p in paragraphs[1:]]
full_text = "\n".join(all_text)

# Split on paragraph/line/word boundaries into ~1500-character chunks that
# overlap by 200 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
    length_function=len,
    separators=["\n\n", "\n", " "]
)

chunks = text_splitter.split_text(full_text)

# Local Ollama models: one for embeddings, one (temperature 0) for answering.
embeddings_ollama = OllamaEmbeddings(model="mxbai-embed-large")
model_ollama = ChatOllama(model="llama3.2", temperature=0)

Internet connected


In [3]:
import shutil
import os

def clear_vectorstore(vectorstore_path):
    """Delete the vectorstore directory at ``vectorstore_path`` if it exists.

    Prints a one-line status message either way and returns ``None``.

    Args:
        vectorstore_path: Path of the directory to remove.
    """
    if not os.path.exists(vectorstore_path):
        print(f"No vectorstore found at '{vectorstore_path}'.")
        return

    # Recursively delete the directory together with everything inside it.
    shutil.rmtree(vectorstore_path)
    print(f"Vectorstore at '{vectorstore_path}' has been cleared.")

# Example usage: delete the "data/vectorstore" directory so the run starts
# from a clean slate. The Chroma store used further down is created at
# "data/vectorstore/chromadb", i.e. inside this directory.
clear_vectorstore("data/vectorstore")

No vectorstore found at 'data/vectorstore'.


In [4]:
def create_vectorstore(chunks, embedding_function, vectorstore_path):
    """Build a persisted Chroma vectorstore from de-duplicated text chunks.

    Args:
        chunks: Iterable of text chunks; exact duplicates are dropped.
        embedding_function: Embedding object handed to ``Chroma.from_texts``.
        vectorstore_path: Directory passed as ``persist_directory``.

    Returns:
        The object returned by ``Chroma.from_texts``.
    """
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # texts are inserted in a deterministic order (a set would scramble them).
    unique_chunks = list(dict.fromkeys(chunks))

    vectorstore = Chroma.from_texts(
        texts=unique_chunks,
        embedding=embedding_function,
        persist_directory=vectorstore_path
    )
    return vectorstore

In [5]:
def insert_if_not_exists(chunks, embedding_function, vectorstore_path):
    """Add to the Chroma store only those chunks it does not already hold.

    A chunk counts as already stored when its nearest neighbour in the store
    (by embedding similarity) has exactly the same text.

    Args:
        chunks: Iterable of text chunks to insert.
        embedding_function: Embedding object; its ``embed_query`` is used for
            the lookup and it is passed to ``Chroma`` for storage.
        vectorstore_path: Directory used as Chroma's ``persist_directory``.

    Returns:
        The Chroma vectorstore, so callers can keep using it.
    """
    try:
        vectorstore = Chroma(persist_directory=vectorstore_path, embedding_function=embedding_function)
    except Exception as exc:
        # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit are
        # no longer swallowed; the original fallback is kept.
        print(f"Could not open vectorstore at '{vectorstore_path}': {exc}")
        vectorstore = Chroma.from_texts(texts=[], embedding=embedding_function, persist_directory=vectorstore_path)

    new_chunks = []
    # dict.fromkeys de-duplicates the batch itself (order preserved): the
    # store is only updated after this loop, so a chunk repeated within
    # `chunks` would otherwise pass the lookup every time and be inserted twice.
    for chunk in dict.fromkeys(chunks):
        embedding = embedding_function.embed_query(chunk)
        results = vectorstore.similarity_search_by_vector(embedding, k=1)
        if not results or results[0].page_content != chunk:
            new_chunks.append(chunk)

    if new_chunks:
        # The store embeds the texts with its own embedding function, so no
        # precomputed vectors are passed here.
        vectorstore.add_texts(texts=new_chunks)
        print(f"Inserted {len(new_chunks)} new chunks.")
    else:
        print("No new data to insert.")

    return vectorstore

# Example usage: add the text chunks to the persistent Chroma store using the
# Ollama embedding model; chunks whose exact text is already stored are skipped.
# The path is the same one assigned to `vectorstore_path` for the retrieval cells.
vectorstore = insert_if_not_exists(
    chunks=chunks, 
    embedding_function=embeddings_ollama, 
    vectorstore_path="data/vectorstore/chromadb"
)


Inserted 49 new chunks.


In [6]:
# Prompt sent to the chat model. `{context}` and `{question}` are the two
# placeholders filled in when the template is formatted.
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""
# Directory of the persisted Chroma store that retrieve_data_from_vectorstore()
# opens; it is read as a module-level global.
vectorstore_path='data/vectorstore/chromadb'

In [7]:
def retrieve_data_from_vectorstore(question):
    """Answer ``question`` with a retrieval-augmented prompt over the Chroma store.

    Opens the persisted store at the module-level ``vectorstore_path`` with
    ``embeddings_ollama``, retrieves the chunks most similar to ``question``,
    prints them, fills ``PROMPT_TEMPLATE`` with them and sends the resulting
    prompt to ``model_ollama``.

    Args:
        question: The question to answer.

    Returns:
        The model's response on success, or the string
        "An error occurred while retrieving data." if any step raises.
    """
    def format_docs(docs):
        # Chunks are separated by a blank line inside the prompt context.
        return "\n\n".join(doc.page_content for doc in docs)

    try:
        vectorstore = Chroma(persist_directory=vectorstore_path, embedding_function=embeddings_ollama)
        retriever = vectorstore.as_retriever(search_type="similarity")

        # Retrieve once, for the caller's question (not a hard-coded query),
        # so the chunks printed here are exactly the ones the model sees.
        relevant_chunks = retriever.invoke(question)
        print(relevant_chunks)

        prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
        rag_chain = prompt_template | model_ollama
        response = rag_chain.invoke(
            {"context": format_docs(relevant_chunks), "question": question}
        )
        return response
    except Exception as e:
        print(f"Error: {str(e)}")
        return "An error occurred while retrieving data."


In [8]:
# Ask the RAG pipeline a sample question; the returned value is shown as the
# cell output.
retrieve_data_from_vectorstore("who will provide adequate legal protection against the manufature of devices which have purpose of bypassing a technological protection measure.")

[Document(metadata={}, page_content='(48) Such legal protection should be provided in respect of technological measures that effectively restrict acts not authorised by the rightholders of any copyright, rights related to copyright or the sui generis right in databases without, however, preventing the normal operation of electronic equipment and its technological development. Such legal protection implies no obligation to design devices, products, components or services to correspond to technological measures, so long as such device, product, component or service does not otherwise fall under the prohibition of Article 6. Such legal protection should respect proportionality and should not prohibit those devices or activities which have a commercially significant purpose or use other than to circumvent the technical protection. In particular, this protection should not hinder research into cryptography.\n(49) The legal protection of technological measures is without prejudice to the app

AIMessage(content='According to Article 6(2)(c) of the Directive, Member States shall provide adequate legal protection against the manufacture, import, distribution, sale, rental, advertisement for sale or rental, or possession for commercial purposes of devices, products or components that are primarily designed, produced, adapted or performed for the purpose of enabling or facilitating the circumvention of any effective technological measures.', additional_kwargs={}, response_metadata={'model': 'llama3.2', 'created_at': '2024-11-09T16:14:24.557286Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 5357012750, 'load_duration': 21276542, 'prompt_eval_count': 840, 'prompt_eval_duration': 2916000000, 'eval_count': 77, 'eval_duration': 2418000000}, id='run-0b842837-36db-4ae1-a1d1-93cab8626441-0', usage_metadata={'input_tokens': 840, 'output_tokens': 77, 'total_tokens': 917})

# Scratch check: build a tiny two-text Chroma store and confirm that a
# similarity search with a stored text's own embedding returns a result.
embedding_function = embeddings_ollama
# Use a dedicated name for the test directory: rebinding `vectorstore_path`
# here would silently point retrieve_data_from_vectorstore(), which reads that
# global, at this test store on any later call.
test_vectorstore_path = 'data/vectorstore_test'
vectorstore = Chroma.from_texts(
    texts=['con meo ngu ngoc', 'con ca biet di'],
    embedding=embedding_function,
    persist_directory=test_vectorstore_path,
)
embedding = embedding_function.embed_query('con meo ngu ngoc')
results = vectorstore.similarity_search_by_vector(embedding, k=1)
results

In [None]:
# Open the Chroma store persisted under "data/vectorstore_test" with the
# Ollama embedding function, rebinding the module-level `vectorstore`.
vectorstore = Chroma(persist_directory="data/vectorstore_test", embedding_function=embeddings_ollama)

In [None]:
# Create a similarity-search retriever over `vectorstore` and fetch the chunks
# closest to a sample question; the last line displays them as the cell output.
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke("who will provide adequate legal protection against the manufature of devices, products or components or the provision of services which:")
relevant_chunks

In [None]:
# Build the context block: the retrieved chunks separated by a "---" rule.
context_text = "\n\n---\n\n".join(doc.page_content for doc in relevant_chunks)

# Fill the prompt template with that context and the sample question, then
# print the final prompt text for inspection.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(
    context=context_text,
    question="who will provide adequate legal protection against the manufature of devices which have purpose of bypassing a technological protection measure.",
)
print(prompt)

In [None]:
# Send the manually formatted prompt straight to the chat model; the reply is
# shown as the cell output.
model_ollama.invoke(prompt)

In [None]:
def format_docs(docs):
    """Return the ``page_content`` of every document, joined by blank lines."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Same pipeline as a single runnable chain: the input question is passed
# through unchanged as "question" and also fed to the retriever, whose
# documents are formatted into "context"; both fill the prompt template,
# which is then sent to the chat model.
# NOTE(review): `vectorstore` is whatever an earlier cell last assigned --
# confirm it points at the intended store before trusting the answer.
retriever = vectorstore.as_retriever(search_type="similarity")
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | model_ollama
        )
# The chain's reply is shown as the cell output.
rag_chain.invoke("who will provide adequate legal protection against the manufature of devices which have purpose of bypassing a technological protection measure.")