In [1]:
# from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
# from langchain_core.pydantic_v1 import BaseModel, Field

from bs4 import BeautifulSoup
import requests
import os
import tempfile
import streamlit as st
import pandas as pd
import dotenv

In [None]:
from bs4 import BeautifulSoup
import requests

# Source document: EUR-Lex HTML rendering of CELEX 32001L0029.
url = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32001L0029'
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(url, timeout=30)

# Defined up front so the splitting step below never hits a NameError when
# the download fails or the expected container is missing.
all_text = []
full_text = ""

if response.status_code == 200:
    print('Internet connected')
    soup = BeautifulSoup(response.text, 'html.parser')

    main_content = soup.find('div', {'id': 'TexteOnly'})

    if main_content:
        paragraphs = main_content.find_all('p')
        # The first <p> of the container is deliberately skipped.
        all_text = [p.get_text(strip=True) for p in paragraphs[1:]]
        full_text = "\n".join(all_text)
    else:
        print("Main content not found.")
else:
    print(f"Failed to retrieve the webpage (HTTP {response.status_code}).")

# Split into ~1500-character chunks with 200 characters of overlap, preferring
# paragraph breaks, then line breaks, then spaces as split points.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
    length_function=len,
    separators=["\n\n", "\n", " "]
)

# Nothing to split when no text was collected.
chunks = text_splitter.split_text(full_text) if full_text else []
from langchain_ollama import OllamaEmbeddings, ChatOllama

# Ollama-backed models used throughout the notebook: one produces embeddings,
# the other is the chat model (temperature 0).
embeddings_ollama = OllamaEmbeddings(
    model="mxbai-embed-large",
)
model_ollama = ChatOllama(
    model="llama3.2",
    temperature=0,
)

In [None]:
import shutil
import os

def clear_vectorstore(vectorstore_path):
    """Delete the directory tree at ``vectorstore_path`` if it exists.

    Prints a message stating whether something was removed or nothing was
    found at the path. Returns ``None`` in both cases.
    """
    if not os.path.exists(vectorstore_path):
        print(f"No vectorstore found at '{vectorstore_path}'.")
        return

    # Recursively remove the directory together with everything inside it.
    shutil.rmtree(vectorstore_path)
    print(f"Vectorstore at '{vectorstore_path}' has been cleared.")

# Example usage: remove the "data/vectorstore" directory tree if it exists.
# The store path used further down ("data/vectorstore/chromadb") lives inside it.
clear_vectorstore("data/vectorstore")

In [None]:
def create_vectorstore(chunks, embedding_function, vectorstore_path):
    """Build a persisted Chroma vector store from de-duplicated text chunks.

    Args:
        chunks: Iterable of text chunks (strings) to index.
        embedding_function: Embeddings object passed to Chroma as ``embedding``.
        vectorstore_path: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The object returned by ``Chroma.from_texts``.
    """
    # dict.fromkeys drops exact duplicates while keeping first-seen order.
    # A set() of strings iterates in an order that changes between interpreter
    # runs (hash randomisation), which made the insertion order unreproducible.
    unique_chunks = list(dict.fromkeys(chunks))

    vectorstore = Chroma.from_texts(
        texts=unique_chunks,
        embedding=embedding_function,
        persist_directory=vectorstore_path
    )
    return vectorstore

In [None]:
def insert_if_not_exists(chunks, embedding_function, vectorstore_path):
    """Add to a persisted Chroma store only the chunks it does not already hold.

    A chunk counts as already present when its nearest neighbour in the store
    (similarity search by embedding, ``k=1``) has exactly the same text.
    Repeated chunks within ``chunks`` are considered only once.

    Args:
        chunks: Iterable of text chunks (strings) to insert.
        embedding_function: Embeddings object; must provide ``embed_query``.
        vectorstore_path: Directory used as Chroma's ``persist_directory``.

    Returns:
        The Chroma vector store that was opened (and possibly extended), so
        callers can keep using it.
    """
    try:
        vectorstore = Chroma(persist_directory=vectorstore_path, embedding_function=embedding_function)
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit
        # still propagate; the fallback itself is unchanged.
        vectorstore = Chroma.from_texts(texts=[], embedding=embedding_function, persist_directory=vectorstore_path)

    new_chunks = []
    seen = set()
    for chunk in chunks:
        # The store is not updated until after this loop, so duplicates inside
        # the batch must be filtered here or they would all be inserted.
        if chunk in seen:
            continue
        seen.add(chunk)

        embedding = embedding_function.embed_query(chunk)
        results = vectorstore.similarity_search_by_vector(embedding, k=1)
        if not results or results[0].page_content != chunk:
            new_chunks.append(chunk)

    if new_chunks:
        # The store embeds the texts with its own embedding function, so no
        # pre-computed vectors are passed (the previous ``embeddings=`` keyword
        # cost one extra embedding call per chunk).
        # NOTE(review): confirm ``add_texts`` ignores ``embeddings`` in the
        # installed langchain_chroma version.
        vectorstore.add_texts(texts=new_chunks)
        print(f"Inserted {len(new_chunks)} new chunks.")
    else:
        print("No new data to insert.")

    return vectorstore

# Example usage: insert the chunks produced by the splitter into the
# persistent store under "data/vectorstore/chromadb".
vectorstore = insert_if_not_exists(
    chunks=chunks,
    embedding_function=embeddings_ollama,
    vectorstore_path="data/vectorstore/chromadb",
)


In [3]:
# Prompt for the question-answering chain. It has two placeholders:
#   {context}  - the retrieved text the answer must be based on
#   {question} - the user's question
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""
# Persist directory of the Chroma store; read as a global by
# retrieve_data_from_vectorstore.
vectorstore_path='data/vectorstore/chromadb'

In [None]:
def retrieve_data_from_vectorstore(question):
    """Answer ``question`` with a retrieval-augmented chain over the Chroma store.

    Relies on the notebook globals ``vectorstore_path``, ``embeddings_ollama``,
    ``model_ollama`` and ``PROMPT_TEMPLATE``.

    Args:
        question: The question to retrieve context for and to answer.

    Returns:
        The value returned by invoking the chain (the chat model's reply).
        If any exception is raised, the error is printed and the string
        "An error occurred while retrieving data." is returned instead.
    """
    try:
        vectorstore = Chroma(persist_directory=vectorstore_path, embedding_function=embeddings_ollama)
        retriever = vectorstore.as_retriever(search_type="similarity")

        # Debug aid: show the chunks retrieved for the caller's question.
        # The query was previously hard-coded, so the printed chunks did not
        # correspond to ``question``. The chain below retrieves again itself.
        relevant_chunks = retriever.invoke(question)
        print (relevant_chunks)

        def format_docs(docs):
            """Join the documents' text with blank lines in between."""
            return "\n\n".join(doc.page_content for doc in docs)

        prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
        # The retriever fills {context}; the raw question passes through
        # unchanged into {question}.
        rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | model_ollama
        )
        response = rag_chain.invoke(question)
        return response
    except Exception as e:
        print(f"Error: {str(e)}")
        return "An error occurred while retrieving data."


In [None]:
# Keep and show the answer: a bare call that is not the last expression of a
# cell has its return value discarded.
answer = retrieve_data_from_vectorstore("who will provide adequate legal protection against the manufature of devices which have purpose of bypassing a technological protection measure.")
print(answer)

# Scratch store with two short texts to check that an exact-text query finds
# its own entry. The path gets its own name so the global `vectorstore_path`
# read by retrieve_data_from_vectorstore is not redirected to the test store.
embedding_function=embeddings_ollama
test_vectorstore_path='data/vectorstore_test'
vectorstore = Chroma.from_texts(texts=['con meo ngu ngoc','con ca biet di'], embedding=embedding_function, persist_directory=test_vectorstore_path)
embedding = embedding_function.embed_query('con meo ngu ngoc')
results = vectorstore.similarity_search_by_vector(embedding, k=1)
results

In [None]:
# Open the Chroma store persisted at "data/vectorstore_test" and rebind the
# notebook-level `vectorstore` to it; the following cells query this object.
vectorstore = Chroma(persist_directory="data/vectorstore_test", embedding_function=embeddings_ollama)

In [None]:
# Wrap the current `vectorstore` in a similarity retriever and fetch the
# chunks matching a sample query; the result is displayed as the cell output.
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke(
    "who will provide adequate legal protection against the manufature of devices, products or components or the provision of services which:"
)
relevant_chunks

In [None]:
# Join the text of the retrieved chunks, separated by a "---" rule.
context_text = "\n\n---\n\n".join(doc.page_content for doc in relevant_chunks)

# Fill the prompt template with that context and the sample question, then
# print the final prompt text.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(
    context=context_text,
    question="who will provide adequate legal protection against the manufature of devices which have purpose of bypassing a technological protection measure.",
)
print(prompt)

In [None]:
# Send the manually formatted prompt to the chat model; the reply is shown as
# the cell output.
model_ollama.invoke(prompt)

In [None]:
def format_docs(docs):
    """Return the ``page_content`` of every document, joined by blank lines."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# Retrieval-augmented chain at cell level: the retriever output is formatted
# into {context}, the raw question is passed through into {question}, and the
# filled prompt is sent to the chat model.
retriever = vectorstore.as_retriever(search_type="similarity")
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | model_ollama
)
rag_chain.invoke(
    "who will provide adequate legal protection against the manufature of devices which have purpose of bypassing a technological protection measure."
)

In [None]:
from bs4 import BeautifulSoup
import requests

def get_visible_text_from_html(url, timeout=30):
    """Download a web page and return its text content, one non-empty line per row.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The extracted text joined with newlines, or ``None`` when the page
        could not be retrieved (network error or a status code other than 200).
    """
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Same failure contract as a bad status code: report and return None.
        print(f"Failed to retrieve the webpage: {exc}")
        return None

    if response.status_code != 200:
        print("Failed to retrieve the webpage.")
        return None

    print("Internet connected")
    soup = BeautifulSoup(response.text, 'html.parser')
    # Drop elements whose content is code or styling rather than page text.
    for element in soup(['script', 'style']):
        element.decompose()
    text = soup.get_text(separator="\n", strip=True)
    # Discard blank lines and surrounding whitespace.
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    full_text = "\n".join(lines)

    print("Data collected")
    return full_text


# Example usage on the EUR-Lex HTML rendering of CELEX 32019L1024.
url = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32019L1024'
text_content = get_visible_text_from_html(url)

if text_content is None:
    # The helper returns None when the page could not be retrieved.
    print("No text collected.")
else:
    # Show the size and a short preview instead of dumping the whole document
    # into the cell output.
    print(f"Collected {len(text_content)} characters.")
    print("Final collected text:", text_content[:500])


In [2]:
from src.functions import retrieve_answer, clear_vectorstore,categories_urls

In [None]:
# Calls the `clear_vectorstore` imported from `src.functions` in the cell
# above (that import rebinds the name over any earlier notebook definition).
clear_vectorstore('data/vectorstore')

No vectorstore found at 'data/vectorstore'.
Processing category: Intellectual Property
Internet connected
Data collected
Inserted 45 new chunks.
URL https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32001L0029 has been logged as processed.
Internet connected
Data collected
Inserted 138 new chunks.
URL https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32019L0790 has been logged as processed.
Internet connected
Data collected
Inserted 27 new chunks.
URL https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:31996L0009 has been logged as processed.
Internet connected
Data collected
Inserted 55 new chunks.
URL https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32016L0943 has been logged as processed.
Processing category: Artificial Intelligence
Internet connected
Data collected
Inserted 504 new chunks.
URL https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=OJ:L_202401689 has been logged as processed.
Processing category: Digital Services


In [5]:

from langchain_ollama import OllamaEmbeddings, ChatOllama
def format_docs(docs):
    """Concatenate each document's ``page_content`` with a blank line between them."""
    separator = "\n\n"
    return separator.join([item.page_content for item in docs])
# Chat model and embedding model for this run.
# NOTE(review): the embedding model name is upper-case here but lower-case
# where the embeddings are first created - confirm both resolve to the same model.
model_ollama = ChatOllama(model="llama3.2", temperature=0)
embeddings_ollama = OllamaEmbeddings(model="MXBAI-EMBED-LARGE")

# Open the persisted store for the "Intellectual_Property" category and expose
# it as a similarity retriever.
vectorstore = Chroma(
    persist_directory='data/vectorstores/Intellectual_Property',
    embedding_function=embeddings_ollama,
)
retriever = vectorstore.as_retriever(search_type="similarity")

# Retrieved chunks fill {context}; the question passes through to {question}.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | model_ollama
)

# Ask something the legal texts are not expected to cover.
response = rag_chain.invoke('Who is Bob?')


In [6]:
# Display the chain's reply to 'Who is Bob?' as the cell output.
response

AIMessage(content="I don't know who Bob is, as there is no mention of a person named Bob in the provided context.", additional_kwargs={}, response_metadata={'model': 'llama3.2', 'created_at': '2024-11-10T18:19:50.656761Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 7209729167, 'load_duration': 20854042, 'prompt_eval_count': 802, 'prompt_eval_duration': 5436000000, 'eval_count': 24, 'eval_duration': 1752000000}, id='run-55108460-efc0-4543-b4cc-b145a564dcbc-0', usage_metadata={'input_tokens': 802, 'output_tokens': 24, 'total_tokens': 826})