In [1]:
import os
import glob
from dotenv import load_dotenv
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain_community.embeddings import HuggingFaceEmbeddings  
from openai import OpenAI
from langchain_community.chat_models import ChatOllama


Load the data

In [96]:
# glob.glob returns matches in arbitrary, OS-dependent order; sort them so the
# load order (and everything derived from it) is the same on every run.
archivos = sorted(glob.glob("datos_vectores/*"))
archivos


['datos_vectores\\set_1.md',
 'datos_vectores\\set_2.md',
 'datos_vectores\\set_3.md']

In [97]:
# Read every file as UTF-8 text and tag each resulting document with the name
# of the file it came from, stored under metadata["doc_type"].
documents = []  # accumulated here; split into chunks in a later cell
for archivo in archivos:
    doc_type = os.path.basename(archivo)
    loader = TextLoader(archivo, encoding='utf-8')
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
    # Same order as appending one by one inside the loop above.
    documents.extend(folder_docs)




In [98]:
# Inspect the metadata of `doc`, the loop variable left bound by the loading
# loop above (i.e. the last document that loop visited).
doc.metadata

{'source': 'datos_vectores\\set_3.md', 'doc_type': 'set_3.md'}

Create the chunks

In [99]:
# Split the loaded documents into overlapping text chunks.
# NOTE(review): the recorded output shows chunks larger than chunk_size, so the
# splitter evidently keeps some oversized pieces whole rather than cutting them.
char_text_split = CharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50,
)
chunks = char_text_split.split_documents(documents)

Created a chunk of size 356, which is longer than the specified 200
Created a chunk of size 429, which is longer than the specified 200
Created a chunk of size 346, which is longer than the specified 200
Created a chunk of size 423, which is longer than the specified 200
Created a chunk of size 426, which is longer than the specified 200
Created a chunk of size 446, which is longer than the specified 200
Created a chunk of size 401, which is longer than the specified 200
Created a chunk of size 513, which is longer than the specified 200


Embedding

In [100]:
# HuggingFaceEmbeddings is already imported in the first cell, so it is not
# re-imported here.
# Embedding model used both to index the chunks and to reopen the vector store.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [101]:
# Name of the vector database (used as the Chroma persist directory)
db_name = "prova_vectores_db"

# Model identifier; not referenced by the cells visible in this section.
MODEL = "phi3:latest"

In [102]:
# If a persisted store already exists at db_name, open it and delete its
# collection before the store is created again further down.
if os.path.exists(db_name):
    Chroma(
        persist_directory=db_name,
        embedding_function=embeddings,
    ).delete_collection()

In [104]:
# Build the Chroma vector database from the chunks, embedding each one with
# `embeddings` and persisting the result under `db_name`.
vectorsDB = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)


In [105]:
# Report how many entries the collection holds (read through the wrapper's
# underscore-prefixed `_collection` attribute).
stored_count = vectorsDB._collection.count()
print(f"Vectorstore created with {stored_count} documents")


Vectorstore created with 17 documents


In [106]:
# How many dimensions does a stored vector have? Fetch a single entry with its
# embedding and measure the vector's length.
sample = vectorsDB._collection.get(limit=1, include=["embeddings"])
print("Dimentions:", len(sample["embeddings"][0]))

Dimentions: 384


In [107]:
# `doc` is still the loop variable left bound by the loading loop, so this
# shows the doc_type tag of the last document that loop visited.
doc.metadata['doc_type']

'set_3.md'

Visualization

In [108]:
# Prework: pull the stored vectors, texts and metadata back out of the
# collection so they can be plotted.
result = vectorsDB._collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: this rebinds `documents` (previously the list built by the loading
# cell) to the texts stored in the collection.
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]

# One colour per source file. A dict lookup with a fallback replaces the
# parallel-list `.index()` lookup, which raised ValueError for any file name
# that was not hard-coded here.
color_by_doc_type = {'set_3.md': 'blue', 'set_2.md': 'green', 'set_1.md': 'red'}
colors = [color_by_doc_type.get(t, 'gray') for t in doc_types]

In [109]:
# Number of rows in the embedding matrix, i.e. how many vectors will be plotted.
n_samples = vectors.shape[0]
print(f"Number of samples: {n_samples}")

Number of samples: 17


In [110]:
"""perplexity es la cantidad de numeros cercanos posibles a un dato N. Por eso debe ser menor que
el numero de muestras. 
"""
# Project the embedding vectors down to 2 components with t-SNE.
# perplexity must stay below the number of samples; random_state is fixed so
# the layout is reproducible between runs.
tsne = TSNE(n_components=2, random_state=42, perplexity=2)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot: one marker per stored chunk, coloured by source
# file, with the file name and the first 100 characters of text on hover.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# Axis titles of a 2D figure belong to layout.xaxis / layout.yaxis. The
# previous `scene=dict(...)` only configures 3D scenes, so the titles it set
# never appeared on this plot.
# update_layout returns the figure, so as the last expression of the cell it
# also renders the plot.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)


The model has been able to classify the document subjects correctly.