### Imports

In [1]:
import os
from langchain.document_loaders import PyPDFLoader

In [2]:
from langchain_community.vectorstores.faiss import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain_ollama import OllamaEmbeddings

modelEmb = 'jina/jina-embeddings-v2-base-es'
embeddings = OllamaEmbeddings(model=modelEmb, base_url='http://localhost:11435') 

 ### PDFs

In [4]:
carpeta_pdfs = "../datos/documentos"

pages = []	

for archivo in os.listdir(carpeta_pdfs):
    if archivo.endswith(".pdf"):  
        ruta_pdf = os.path.join(carpeta_pdfs, archivo)
        loader = PyPDFLoader(ruta_pdf)
        documentos = loader.load_and_split()
        pages += documentos

In [5]:
pages

[Document(metadata={'producer': 'Antenna House PDF Output Library 6.4.928 (Windows (x64))', 'creator': 'AH XSL Formatter V6.4 R1 for Windows (x64) : 6.4.2.26942 (2016/12/07 15:30JST)', 'creationdate': '2018-05-08T10:16:09-08:00', 'author': 'OSIsoft, LLC.', 'moddate': '2018-05-08T13:53:45-04:00', 'subject': 'PI System Explorer', 'title': 'PI System Explorer User Guide', 'trapped': '/False', 'source': '../datos/documentos\\PI-System-Explorer-2018-User-Guide_EN (1).pdf', 'total_pages': 578, 'page': 0, 'page_label': 'i'}, page_content='PI System Explorer\nUser Guide\nFor PI Asset Framework 2.10 included with PI Server 2018'),
 Document(metadata={'producer': 'Antenna House PDF Output Library 6.4.928 (Windows (x64))', 'creator': 'AH XSL Formatter V6.4 R1 for Windows (x64) : 6.4.2.26942 (2016/12/07 15:30JST)', 'creationdate': '2018-05-08T10:16:09-08:00', 'author': 'OSIsoft, LLC.', 'moddate': '2018-05-08T13:53:45-04:00', 'subject': 'PI System Explorer', 'title': 'PI System Explorer User Guide'

In [6]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

pdfs = text_splitter.split_documents(pages)

In [7]:
pdfs

[Document(metadata={'producer': 'Antenna House PDF Output Library 6.4.928 (Windows (x64))', 'creator': 'AH XSL Formatter V6.4 R1 for Windows (x64) : 6.4.2.26942 (2016/12/07 15:30JST)', 'creationdate': '2018-05-08T10:16:09-08:00', 'author': 'OSIsoft, LLC.', 'moddate': '2018-05-08T13:53:45-04:00', 'subject': 'PI System Explorer', 'title': 'PI System Explorer User Guide', 'trapped': '/False', 'source': '../datos/documentos\\PI-System-Explorer-2018-User-Guide_EN (1).pdf', 'total_pages': 578, 'page': 0, 'page_label': 'i'}, page_content='PI System Explorer\nUser Guide\nFor PI Asset Framework 2.10 included with PI Server 2018'),
 Document(metadata={'producer': 'Antenna House PDF Output Library 6.4.928 (Windows (x64))', 'creator': 'AH XSL Formatter V6.4 R1 for Windows (x64) : 6.4.2.26942 (2016/12/07 15:30JST)', 'creationdate': '2018-05-08T10:16:09-08:00', 'author': 'OSIsoft, LLC.', 'moddate': '2018-05-08T13:53:45-04:00', 'subject': 'PI System Explorer', 'title': 'PI System Explorer User Guide'

In [8]:
vector_store = FAISS.from_documents(pdfs, embeddings)

In [9]:
vector_store.save_local("../datos/vdb/ollama/pdfs")

### Tickets

In [10]:
import pandas as pd
from langchain.schema import Document
df = pd.read_csv("../datos/clean_data/datos_censurados_agrupados.csv")

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2748 entries, 0 to 2747
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   titulo       2748 non-null   object
 1   descripcion  2748 non-null   object
 2   solucion     1130 non-null   object
 3   categories   2748 non-null   object
dtypes: object(4)
memory usage: 86.0+ KB


In [12]:
tickets = [
    Document(
        page_content=f"Título: {row['titulo']}\nDescripción: {row['descripcion']}\nSolución: {row['solucion']}",
        metadata={"´Categoria": row['categories']}
    )
    for idx, row in df.iterrows()
]

In [18]:
tickets

<langchain_community.vectorstores.faiss.FAISS at 0x1e5d01e18b0>

In [13]:
vector_store = FAISS.from_documents(tickets, embeddings)

In [14]:
vector_store.save_local("../datos/vdb/ollama/tickets")

### Merge

In [15]:
tickets = FAISS.load_local("../datos/vdb/ollama/tickets", embeddings, allow_dangerous_deserialization=True)
pdfs = FAISS.load_local("../datos/vdb/ollama/pdfs", embeddings, allow_dangerous_deserialization=True)

pdfs.merge_from(tickets)

In [16]:
pdfs.save_local("../datos/vdb/ollama/knowledgeBase")

In [None]:
pdfs

<langchain_community.vectorstores.faiss.FAISS at 0x1e5cd92e240>