
# TrainModelService Notebook
This notebook outlines the process of training a model service, including:
- Installing required packages
- Importing necessary libraries
- Setting up environment variables
- Initializing an OpenAI client
- Processing PDF documents for embeddings
- Interacting with Google Cloud Storage

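Please ensure the following environment variables are set before running (they can be defined in a `.env` file in the working directory, which the notebook loads with `python-dotenv`):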
- `GOOGLE_APPLICATION_CREDENTIALS`: path to your Google Cloud credentials JSON file
- `OPENAI_API_KEY`: your OpenAI API key

**Note**: Never upload sensitive credentials to a public repository. Always use environment variables and `.gitignore` to handle sensitive information.


In [34]:
# !pip install PyPDF2
# !pip install langchain
# !pip install openai
# !pip install tiktoken
# !pip install faiss-cpu

In [35]:
# Import the libraries

from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [36]:
# !pip install python-dotenv

import os
from dotenv import load_dotenv

# Load the environment variables from the .env file in the current directory
load_dotenv()

In [37]:
from openai import OpenAI

# Create an OpenAI API client.
# NOTE(review): `client` is not referenced again in the visible cells —
# confirm whether it is still needed.
client = OpenAI()
# With no arguments, the key is read from os.environ.get("OPENAI_API_KEY").
# If the key is stored under a different environment variable name, pass it explicitly:
# client = OpenAI(
#   api_key=os.environ.get("CUSTOM_ENV_NAME"),
# )

In [38]:
class EmbeddingPDF:
    """Build a FAISS vector index from the text of a PDF and save it to disk.

    The PDF text is read with ``PdfReader``, split into chunks with
    ``CharacterTextSplitter``, embedded with ``OpenAIEmbeddings`` and stored
    in a ``FAISS`` index that is written out with ``save_local``.
    """

    def __init__(self, pdf):
        """Remember the PDF to process.

        Args:
            pdf: The value handed to ``PdfReader`` (the notebook passes a
                file path). ``None`` turns ``getEmbedding`` into a no-op.
        """
        self.pdf = pdf

    def getEmbedding(
        self,
        index_path="vectorStore/faiss_index",
        chunk_size=1000,
    ):
        """Extract, split and embed the PDF text, then save the FAISS index.

        Args:
            index_path: Directory passed to ``save_local``. Defaults to the
                location the rest of the notebook uploads from.
            chunk_size: Maximum chunk length (measured with ``len``) given to
                the text splitter.

        Returns:
            None. The result is the index written under ``index_path``.

        Raises:
            ValueError: If no text could be extracted from the PDF, so there
                is nothing to embed.
        """
        # Guard clause: without a PDF there is nothing to do.
        if self.pdf is None:
            return

        reader = PdfReader(self.pdf)
        # Join once instead of repeated "+="; "or ''" keeps a page that
        # yields no text (e.g. an image-only page) from breaking the join.
        text = "".join(page.extract_text() or "" for page in reader.pages)

        # Configure how the document is split into chunks.
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=chunk_size,
            length_function=len,
        )
        splits = text_splitter.split_text(text)

        # Fail with a clear message instead of letting an empty chunk list
        # reach the embedding / index-building step.
        if not splits:
            raise ValueError("No text could be extracted from the PDF")

        embeddings = OpenAIEmbeddings()
        knowledge_base = FAISS.from_texts(splits, embeddings)
        knowledge_base.save_local(index_path)

In [39]:
# Path of the PDF to index; the value below is a placeholder to replace before running.
pdf_file = "ruta-a-tu-archivo-pdf"

In [40]:
# Build the FAISS index for the PDF; getEmbedding() saves it under
# "vectorStore/faiss_index", which the upload cell below reads from.
baseConocimiento = EmbeddingPDF(pdf_file)
baseConocimiento.getEmbedding()

In [41]:
# !pip install google-cloud
# !pip install google-cloud-storage

In [42]:
from google.cloud import storage

class VectorManagerGoogleCloud:
    """Upload locally saved vector-store files to a Google Cloud Storage bucket."""

    def __init__(self, project_id="tu-project-id", bucket_name="tu-bucket-id"):
        """Store the target project and bucket; no connection is made yet.

        Args:
            project_id: Google Cloud project id. Only stored; the methods of
                this class do not read it.
            bucket_name: Name of the bucket that all operations target.
        """
        self.project_id = project_id
        # None (not "") marks "not connected" until connect_gcp() is called.
        self.storage_client = None
        self.folder_name = ""
        self.bucket_name = bucket_name

    def connect_gcp(self, service_account_json):
        """Create the storage client from a service-account JSON key file.

        Args:
            service_account_json: Path to the service-account key file.
        """
        self.storage_client = storage.Client.from_service_account_json(
            service_account_json
        )

    def _get_bucket(self):
        """Return the configured bucket, failing clearly if not connected."""
        if self.storage_client is None:
            raise RuntimeError("Not connected: call connect_gcp() first")
        return self.storage_client.get_bucket(self.bucket_name)

    def create_folder(self, folder_name):
        """Create ``folder_name`` in the bucket via a placeholder object.

        Uploads an object named ``<folder_name>/READMI.txt`` whose content
        is the string ``"vectores"``.

        Raises:
            RuntimeError: If ``connect_gcp`` has not been called.
        """
        placeholder = self._get_bucket().blob(folder_name + "/READMI.txt")
        placeholder.upload_from_string("vectores")

    def upload_file(self, blob_name, file_path, folder="PdfVectorStore/faiss_index"):
        """Upload a local file to ``<folder>/<blob_name>`` in the bucket.

        Args:
            blob_name: Object name inside ``folder``.
            file_path: Path of the local file to upload.
            folder: Destination prefix inside the bucket.

        Returns:
            True if the upload succeeded, False otherwise. Failures are
            logged with their traceback rather than raised.
        """
        # Local import keeps this class self-contained within its cell.
        import logging

        try:
            blob = self._get_bucket().blob(f"{folder}/{blob_name}")
            blob.upload_from_filename(file_path)
        except Exception:
            # "except Exception" (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate; the log keeps the cause visible.
            logging.getLogger(__name__).exception(
                "Upload of %s to bucket %s failed", file_path, self.bucket_name
            )
            return False
        return True

In [43]:
# Path to the Google Cloud credentials JSON file; this lookup raises KeyError
# if GOOGLE_APPLICATION_CREDENTIALS is not set.
SERVICE_ACCOUNT_CREDENTIALS = os.environ["GOOGLE_APPLICATION_CREDENTIALS"]

In [44]:
# Create the storage manager and connect it using the service-account key file.
vmgc = VectorManagerGoogleCloud()
vmgc.connect_gcp(SERVICE_ACCOUNT_CREDENTIALS)

In [45]:
# Create the "PdfVectorStore/faiss_index" folder in the bucket; create_folder()
# does this by uploading a placeholder object under that prefix.
vmgc.create_folder("PdfVectorStore/faiss_index")

In [46]:
# Upload the two index files saved under vectorStore/faiss_index.
# upload_file() reports failure by returning False instead of raising, so
# check each result rather than discarding it; otherwise a failed upload
# would go unnoticed.
for _file_name in ("index.faiss", "index.pkl"):
    if not vmgc.upload_file(_file_name, "vectorStore/faiss_index/" + _file_name):
        raise RuntimeError("Upload to Google Cloud Storage failed for " + _file_name)