Your Full Name: 
    
    Arindam Choudhury

    Nutan Mandale
    
    Humberto Gonzalez Granda

Your Uplevel Email Address:
    
    arindam.choudhury.email@gmail.com
    
    nutan.mandale@gmail.com
    
    HumbertoGonzalezGranda@gmail.com

Name of the Problem Statement of Submission:
    
    ShopTalk (Project-6)

In [1]:
import os
import sys
import glob
import shutil
import pandas as pd
from dotenv import load_dotenv
from src.exception import CustomException
from src.logger import logging
from src.utils import upload_directory_to_s3
from dataclasses import dataclass,field
import google.generativeai as genai
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import JSONLoader
from langchain_community.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
@dataclass
class DataEmbeddingConfig:
    """Settings for the data-embedding step, resolved from environment variables.

    The class body runs once (when the class is defined): it loads the ``.env``
    file, configures the Google Generative AI client and reads bucket/folder
    names from the environment. ``__post_init__`` then selects the embedding
    backend for each instance.

    Attributes:
        model: Embedding backend to use: ``"google"``, ``"openai"`` or
            ``"finetune"``. ``None`` leaves the backend-specific attributes
            set to ``None``.
        vector_db_s3: Name of the FAISS index folder used as the S3 destination.
        vector_db_local: Local folder (inside ``WORKING_DIR``) where the FAISS
            index is saved.
        embeddings: Embedding object for the selected backend.

    Raises:
        ValueError: If ``model`` is a value other than the supported backends.
    """

    model: str = None

    # Executed at class-definition time so the os.getenv() defaults below can
    # see the values declared in the .env file.
    load_dotenv()
    genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
    # os.environ only accepts str values; assigning None would raise TypeError
    # and break the class definition even when OpenAI is not the chosen backend.
    if os.getenv("OPENAI_API_KEY") is not None:
        os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

    ABO_BUCKET_NAME:        str = os.getenv("ABO_BUCKET_NAME")
    YOUR_S3_BUCKET_NAME:    str = os.getenv("YOUR_S3_BUCKET_NAME")
    ARTIFACTS_FOLDER:       str = os.getenv("ARTIFACTS_FOLDER")
    WORKING_DIR:            str = os.getenv("WORKING_DIR")
    EDA_FOLDER_NAME:        str = os.getenv("EDA_FOLDER_NAME")

    json_file_s3:     str = f"s3://{YOUR_S3_BUCKET_NAME}/{EDA_FOLDER_NAME}/dataset.json" # Used EDA file for embedding
    json_file_local:  str = f"{WORKING_DIR}/dataset_copy.json"

    # Filled in by __post_init__ according to `model`.
    vector_db_s3: str = field(init=False, default=None)
    vector_db_local: str = field(init=False, default=None)
    embeddings: object = field(init=False, default=None)

    # Kept at class-definition time for backward compatibility; guarded because
    # os.path.exists(None) / os.makedirs(None) raise TypeError when the
    # WORKING_DIR variable is not set.
    if WORKING_DIR:
        os.makedirs(WORKING_DIR, exist_ok=True)

    def _vector_db_path(self, folder_name):
        """Return the local path of the FAISS folder inside ``WORKING_DIR``."""
        # os.path.join inserts the separator the old f-string concatenation
        # omitted, so the index is stored inside WORKING_DIR (and cleaned up
        # with it) instead of in a sibling directory.
        return os.path.join(self.WORKING_DIR or "", folder_name)

    def __post_init__(self):
        """Ensure the working directory exists and set up the chosen backend."""
        # The embedding run deletes WORKING_DIR when it finishes, so it has to
        # be re-created for every new configuration, not just once per process.
        if self.WORKING_DIR:
            os.makedirs(self.WORKING_DIR, exist_ok=True)

        if self.model is None:
            # No backend requested: only the environment-derived settings are usable.
            return

        if self.model == "google":
            logging.info("Data Embedding - Embedding Model selected is Google.")
            self.vector_db_s3 = "GOOGLE_FAISS_DB"
            self.vector_db_local = self._vector_db_path("GOOGLE_FAISS_DB")
            self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        elif self.model == "openai":
            logging.info("Data Embedding - Embedding Model selected is OpenAI.")
            self.vector_db_s3 = "OPENAI_FAISS_DB"
            self.vector_db_local = self._vector_db_path("OPENAI_FAISS_DB")
            self.embeddings = OpenAIEmbeddings()
        elif self.model == "finetune":
            logging.info("Data Embedding - Embedding Model selected is HuggingFace Finetune model.")
            self.vector_db_s3 = "FINETUNE_FAISS_DB"
            self.vector_db_local = self._vector_db_path("FINETUNE_FAISS_DB")
            # NOTE(review): the model name is an s3:// URI; confirm that
            # HuggingFaceEmbeddings can resolve it, or download the model to a
            # local path first.
            FINETUNE_MODEL_PATH: str = f"s3://{self.YOUR_S3_BUCKET_NAME}/FINETUNE/finetuned_model"
            self.embeddings = HuggingFaceEmbeddings(model_name=FINETUNE_MODEL_PATH)
        else:
            # Fail fast instead of leaving `embeddings` unset and crashing later.
            raise ValueError(
                f"Unsupported embedding model {self.model!r}; "
                "expected 'google', 'openai' or 'finetune'."
            )

In [4]:
class DataEmbedding:
    """Build a FAISS vector store from the EDA dataset and upload it to S3."""

    def __init__(self, model):
        """Create the embedding configuration for the given backend name.

        Args:
            model: Backend identifier passed straight to ``DataEmbeddingConfig``.
        """
        self.ingestion_config = DataEmbeddingConfig(model)

    def _remove_working_dir(self):
        """Delete the local working directory if it exists."""
        working_dir = self.ingestion_config.WORKING_DIR
        # Guard against an unset WORKING_DIR: os.path.exists(None) raises TypeError.
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)
            print(f"{working_dir} folder removed successfully.")

    def initiate_data_embedding(self):
        """Run the embedding step end to end.

        Reads the dataset JSON from S3, writes a local copy, loads it as
        documents, builds a FAISS index with the configured embeddings, saves
        the index locally and uploads it to S3. The working directory is
        removed afterwards, whether or not the run succeeded.

        Raises:
            CustomException: Wraps any exception raised during the run.
        """
        logging.info("Data Embedding - started")
        config = self.ingestion_config
        try:
            try:
                dataset = pd.read_json(config.json_file_s3)
                dataset.to_json(config.json_file_local, orient='records')
                logging.info("Data Embedding - Json file loaded from AWS S3 to dataframe")

                loader = JSONLoader(file_path=config.json_file_local, jq_schema=".[]", text_content=False)
                documents = loader.load()
                if not documents:
                    # An empty dataset would otherwise fail inside FAISS with an opaque error.
                    raise ValueError(f"No documents found in {config.json_file_s3}")

                vectors_db = FAISS.from_documents(documents, config.embeddings)
                logging.info("Data Embedding - FAISS DB created successfully")

                vectors_db.save_local(config.vector_db_local)
                logging.info("Data Embedding - FAISS DB Loading to AWS S3 started")
                upload_directory_to_s3(config.YOUR_S3_BUCKET_NAME, config.vector_db_s3, config.vector_db_local)
            finally:
                # Clean up even when a step above failed, so a broken run does
                # not leave the dataset copy behind.
                self._remove_working_dir()

            logging.info("Data Embedding - completed")

        except Exception as e:
            # Chain explicitly so the original traceback stays attached.
            raise CustomException(e, sys) from e

In [5]:
if __name__ == "__main__":
    # Embedding backend for this run; the supported values are
    # "google", "openai" and "finetune".
    model = "google"

    # Build the FAISS index with the chosen backend and push it to S3.
    obj = DataEmbedding(model)
    data = obj.initiate_data_embedding()

Uploaded GOOGLE_FAISS_DB/index.faiss to S3 bucket shopchat-s3-buckect
Uploaded GOOGLE_FAISS_DB/index.pkl to S3 bucket shopchat-s3-buckect
Download folder removed successfully.
