In [1]:
import pandas as pd
import os
import json
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from vector_db_pipeline.utils.common import list_files_in_directory, get_json
from pathlib import Path
from typing import List

In [2]:
%pwd

'c:\\Users\\Maza\\Desktop\\Pinecone_pipeline\\research'

In [3]:
# The notebook starts inside the "research" folder (see the %pwd output above);
# step up one level, presumably so the project's relative paths resolve from
# the repository root -- TODO confirm. The guard keeps the cell idempotent:
# re-running it no longer climbs one directory higher each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [36]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Read-only bundle of the settings used by the data-ingestion stage."""

    # Directory that the configuration manager creates before ingestion runs.
    root_dir: Path
    # Location passed to the directory-listing helper to find the input JSON files.
    local_data_file: Path
    # Destination the processed records are written to as a JSON file.
    load_dir: Path
    # Text-splitter parameters; read as SEPARATOR, CHUNK_SIZE and CHUNK_OVERLAP.
    text_spliter_config: dict
    # Namespace string used as the prefix of every generated vector id.
    namespace_idx: str
    


In [37]:
from vector_db_pipeline.constants import *
from vector_db_pipeline.utils.common import read_yaml, create_directories, save_json
from vector_db_pipeline import logger

In [38]:
class ConfigurationManager:
    """Loads the project's YAML files and builds stage configuration objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """
        Read the configuration, schema and parameter files.

        Args:
            config_filepath: Location of the configuration file. Defaults to CONFIG_FILE_PATH.
            schema_filepath: Location of the schema file. Defaults to SCHEMA_FILE_PATH.
            params_filepath: Location of the parameters file. Defaults to PARAMS_FILE_PATH.
        """
        # Each file is parsed with read_yaml and stored under the matching attribute.
        sources = (
            ("config", config_filepath),
            ("schema", schema_filepath),
            ("params", params_filepath),
        )
        for attribute, filepath in sources:
            setattr(self, attribute, read_yaml(filepath))

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Build the settings object for the data-ingestion stage.

        As a side effect, the stage's root directory is created through
        ``create_directories``.

        Returns:
            DataIngestionConfig: Paths from the ``data_ingestion`` section of the
            configuration plus the text-splitter parameters and index namespace
            taken from the parameters file.
        """
        ingestion_section = self.config.data_ingestion
        parameters = self.params
        splitter_params = parameters.TEXT_SPLITER
        index_namespace = parameters.INDEX_INFO.NAMESPACE

        # All values are looked up first; the directory is created only afterwards.
        create_directories([ingestion_section.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            local_data_file=ingestion_section.local_data_file,
            load_dir=ingestion_section.load_dir,
            text_spliter_config=splitter_params,
            namespace_idx=index_namespace,
        )

    


In [39]:
class TextProcessor:
    """Splits document text into chunks, embeds each chunk and saves the records as JSON."""

    def __init__(self, config: "DataIngestionConfig"):
        """
        Initializes TextProcessor with the provided data ingestion configuration.

        Args:
            config (DataIngestionConfig): Configuration object providing the text
                splitter settings, the id namespace and the output location.
        """
        self.config = config

    @staticmethod
    def _unescape_separator(raw: str) -> str:
        """
        Converts escape sequences written literally in the configuration
        (a backslash followed by ``n``, ``t``, ...) into the characters they denote.

        Encoding with latin-1 + ``backslashreplace`` before decoding keeps
        non-ASCII separators intact; a plain UTF-8 encode followed by a
        ``unicode_escape`` decode would reinterpret their bytes and corrupt them.
        """
        return raw.encode('latin-1', 'backslashreplace').decode('unicode_escape')

    def get_text_chunks(self, text: str) -> List[str]:
        """
        Splits input text into chunks based on configuration settings.

        Args:
            text (str): Input text to be split into chunks.

        Returns:
            chunks (List[str]): List of text chunks.
        """
        text_splitter_config = self.config.text_spliter_config
        text_splitter = CharacterTextSplitter(
            separator=self._unescape_separator(text_splitter_config.SEPARATOR),
            chunk_size=text_splitter_config.CHUNK_SIZE,
            chunk_overlap=text_splitter_config.CHUNK_OVERLAP,
            length_function=len
        )
        return text_splitter.split_text(text)

    def split_text(self, data: List[dict]) -> List[dict]:
        """
        Splits the text of each document into chunks and embeds each chunk.

        Documents whose ``'text'`` entry is missing or empty are skipped. The
        input dictionaries are only read, never modified, so the same ``data``
        can be processed again.

        Args:
            data (List[dict]): Documents with the keys ``'text'``,
                ``'date_scraped_timestamp'``, ``'host'``, ``'url'`` and
                ``'page_title'``.

        Returns:
            splited_text_data (List[dict]): One record per chunk holding its id
            (``<namespace>#<running index>``), embedding values, chunk text and
            the document metadata converted to strings.

        Raises:
            KeyError: If a document that has text lacks one of the metadata keys.
        """
        namespace = self.config.namespace_idx
        embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")
        splited_text_data = []
        idx = 0  # running id counter shared by all documents
        for d in data:
            text = d.get('text')
            if not text:
                continue

            # Read the metadata without popping it so the caller's dicts stay intact.
            timestamp = d['date_scraped_timestamp']
            host = d['host']
            url = d['url']
            page_title = d['page_title']

            text_chunks = self.get_text_chunks(text)
            embeded_text = embed_model.embed_documents(text_chunks)
            for i, text_chunk in enumerate(text_chunks):
                splited_text_data.append({
                    'id': namespace + '#' + str(idx),
                    'values': embeded_text[i],
                    'text': text_chunk,
                    'host': str(host),
                    'page_title': str(page_title),
                    'chunk': str(i),  # position of the chunk inside its document
                    'url': str(url),
                    'timestamp': str(timestamp),
                })
                idx += 1
        logger.info(f"Text processed and chunked. Total chunks: {len(splited_text_data)}")
        return splited_text_data

    def load_data_json(self, splited_text_data: List[dict]):
        """
        Saves the processed data into a json file.

        The file is written to ``config.load_dir``; its parent directory is
        created first if it does not exist yet.

        Args:
            splited_text_data (List[dict]): List of dictionaries containing processed text data.
        """
        json_file_path = Path(self.config.load_dir)
        # to_json refuses to write into a non-existent directory.
        json_file_path.parent.mkdir(parents=True, exist_ok=True)
        df = pd.DataFrame(splited_text_data)
        df.to_json(json_file_path, orient='records')

        logger.info(f"Data processed and saved into JSON file in {json_file_path}")





In [1]:
# Number of loaded documents sent through chunking and embedding in this run.
MAX_DOCUMENTS = 20

# Build the stage configuration, load the input JSON files, embed the first
# MAX_DOCUMENTS documents and save the resulting records. Any exception
# propagates unchanged with its original traceback.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
json_files = list_files_in_directory(Path(data_ingestion_config.local_data_file))
data = get_json(json_files)
text_processor = TextProcessor(config=data_ingestion_config)
splited_text_data = text_processor.split_text(data[:MAX_DOCUMENTS])
text_processor.load_data_json(splited_text_data)
