In [1]:
import os
# Configure LangChain tracing for this session before any LangChain object
# is created: turn tracing on and name the project the runs are filed under.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": "Code_structure_research",
    }
)

In [2]:
# Move the working directory one level up so that the relative paths used
# below (config files, artifacts, .gitignore) resolve from the parent folder.
# NOTE(review): re-running this cell moves up one more level each time.
os.chdir("../")

In [3]:
%pwd

'c:\\Users\\Maza\\Desktop\\Pinecone_pipeline'

In [54]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class CodeStructureConfig:
    """
    Immutable bundle of settings consumed by CodeStructure.

    Instances are built by ConfigurationManager.get_code_structure_config,
    which copies the values straight from the loaded YAML files without
    converting them, so the path fields may hold plain strings at runtime
    (NOTE(review): confirm against read_yaml's return type).
    """
    # Artifact directory created by ConfigurationManager before use.
    root_dir: Path
    # JSON file the explored directory structure is saved to and re-read from.
    load_struct_dir: Path
    # Text file the combined set of ignored names/suffixes is written to.
    load_ignored_dir: Path
    # Location of the .gitignore file that is parsed for ignore patterns.
    gitignore_path: Path
    # Root directory of the code base to explore.
    code_dir: Path
    # Output file for the LLM-formatted structure (name keeps the original spelling).
    sructure_file: Path
    # Contents of the models YAML file; CodeStructure reads its `Llama3` entry.
    models: dict
    # Prompt text used to ask the model to format the directory structure.
    structure_prompt: str
    # Extra file/directory names to ignore, in addition to the .gitignore entries.
    files_to_ignore: set
    # Text file the list of discovered file paths is written to.
    save_files_to_read: Path
    

In [55]:
from vector_db_pipeline.constants import *
from vector_db_pipeline.utils.common import read_yaml, save_json, create_directories, set_to_txt,load_json
from vector_db_pipeline import logger
from dotenv import load_dotenv
load_dotenv()
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
# 

In [56]:
class ConfigurationManager:
    """
    Loads every YAML settings file once and hands out typed config objects.

    Each loaded file is kept as an attribute of the same name as in the
    table inside ``__init__`` (``config``, ``schema``, ``params``, ``models``,
    ``prompt_template``, ``files_to_ignore``).
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        models_filepath = MODELS_FILE_PATH,
        prompt_template = PROMPT_FILE_PATH,
        files_to_ignore = IGNORE_FILE_PATH):
        """
        Read all YAML files.

        Args:
            config_filepath: Path of the main configuration YAML.
            schema_filepath: Path of the schema YAML.
            params_filepath: Path of the parameters YAML.
            models_filepath: Path of the models YAML.
            prompt_template: Path of the prompt templates YAML.
            files_to_ignore: Path of the YAML listing extra files to ignore.
        """
        # (attribute name, file to load) pairs, read in this exact order.
        yaml_sources = (
            ("config", config_filepath),
            ("schema", schema_filepath),
            ("params", params_filepath),
            ("models", models_filepath),
            ("prompt_template", prompt_template),
            ("files_to_ignore", files_to_ignore),
        )
        for attribute_name, yaml_path in yaml_sources:
            setattr(self, attribute_name, read_yaml(yaml_path))

    def get_code_structure_config(self) -> CodeStructureConfig:
        """
        Assemble the settings needed by CodeStructure.

        The artifact root directory is created as a side effect.

        Returns:
            CodeStructureConfig: Values taken from the ``code_structure``
            section of the main config, the models file, the
            ``generate_file_structure`` prompt and the ignore list.
        """
        section = self.config.code_structure
        prompt_section = self.prompt_template.generate_file_structure

        # Make sure the output directory exists before anything is written to it.
        create_directories([section.root_dir])

        settings = {
            "root_dir": section.root_dir,
            "load_struct_dir": section.load_struct_dir,
            "load_ignored_dir": section.load_ignored_dir,
            "gitignore_path": section.gitignore_path,
            "code_dir": section.code_dir,
            "sructure_file": section.sructure_file,
            "save_files_to_read": section.save_files_to_read,
            "models": self.models,
            "structure_prompt": prompt_section.description,
            "files_to_ignore": self.files_to_ignore.IGNORE_FILES,
        }
        return CodeStructureConfig(**settings)
    

In [57]:
"""
A class for managing code structure and formatting.

Attributes:
    config (CodeStructureConfig): Configuration object for the CodeStructure class.

Functions:
    get_ignored_dirs: Reads ignored directories and extensions from a .gitignore file and from the configured ignore list.
    explore_directory: Explores directories and files, excluding ignored ones.
    build_directory_structure: Builds the directory structure recursively.
    get_formated_strcuture: Formats the directory structure using an AI model.
    files_in_app: Collects the file paths listed in the saved directory structure and writes them to a text file.
"""

class CodeStructure:
    """
    Explores a code base and records its directory layout.

    Typical call order: ``get_ignored_dirs`` -> ``build_directory_structure``
    -> (optionally) ``get_formated_strcuture`` -> ``files_in_app``.

    Attributes:
        config (CodeStructureConfig): Paths, prompt and model settings.
        ignored_subdirs (set): File/directory names skipped while exploring.
        ignored_extensions (set): Name suffixes skipped while exploring.
    """

    def __init__(self, config: "CodeStructureConfig"):
        """
        Initializes the CodeStructure object with the given configuration.

        Args:
            config (CodeStructureConfig): Configuration object for the CodeStructure class.
        """
        self.config = config
        # Start with empty ignore sets so that explore_directory works even
        # when get_ignored_dirs was not called or could not read .gitignore.
        self.ignored_subdirs = set()
        self.ignored_extensions = set()

    @staticmethod
    def _parse_gitignore(lines):
        """
        Split .gitignore lines into exact names and name suffixes.

        Args:
            lines (Iterable[str]): Raw lines of a .gitignore file.

        Returns:
            tuple[set, set]: ``(names, suffixes)``. Lines starting with ``*``
            become suffixes (``*.pyc`` -> ``.pyc``); everything else becomes
            a name with a trailing ``/``, ``/*`` or ``*`` removed.
        """
        names, suffixes = set(), set()
        for raw_line in lines:
            line = raw_line.strip()
            # Skip comments and empty lines
            if not line or line.startswith("#"):
                continue
            if line.startswith("*"):
                suffixes.add(line[1:-1] if line.endswith("/") else line[1:])
            elif line.endswith("/"):
                names.add(line[:-1])
            elif line.endswith("*"):
                # "build/*" -> "build" and "build*" -> "build"; blindly
                # dropping two characters would turn "build*" into "buil".
                names.add(line.rstrip("*").rstrip("/"))
            else:
                names.add(line)
        return names, suffixes

    def get_ignored_dirs(self):
        """
        Builds the sets of names and suffixes to skip during exploration.

        Entries come from the configured .gitignore file plus the extra names
        in ``config.files_to_ignore``. A missing or unreadable .gitignore is
        logged and treated as empty, so the configured extra names still
        apply. The combined set is also written to ``config.load_ignored_dir``;
        a failure to write is logged and does not discard the result.

        Returns:
            set: Union of the ignored names and ignored suffixes.
        """
        gitignore_path = self.config.gitignore_path
        ignored_fles_path = self.config.load_ignored_dir
        names, suffixes = set(), set()
        try:
            with open(gitignore_path, "r", encoding="utf-8") as file:
                names, suffixes = self._parse_gitignore(file)
            logger.info(f"Files to ignore obtained from: {gitignore_path} and eshalation_ignore")
        except FileNotFoundError:
            logger.error(f"Warning: {gitignore_path} not found.")
        except Exception as e:
            logger.error(f"Error while reading {gitignore_path}: {e}")

        # Add the extra names from the ignore list (tolerate an empty entry).
        names.update(self.config.files_to_ignore or ())
        self.ignored_subdirs = names
        self.ignored_extensions = suffixes

        all_ignored_files = names | suffixes
        logger.info("Set of files to ignore created")
        try:
            set_to_txt(Path(ignored_fles_path), all_ignored_files)
        except Exception as e:
            logger.error(f"Error while loading ignored files to {ignored_fles_path}: {e}")
        return all_ignored_files

    def explore_directory(self, directory):
        """
        Lists the sub-directories and files of one directory, excluding ignored ones.

        An entry is skipped when its name is in ``ignored_subdirs`` or ends
        with one of ``ignored_extensions``. Entries are returned in sorted
        order so that the result does not depend on the file system.

        Args:
            directory (str): Path to the directory to explore.

        Returns:
            dict: ``{'Directories': [...], 'Files': [...]}`` with entry names
            (not full paths).
        """
        # str.endswith needs a tuple; build it once instead of per entry.
        ignored_suffixes = tuple(self.ignored_extensions)
        directories = []
        files = []
        for item in sorted(os.listdir(directory)):
            if item in self.ignored_subdirs or item.endswith(ignored_suffixes):
                continue
            if os.path.isdir(os.path.join(directory, item)):
                directories.append(item)
            else:
                files.append(item)

        return {'Directories': directories, 'Files': files}

    def build_directory_structure(self):
        """
        Builds the directory structure recursively and saves it as JSON.

        Starting at ``config.code_dir``, every directory that is not ignored
        gets one entry keyed by its path (root included) whose value is the
        result of ``explore_directory``. The mapping is written to
        ``config.load_struct_dir``.

        Returns:
            dict: Mapping of directory path -> ``{'Directories', 'Files'}``.
        """
        directory_structure = {}
        root_directory = self.config.code_dir
        dir_structure_file = self.config.load_struct_dir

        def explore_and_build(directory):
            # `directory` already starts with the root, so it is used as the
            # path directly; joining it onto the root again would produce
            # "root/root/..." for a relative root other than ".".
            directory_structure[directory] = self.explore_directory(directory)

            for subdir in directory_structure[directory]['Directories']:
                explore_and_build(os.path.join(directory, subdir))

        explore_and_build(root_directory)

        save_json(Path(dir_structure_file), directory_structure)
        logger.info(f"Directory structure loaded to {dir_structure_file}")
        return directory_structure

    def get_formated_strcuture(self, directory_structure):
        """
        Formats the directory structure using an AI model.

        The structure is sent to the Groq chat model named by
        ``config.models.Llama3`` using ``config.structure_prompt`` and the
        answer is written to ``config.sructure_file``. Any failure is logged
        instead of raised.

        Args:
            directory_structure (dict): The directory structure to format.

        Returns:
            None
        """
        try:
            formated_structure_file = self.config.sructure_file
            model = self.config.models.Llama3
            logger.info(f"Working with model: {model}")
            chat = ChatGroq(temperature=0, model_name=model)
            prompt = ChatPromptTemplate.from_messages([("human", self.config.structure_prompt)])
            chain = prompt | chat
            formatted_structure = chain.invoke({"JSON_FILE": directory_structure})
            # Explicit UTF-8: the model's answer may contain characters that
            # the platform default encoding cannot represent.
            with open(formated_structure_file, "w", encoding="utf-8") as f:
                f.write(formatted_structure.content)
        except Exception as e:
            logger.error(f"Error while formating structure: {e}")

    @staticmethod
    def _collect_file_paths(schema):
        """
        Gather ``directory / file`` paths from a structure mapping.

        Args:
            schema (dict): Mapping of directory path -> contents dict. The
                ``"Files"`` list of each entry is used; a nested
                ``"Subdirectories"`` mapping of the same shape, if present,
                is processed recursively (the structure saved by
                ``build_directory_structure`` is flat and has no such key).

        Returns:
            list: ``pathlib.Path`` objects, in schema order.
        """
        file_paths = []
        for directory, contents in schema.items():
            directory_path = Path(directory)
            file_paths.extend(directory_path / name for name in contents.get("Files") or [])

            nested = contents.get("Subdirectories")
            if isinstance(nested, dict):
                file_paths.extend(CodeStructure._collect_file_paths(nested))
        return file_paths

    def files_in_app(self):
        """
        Retrieves file paths from the saved directory structure.

        Reads the JSON written by ``build_directory_structure``
        (``config.load_struct_dir``), builds the path of every listed file
        and writes the de-duplicated paths to ``config.save_files_to_read``.

        Returns:
            list[Path]: File paths extracted from the structure, or an empty
            list when the structure file cannot be loaded.
        """
        try:
            schema = load_json(Path(self.config.load_struct_dir))
        except Exception as e:
            logger.error(f"Error loading schema file: {e}")
            return []

        file_paths = self._collect_file_paths(schema)

        set_to_txt(Path(self.config.save_files_to_read), set(file_paths))
        logger.info(f"Files to read saved in : {self.config.save_files_to_read}")
        return file_paths
        

 



In [58]:
import time

In [59]:
# Run the whole code-structure stage once and log how long it took.
start = time.time()
# Load all YAML settings and derive the settings object for this stage.
config = ConfigurationManager()
code_structure_config = config.get_code_structure_config()
get_code_structure = CodeStructure(config=code_structure_config)
# Must run before exploring: it populates the ignore sets used by the walk.
all_ignored_files = get_code_structure.get_ignored_dirs()
# Walks the code directory and saves the structure as JSON.
directory_structure = get_code_structure.build_directory_structure()
# Optional LLM formatting step, currently disabled.
# get_code_structure.get_formated_strcuture(directory_structure)
# Reads the saved structure back and writes the list of files to read.
files_in_app = get_code_structure.files_in_app()
logger.info(f"Code Schema latency: {(time.time() - start):.4f} seconds")

[2024-05-22 13:41:21,966: INFO: common: yaml file: config\config.yaml loaded successfully:]
[2024-05-22 13:41:21,968: INFO: common: yaml file: schema.yaml loaded successfully:]
[2024-05-22 13:41:21,971: INFO: common: yaml file: params.yaml loaded successfully:]
[2024-05-22 13:41:21,973: INFO: common: yaml file: models.yaml loaded successfully:]
[2024-05-22 13:41:21,978: INFO: common: yaml file: prompt_template.yaml loaded successfully:]
[2024-05-22 13:41:21,980: INFO: common: yaml file: exhalation_ignore.yaml loaded successfully:]
[2024-05-22 13:41:21,981: INFO: common: Directory already exists: artifacts/app_schema:]
[2024-05-22 13:41:21,983: INFO: 2987747724: Files to ignore obtained from: .gitignore and eshalation_ignore:]
[2024-05-22 13:41:21,984: INFO: 2987747724: Set of files to ignore created:]
[2024-05-22 13:41:21,992: INFO: common: json file saved at: artifacts\app_schema\schema.json:]
[2024-05-22 13:41:21,993: INFO: 2987747724: Directory structure loaded to artifacts/app_sche