In [2]:
import os

In [3]:
# Move the working directory one level up so that the relative paths used
# later in the notebook (e.g. 'artifacts/...') resolve from there.
# NOTE: not idempotent - every re-run of this cell moves up another level.
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\Maza\\Desktop\\Pinecone_pipeline'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ConfigFileChanges:
    """
    Immutable settings for tracking file changes in a directory.

    Attributes:
        dir_to_monitor: Directory whose files are listed and hashed.
        state_file: JSON file holding the saved snapshot of the directory.
        updated_files: JSON file that receives the added/deleted/changed report.

    NOTE(review): the fields are annotated as Path, but dataclasses do not
    enforce annotations - if the caller passes plain strings from the YAML
    config they stay strings at runtime. Confirm against the caller.
    """

    dir_to_monitor: Path
    state_file: Path
    updated_files: Path



In [6]:
from vector_db_pipeline.constants import * 
from vector_db_pipeline.utils.common import read_yaml, create_directories, save_json, read_txt_to_list
from vector_db_pipeline import logger

In [7]:
class ConfigurationManager:
    """Reads the project's YAML files and builds config objects from them."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """
        Load the three YAML files and keep the parsed results on the instance.

        Args:
            config_filepath (str): Location of the configuration file. Defaults to CONFIG_FILE_PATH.
            schema_filepath (str): Location of the schema file. Defaults to SCHEMA_FILE_PATH.
            params_filepath (str): Location of the parameters file. Defaults to PARAMS_FILE_PATH.
        """
        self.config = read_yaml(config_filepath)
        self.schema = read_yaml(schema_filepath)
        self.params = read_yaml(params_filepath)

    def get_file_changes_config(self) -> ConfigFileChanges:
        """
        Build the file-change tracking settings from the `file_changes` section.

        Passes the section's `state_root` to `create_directories` before the
        settings object is built.

        Returns:
            ConfigFileChanges: Carries `dir_to_monitor`, `state_file` and
            `updated_files` exactly as they appear in the loaded config
            (no type conversion is applied here).
        """
        section = self.config.file_changes

        create_directories([section.state_root])

        return ConfigFileChanges(
            dir_to_monitor=section.dir_to_monitor,
            state_file=section.state_file,
            updated_files=section.updated_files,
        )

    


In [8]:
import os
import json
import time
from hashlib import md5
from pathlib import Path



In [37]:
# Ignore patterns, one per entry of the ignore file, normalised with
# os.path.normpath so separators match the paths produced by os.walk.
# NOTE(review): os.path.normpath('') returns '.', so a blank entry would turn
# into '.' and match almost every path - confirm read_txt_to_list drops blanks.
exclusions = {
    str(os.path.normpath(entry))
    for entry in read_txt_to_list(Path('artifacts/app_schema/ignored_files.txt'))
}

def should_exclude(path, patterns=None):
    """
    Tell whether `path` matches any exclusion pattern.

    A pattern matches when it occurs anywhere inside `path` as a plain
    substring. A separate suffix check is unnecessary: any pattern that
    `path` ends with is also a substring of it.

    Args:
        path (str): File or directory path to test.
        patterns (iterable of str, optional): Patterns to test against.
            Defaults to the module-level `exclusions` set.

    Returns:
        bool: True if at least one pattern occurs in `path`, else False.
    """
    if patterns is None:
        patterns = exclusions
    return any(pattern in path for pattern in patterns)

def list_files(directory):
    """
    Collect the paths of all non-excluded files beneath `directory`.

    Excluded sub-directories are pruned from the walk, so nothing inside
    them is visited.

    Args:
        directory: Root directory to walk.

    Returns:
        list: File paths (root joined with file name) that `should_exclude`
        does not reject, in the order `os.walk` yields them.
    """
    kept = []
    for current_dir, subdirs, filenames in os.walk(directory):
        # Assigning through the slice edits the list os.walk is iterating
        # from, which is what stops it descending into excluded folders.
        subdirs[:] = [
            name for name in subdirs
            if not should_exclude(os.path.join(current_dir, name))
        ]
        candidates = (os.path.join(current_dir, name) for name in filenames)
        kept.extend(found for found in candidates if not should_exclude(found))
    return kept

In [42]:

class FilesState:
    """
    Detects added, deleted and changed files in a directory.

    A snapshot maps each file path returned by `list_files` to the MD5 hex
    digest of its content. Snapshots are stored as JSON in `state_file`, and
    the difference between two snapshots is written to `config.updated_files`.
    MD5 is used here only as a content fingerprint, not for security.
    """

    def __init__(self, config: "ConfigFileChanges"):
        """
        Args:
            config: Settings object providing `dir_to_monitor`, `state_file`
                and `updated_files`.
        """
        self.config = config
        self.dir_to_monitor = config.dir_to_monitor
        self.state_file = config.state_file

    @staticmethod
    def get_file_md5(file_path, chunk_size=1024 * 1024):
        """
        Compute the MD5 hex digest of a file's content.

        The file is read in chunks so large files are never held in memory
        in full; the digest is the same as hashing the whole content at once.

        Args:
            file_path: Path of the file to hash.
            chunk_size (int): Number of bytes read per iteration.

        Returns:
            str: Hexadecimal MD5 digest.
        """
        hasher = md5()
        with open(file_path, 'rb') as f:
            # iter() with a sentinel stops at the empty bytes returned on EOF.
            for chunk in iter(lambda: f.read(chunk_size), b''):
                hasher.update(chunk)
        return hasher.hexdigest()

    def _scan_directory(self):
        """Return a {file path: MD5 digest} snapshot of the monitored directory."""
        return {
            file: self.get_file_md5(file)
            for file in list_files(self.dir_to_monitor)
        }

    def save_directory_state(self, state=None):
        """
        Write a directory snapshot to `state_file` as JSON.

        Args:
            state (dict, optional): Snapshot to persist. When omitted, the
                directory is scanned now and that fresh snapshot is saved.
        """
        if state is None:
            state = self._scan_directory()
        with open(self.state_file, 'w') as f:
            json.dump(state, f, indent=4)

    @staticmethod
    def load_directory_state(state_file):
        """
        Load a previously saved snapshot.

        Args:
            state_file: Path of the JSON snapshot file.

        Returns:
            dict: The stored snapshot, or an empty dict if the file does not exist.
        """
        if not os.path.exists(state_file):
            return {}
        with open(state_file, 'r') as f:
            return json.load(f)

    def compare_states(self, old_state, new_state):
        """
        Classify the differences between two snapshots and write the report.

        Args:
            old_state (dict): Earlier {path: digest} snapshot.
            new_state (dict): Later {path: digest} snapshot.

        Returns:
            dict: Keys 'added_files', 'deleted_files' and 'changed_files',
            each a sorted list of paths. The same dict is written as JSON
            to `config.updated_files`.
        """
        old_files = set(old_state)
        new_files = set(new_state)

        changed_files = {
            file for file in old_files & new_files
            if old_state[file] != new_state[file]
        }

        # Sorted so the report is stable from run to run (set order is not).
        changes = {
            'added_files': sorted(new_files - old_files),
            'deleted_files': sorted(old_files - new_files),
            'changed_files': sorted(changed_files),
        }

        with open(self.config.updated_files, 'w') as f:
            json.dump(changes, f, indent=4)

        return changes

    def monitor_directory(self):
        """
        Report changes since the saved snapshot, then refresh the snapshot.

        The directory is scanned once and that same snapshot is both compared
        and saved. Scanning a second time for the save could record a
        modification that was never reported.

        Returns:
            dict: The change report produced by `compare_states`.
        """
        old_state = self.load_directory_state(self.state_file)
        new_state = self._scan_directory()

        changes = self.compare_states(old_state, new_state)
        self.save_directory_state(new_state)
        return changes





In [43]:
import time

In [48]:
# Run one change-detection pass and log how long it took.
# perf_counter() is a monotonic clock intended for measuring elapsed time;
# wall-clock time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
config = ConfigurationManager()
file_changes_config = config.get_file_changes_config()
get_file_state = FilesState(file_changes_config)
if not Path(file_changes_config.state_file).exists():
    # No snapshot yet: save one first so the comparison has a baseline.
    get_file_state.save_directory_state()
get_file_state.monitor_directory()
logger.info(f"File state comparision latency: {(time.perf_counter() - start):.4f} seconds")

[2024-05-24 16:54:21,661: INFO: common: yaml file: config\config.yaml loaded successfully:]
[2024-05-24 16:54:21,663: INFO: common: yaml file: schema.yaml loaded successfully:]
[2024-05-24 16:54:21,666: INFO: common: yaml file: params.yaml loaded successfully:]
[2024-05-24 16:54:21,667: INFO: common: Directory already exists: artifacts/state:]
[2024-05-24 16:54:21,667: INFO: 1016437350: File state comparision latency: 0.0099 seconds:]


TODO: if there are added or changed files, write them to the list of files to read; if there are deleted files, remove their entries from the edited JSON summary.

