In [0]:
%run ./configs

In [0]:
class ProfileIngestion:
    """Download profile files from a GitHub directory into a landing path.

    The class relies on notebook-level globals that are not defined here:
    ``spark`` and ``dbutils`` (provided by the runtime) and ``ats_configs``
    (presumably defined by the ``%run ./configs`` cell -- verify).
    """

    # Seconds to wait on a GitHub connection/read before giving up, so a
    # stalled request cannot hang the job indefinitely.
    _HTTP_TIMEOUT_SECONDS = 60

    def __init__(self):
        pass

    def make_source_dir(self):
        """Return the next source directory to ingest.

        The value is ``last_load_date + 1 day`` taken from the most recent
        metadata row recorded for the profile ingestion job.

        Raises:
            ValueError: if the metadata table has no row for the job.
        """
        latest_row = spark.sql(
            f"""select date_add(last_load_date, 1) as source_dir 
            from {ats_configs.jobs_metadate_table_name} 
            where job_name = '{ats_configs.profile_ingestion_job_name}'
            order by last_load_date desc
            """).first()
        # first() returns None on an empty result; fail with a clear message
        # instead of an AttributeError on None.
        if latest_row is None:
            raise ValueError(
                f"No metadata row found for job "
                f"'{ats_configs.profile_ingestion_job_name}' in "
                f"{ats_configs.jobs_metadate_table_name}"
            )
        return latest_row["source_dir"]

    def cleanup_destination_dir(self, dest_path):
        """Remove ``dest_path`` recursively and recreate it empty."""
        print(f"Deleting {dest_path}")
        dbutils.fs.rm(dest_path, recurse=True)
        print(f"Deleted {dest_path} if already exists")

        dbutils.fs.mkdirs(dest_path)
        print(f"Created {dest_path}")

    @staticmethod
    def _metadata_insert_sql(table_name, job_name, ingestion_date):
        """Build the INSERT statement that records a completed job run."""
        # The date is interpolated as a quoted literal and cast explicitly so
        # the statement does not depend on implicit string-to-date assignment.
        return f"""
            insert into {table_name}
            values('{job_name}', cast('{ingestion_date}' as date), current_timestamp(), 'Job Execution Completed.' )"""

    def update_metadata(self, ingestion_date):
        """Insert a completion row for the profile ingestion job.

        Args:
            ingestion_date: date (or ``YYYY-MM-DD`` string) that was ingested.
        """
        print(f"Updating metadata for {ats_configs.profile_ingestion_job_name}")
        spark.sql(
            self._metadata_insert_sql(
                ats_configs.jobs_metadate_table_name,
                ats_configs.profile_ingestion_job_name,
                ingestion_date,
            )
        )
        print(f"Updated metadata for {ats_configs.profile_ingestion_job_name}")

    def ingest_profiles(self):
        """Download every file of the next source directory.

        The directory name comes from ``make_source_dir``; the destination
        directory is wiped and recreated before downloading.

        Returns:
            The destination path the files were written to.

        Raises:
            requests.HTTPError: if the listing or any download request fails.
            ValueError: if the listing response is not a directory listing.
        """
        print("Ingesting profiles")
        import requests
        from concurrent.futures import ThreadPoolExecutor

        download_dir = self.make_source_dir()
        dest_path = f"/Volumes/{ats_configs.catalog}/{ats_configs.db}/{ats_configs.profile_landing_zone}/{download_dir}"
        self.cleanup_destination_dir(dest_path)

        api_url = f"https://api.github.com/repos/{ats_configs.owner}/{ats_configs.repo}/contents/{ats_configs.profile_source}/{download_dir}"
        listing = requests.get(api_url, timeout=self._HTTP_TIMEOUT_SECONDS)
        # Without this check an error payload (a JSON object) would be
        # iterated as if it were the list of files.
        listing.raise_for_status()
        files = listing.json()
        if not isinstance(files, list):
            raise ValueError(f"Expected a directory listing from {api_url}")
        # Sub-directory entries carry a null download_url; skip them.
        download_urls = [
            entry["download_url"] for entry in files if entry.get("download_url")
        ]

        def download_files(download_url):
            filename = download_url.split("/")[-1]
            with requests.get(
                download_url, stream=True, timeout=self._HTTP_TIMEOUT_SECONDS
            ) as response:
                # Fail before opening the file so an error page is never
                # stored as if it were profile data.
                response.raise_for_status()
                with open(f"{dest_path}/{filename}", "wb") as out_file:
                    for chunk in response.iter_content(chunk_size=8192):
                        out_file.write(chunk)

        print("Downloading files...", end='')
        with ThreadPoolExecutor(max_workers=4) as executor:
            # Drain the lazy map iterator so worker exceptions are re-raised
            # here rather than silently dropped.
            for _ in executor.map(download_files, download_urls):
                pass
        print("Done")
        return dest_path

    def assert_file_count(self, dir_name, expected_count):
        """Assert that ``dir_name`` contains exactly ``expected_count`` entries.

        Raises:
            AssertionError: if the number of entries differs.
        """
        file_count = len(dbutils.fs.ls(dir_name))
        assert file_count == expected_count, f"Expected {expected_count} files in {dir_name} but found {file_count}"
        print(f"Found {file_count} files in {dir_name}. Successful match expected count")

    def validate(self, iter):
        """Check that the directory for iteration ``iter`` holds 5 files.

        Args:
            iter: iteration number; ``1`` selects ``2025-07-01``, any other
                value selects ``2025-07-02``.
        """
        import time
        start = time.time()
        print("Validating files")
        # NOTE(review): the directory is passed as a bare date, not as the
        # full landing path used by ingest_profiles -- confirm this is the
        # location dbutils.fs.ls should inspect.
        self.assert_file_count('2025-07-01' if iter == 1 else '2025-07-02', 5)
        end = time.time()
        print(f"Validated files in {end - start} seconds")
        
    