In [None]:
import sys
import random

sys.path.append("../")

from dotenv import load_dotenv
import os

load_dotenv()


from services.files.directory_file_organizer import DirectoryFileOrganizer
from services.files.metadata_extraction import FileMetadataExtractor

# Root folder that is scanned for input files.
directory_path = "../../data/files"

directory_file_organizer = DirectoryFileOrganizer()

# List everything under the root, then group that listing by file type.
discovered_files = directory_file_organizer.list_files_recursive(directory_path)
all_files = directory_file_organizer.group_files_by_type(discovered_files)

# Collapse the per-type groups back into a single flat list.
flattened_files = []
for files_of_type in all_files.values():
    flattened_files.extend(files_of_type)

len(flattened_files)

In [None]:
import pickle

# Run metadata extraction over every discovered file (up to 24 workers).
metadata_extractor = FileMetadataExtractor()
all_files_w_metadata = metadata_extractor.extract_files_metadata_concurrent(
    flattened_files,
    max_workers=24,
)

# Persist the result so later cells can reload it without re-running extraction.
with open("all_files_w_metadata.pkl", "wb") as cache_file:
    pickle.dump(all_files_w_metadata, cache_file)

In [None]:
import pickle

# Reload the extraction result from the pickle file written by the previous cell.
with open("all_files_w_metadata.pkl", "rb") as f:
    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only load this file if it was produced locally by this notebook.
    loaded_files_w_metadata = pickle.load(f)

# Display the reloaded objects as the cell output.
loaded_files_w_metadata

In [None]:
import concurrent.futures

from models.files import File
from models.file_metadata import (
    ImageMetadata,
    VideoMetadata,
    TextMetadata,
    AudioMetadata,
    ArchiveMetadata,
    DocumentMetadata,
)

from utilities.general import generate_id
from typing import Tuple, Dict, Any


def convert_file_to_serializable_dicts(
    file: File,
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Split a ``File`` into two flat dictionaries of plain values.

    Args:
        file: The file to convert. Its ``id``, ``name``, ``type.value``,
            ``path`` and ``metadata`` attributes are read.

    Returns:
        A ``(file_dict, metadata_dict)`` pair.

        ``file_dict`` holds the file's id, name, type value and path.

        ``metadata_dict`` holds a freshly generated id, the owning file's id,
        the size, and the creation / modification timestamps formatted as
        ``YYYY-MM-DD HH:MM:SS`` strings (``None`` when the timestamp is
        falsy). It is then extended with the prefixed, type-specific fields
        of the first metadata class in the table below that the metadata is
        an instance of; if none matches, only the common fields are present.
    """

    def as_text(moment):
        # Falsy timestamps (e.g. None) are passed through as None.
        if not moment:
            return None
        return moment.strftime("%Y-%m-%d %H:%M:%S")

    def as_coordinates(position):
        # A location is indexed as (latitude, longitude); falsy means "no location".
        if not position:
            return None
        return {
            "latitude": position[0],
            "longitude": position[1],
        }

    file_dict = {
        "id": file.id,
        "name": file.name,
        "type": file.type.value,
        "path": file.path,
    }

    details = file.metadata
    metadata_dict = {
        "id": generate_id(),
        "file_id": file.id,
        "size": details.size,
        "created_at": as_text(details.created_at),
        "modified_at": as_text(details.modified_at),
    }

    # One entry per metadata class, checked in this order (first match wins):
    #   (class, {output key: attribute to read}, output key for the location or None)
    field_specs = (
        (
            ImageMetadata,
            {
                "image_width": "width",
                "image_height": "height",
                "image_color_mode": "color_mode",
                "image_format": "format",
            },
            "image_location",
        ),
        (
            VideoMetadata,
            {
                "video_duration": "duration",
                "video_width": "width",
                "video_height": "height",
                "video_framerate": "framerate",
                "video_codec": "codec",
                "video_bitrate": "bitrate",
            },
            "video_location",
        ),
        (
            TextMetadata,
            {
                "text_num_words": "num_words",
                "text_language": "language",
                "text_encoding": "encoding",
            },
            None,
        ),
        (
            AudioMetadata,
            {
                "audio_bitrate": "bitrate",
                "audio_duration": "duration",
                "audio_sample_rate": "sample_rate",
                "audio_channels": "channels",
                "audio_codec": "codec",
            },
            None,
        ),
        (
            ArchiveMetadata,
            {
                "archive_num_files": "num_files",
                "archive_compression_type": "compression_type",
                "archive_encrypted": "encrypted",
            },
            None,
        ),
        (
            DocumentMetadata,
            {
                "document_num_pages": "num_pages",
                "document_author": "author",
                "document_title": "title",
                "document_language": "language",
            },
            None,
        ),
    )

    for metadata_class, key_to_attribute, location_key in field_specs:
        if not isinstance(details, metadata_class):
            continue
        for key, attribute in key_to_attribute.items():
            metadata_dict[key] = getattr(details, attribute)
        if location_key is not None:
            metadata_dict[location_key] = as_coordinates(details.location)
        break

    return file_dict, metadata_dict


def convert_files_concurrently(files):
    """Convert every item of ``files`` using a thread pool.

    Args:
        files: Iterable of objects accepted by
            ``convert_file_to_serializable_dicts``.

    Returns:
        A list holding the conversion result for each input item, in the
        same order as ``files``.
    """
    with concurrent.futures.ThreadPoolExecutor() as pool:
        converted = pool.map(convert_file_to_serializable_dicts, files)
        # Drain the lazy map iterator while the pool is still open.
        return list(converted)


# Convert every reloaded file into a (file_dict, metadata_dict) pair.
file_dicts = convert_files_concurrently(loaded_files_w_metadata)
# Display the converted pairs as the cell output.
file_dicts

In [None]:
# Number of converted (file_dict, metadata_dict) pairs.
len(file_dicts)

In [None]:
import os
from dbio.supabase import SupabaseDatabaseAdapter

# Create the database adapter used by the insert cells below. Both settings are
# read with os.environ[...], so a missing variable raises KeyError here.
db = SupabaseDatabaseAdapter(
    url=os.environ["SUPABASE_URL"], key=os.environ["SUPABASE_SECRET_KEY"]
)

In [None]:
import pandas as pd

# Show every column when a wide frame is displayed.
pd.set_option("display.max_columns", None)

# Each entry of file_dicts is a (file_dict, metadata_dict) pair; build one
# frame from the first elements and one from the second elements.
files = pd.DataFrame([file_record for file_record, _ in file_dicts])
metadata = pd.DataFrame([metadata_record for _, metadata_record in file_dicts])

In [None]:
def _digit_string_to_int(value):
    """Return ``int(value)`` for strings made only of decimal digits; otherwise return ``value`` unchanged."""
    # str.isdecimal() only accepts characters that int() can parse, whereas
    # str.isdigit() also accepts e.g. "²", for which int("²") raises ValueError.
    if isinstance(value, str) and value.isdecimal():
        return int(value)
    return value


# NOTE(review): the coercion is applied to every cell of every column, so an
# all-digit id or title is converted to int as well - confirm this is intended.
# DataFrame.applymap is deprecated in favour of DataFrame.map; use the new
# name when this pandas version provides it and fall back otherwise.
if hasattr(pd.DataFrame, "map"):
    metadata = metadata.map(_digit_string_to_int)
else:
    metadata = metadata.applymap(_digit_string_to_int)
metadata

In [None]:
# Insert the file record (first element of each converted pair) into the
# "files" table, one db.insert call per file.
for converted in file_dicts:
    file_record = converted[0]
    db.insert("files", file_record)

In [None]:
import numpy as np

# Swap pandas' missing-value markers (pd.NA first, then NaN) for plain None.
metadata = metadata.replace({pd.NA: None}).replace({np.nan: None})
metadata

In [None]:
# Display the metadata frame again as the cell output.
metadata

In [None]:
import json


def clean_and_insert_metadata(metadata_df):
    """Insert every row of ``metadata_df`` into the ``file_metadata`` table.

    Each row is first converted to a JSON-compatible dict:

    * top-level strings made only of decimal digits become ``int``;
    * float NaN values, at any nesting depth inside dicts / lists / tuples,
      become ``None``;
    * values ``json`` cannot serialise natively are stringified with ``str``.

    Args:
        metadata_df: DataFrame with one metadata record per row.

    Side effects:
        Calls ``db.insert("file_metadata", row_dict)`` once per row, using the
        module-level ``db`` object.
    """

    def nan_to_none(value):
        """Recursively replace float NaN with None."""
        # NaN is the only float that is not equal to itself.
        if isinstance(value, float) and value != value:
            return None
        if isinstance(value, dict):
            return {key: nan_to_none(item) for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            return [nan_to_none(item) for item in value]
        return value

    def clean_row(row):
        cleaned = {}
        for key, value in row.to_dict().items():
            # isdecimal() rather than isdigit(): the latter accepts characters
            # such as "²" that int() cannot parse.
            if isinstance(value, str) and value.isdecimal():
                value = int(value)
            cleaned[key] = nan_to_none(value)
        # NaN is removed structurally above instead of by text substitution on
        # the serialised JSON, which would also rewrite string values that
        # merely contain the letters "NaN".
        return json.loads(json.dumps(cleaned, default=str))

    for _, row in metadata_df.iterrows():
        db.insert("file_metadata", clean_row(row))


# Clean each metadata row and insert it into the "file_metadata" table
# (one db.insert call per row).
clean_and_insert_metadata(metadata)