In [9]:
import os
from pathlib import Path
# Optional: uncomment to move the kernel one directory up (out of `research/`)
# and verify that it landed in the project folder.
# os.chdir("..")
# assert Path(os.getcwd())==Path("/f/oneNeuron/PROJECT_upgrade/DVC_DL_Tensorflow_updated")

# Sanity check: display the kernel's current working directory next to the
# expected `research` folder so the two can be compared by eye.
# NOTE(review): the expected location is a machine-specific absolute path —
# consider deriving it from a configurable project root instead.
Path(os.getcwd()), Path("/f/oneNeuron/PROJECT_upgrade/DVC_DL_Tensorflow_updated/research")

(WindowsPath('F:/oneNeuron/PROJECT_upgrade/DVC_DL_Tensorflow_updated/research'),
 WindowsPath('/f/oneNeuron/PROJECT_upgrade/DVC_DL_Tensorflow_updated/research'))

In [5]:
import tensorflow as tf
import urllib.request as request

In [10]:
import os
import sys
import logging

# Record layout: timestamp, severity, originating module, then the message.
logging_str = "[%(asctime)s: %(levelname)s: %(module)s]: %(message)s"

# The log directory has to exist before the FileHandler opens its file.
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_filepath = os.path.join(log_dir, 'running_logs.log')

# Records go to the log file only. Add logging.StreamHandler(sys.stdout) to the
# handlers list to echo them in the notebook output as well.
logging.basicConfig(
    handlers=[logging.FileHandler(log_filepath)],
    format=logging_str,
    level=logging.INFO,
)

# Named logger used by the helpers and classes defined in later cells.
logger = logging.getLogger("app")

In [11]:
import os
from box.exceptions import BoxValueError
import yaml
import json
import joblib
from ensure import ensure_annotations
from box import ConfigBox
from pathlib import Path
from typing import Any

@ensure_annotations
def read_yaml(path_to_yaml: Path) -> ConfigBox:
    """Read a YAML file and return its content as a ConfigBox.

    Args:
        path_to_yaml (Path): location of the YAML file

    Raises:
        ValueError: if the parsed content cannot be wrapped in a ConfigBox,
            which is what happens for an empty file (the parser yields None)

    Returns:
        ConfigBox: parsed content with keys accessible as attributes
    """
    with open(path_to_yaml) as yaml_file:
        content = yaml.safe_load(yaml_file)

    # Convert before logging so that an unusable file is never reported as
    # "loaded successfully". Errors from open()/safe_load propagate unchanged.
    try:
        config = ConfigBox(content)
    except BoxValueError as exc:
        # Keep the original cause attached for easier debugging.
        raise ValueError("yaml file is empty") from exc

    logger.info(f"yaml file: {path_to_yaml} loaded successfully")
    return config

@ensure_annotations
def create_directories(path_to_directories: list, verbose=True):
    """Create every directory in the given list.

    Directories that already exist are left untouched.

    Args:
        path_to_directories (list): paths of the directories to create
        verbose (bool, optional): log one message per directory. Defaults to True.
    """
    for directory in path_to_directories:
        os.makedirs(directory, exist_ok=True)
        if not verbose:
            continue
        logger.info(f"created directory at: {directory}")

@ensure_annotations
def save_json(path: Path, data: dict):
    """Write a dictionary to a JSON file, indented by four spaces.

    Args:
        path (Path): destination file; overwritten if it already exists
        data (dict): content to serialise
    """
    with open(path, "w") as json_file:
        json.dump(obj=data, fp=json_file, indent=4)

    logger.info(f"json file saved at: {path}")

@ensure_annotations
def load_json(path: Path) -> ConfigBox:
    """Read a JSON file and wrap its content in a ConfigBox.

    Args:
        path (Path): JSON file to read

    Returns:
        ConfigBox: parsed content with keys accessible as attributes
    """
    with open(path) as json_file:
        parsed = json.load(json_file)

    logger.info(f"json file loaded succesfully from: {path}")
    boxed = ConfigBox(parsed)
    return boxed

@ensure_annotations
def save_bin(data: Any, path: Path):
    """Serialise an object to disk with joblib.

    Args:
        data (Any): object to store
        path (Path): destination file
    """
    joblib.dump(data, path)
    logger.info(f"binary file saved at: {path}")

@ensure_annotations
def load_bin(path: Path) -> Any:
    """Load an object that was stored with joblib.

    NOTE(review): joblib files are pickle-based, so only load files from a
    trusted source.

    Args:
        path (Path): file to read

    Returns:
        Any: the object stored in the file
    """
    loaded = joblib.load(filename=path)
    logger.info(f"binary file loaded from: {path}")
    return loaded

@ensure_annotations
def get_size(path: Path) -> str:
    """Report a file's size, rounded to whole kilobytes.

    Args:
        path (Path): file to measure

    Returns:
        str: human-readable size in the form "~ <n> KB"
    """
    byte_count = os.stat(path).st_size
    size_in_kb = round(byte_count / 1024)
    return f"~ {size_in_kb} KB"


In [14]:
from pathlib import Path

# Locations of the YAML files read by ConfigurationManager, relative to the
# kernel's working directory.
CONFIG_FILE_PATH = Path("configs", "config.yaml")
SECRETS_FILE_PATH = Path("configs", "secrets.yaml")
PARAMS_FILE_PATH = Path("params.yaml")

In [13]:
## entity - 
from dataclasses import dataclass

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage (download and unzip)."""
    # Working directory of the stage; ConfigurationManager creates it before
    # building this object.
    root_dir: Path
    # URL the archive is downloaded from.
    source_URL: str
    # Local destination of the downloaded archive; the download is skipped when
    # this file already exists.
    local_data_file: Path
    # Directory into which the selected archive members are extracted.
    unzip_dir: Path

@dataclass(frozen=True)
class PrepareBaseModelConfig:
    """Immutable settings for the base-model preparation stage.

    The path fields come from the `prepare_base_model` section of the config
    file; the `param_*` fields come from the params file.
    """
    # Working directory of the stage.
    root_dir: Path
    # Path of the base model file.
    base_model_filepath: Path
    # Path of the updated base model; the training stage reads the same path.
    updated_base_model_path: Path
    # Value of IMAGE_SIZE in the params file.
    param_image_size: list
    # Value of CLASSES in the params file.
    param_classes: int
    # Value of LEARNING_RATE in the params file.
    param_learning_rate: float
    # Value of INCLUDE_TOP in the params file.
    param_include_top: bool
    # Value of WEIGHTS in the params file. ConfigurationManager passes this
    # keyword, so the field must exist or construction fails with a TypeError.
    # The default keeps callers that omit it working; "imagenet" mirrors the
    # default of the keras.applications constructors — assumes the value is
    # forwarded to one of them, TODO confirm.
    param_weights: str = "imagenet"

@dataclass(frozen=True)
class PrepareCallbacksConfig:
    """Immutable settings for the callback-preparation stage."""
    # Working directory of the stage, as given in the config file.
    root_dir: Path
    # TensorBoard log directory; ConfigurationManager creates it before
    # building this object.
    tensorboard_root_log_dir: Path
    # Checkpoint file path; ConfigurationManager creates its parent directory.
    checkpoint_model_filepath: Path

@dataclass(frozen=True)
class TrainingConfig:
    """Immutable settings for the model-training stage."""
    # Working directory of the stage; ConfigurationManager creates it before
    # building this object.
    root_dir: Path
    # Path of the updated base model (taken from the base-model stage's config).
    updated_base_model_path: Path
    # Training images: the "PetImages" folder inside the ingestion unzip directory.
    training_data: Path
    # Value of EPOCHS in the params file.
    params_epochs: int
    # Value of BATCH_SIZE in the params file.
    params_batch_size: int
    # Value of AUGMENTATION in the params file.
    params_is_augmentating: bool
    # Value of IMAGE_SIZE in the params file.
    params_image_size: list
    # Path of the trained model file, as given in the config file.
    trained_model_path: Path


In [None]:
import os

class ConfigurationManager:
    """Read the project's YAML files and build the per-stage config objects.

    Each ``get_*_config`` method reads one section of the config file, creates
    the directories that stage needs, and returns the matching frozen dataclass.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        secrets_filepath=SECRETS_FILE_PATH):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath (Path): main configuration file
            params_filepath (Path): parameters file
            secrets_filepath (Path): secrets file
        """
        self.config = read_yaml(path_to_yaml=config_filepath)
        self.params = read_yaml(path_to_yaml=params_filepath)
        self.secrets = read_yaml(path_to_yaml=secrets_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings and create the stage directory.

        Returns:
            DataIngestionConfig: settings from the `data_ingestion` section
        """
        logger.info("getting configuration for data ingestion")

        data_ingestion = self.config.data_ingestion
        root_dir = Path(data_ingestion.root_dir)
        create_directories([root_dir])

        return DataIngestionConfig(
            root_dir=root_dir,
            source_URL=data_ingestion.source_URL,
            local_data_file=Path(data_ingestion.local_data_file),
            unzip_dir=Path(data_ingestion.unzip_dir)
        )

    def get_base_model_config(self) -> PrepareBaseModelConfig:
        """Build the base-model settings and create the stage directory.

        Returns:
            PrepareBaseModelConfig: settings from the `prepare_base_model`
                section combined with values from the params file
        """
        logger.info("getting configuration for base model preparation")

        prepare_base_model = self.config.prepare_base_model
        root_dir = Path(prepare_base_model.root_dir)
        create_directories([root_dir])

        # NOTE(review): PrepareBaseModelConfig must declare a `param_weights`
        # field, otherwise this call raises TypeError — verify the entity.
        return PrepareBaseModelConfig(
            root_dir=root_dir,
            base_model_filepath=Path(prepare_base_model.base_model_filepath),
            updated_base_model_path=Path(prepare_base_model.updated_base_model_path),
            param_image_size=self.params.IMAGE_SIZE,
            param_classes=self.params.CLASSES,
            param_learning_rate=self.params.LEARNING_RATE,
            param_include_top=self.params.INCLUDE_TOP,
            param_weights=self.params.WEIGHTS
        )

    def get_callbacks_config(self) -> PrepareCallbacksConfig:
        """Build the callback settings and create the directories they write to.

        Returns:
            PrepareCallbacksConfig: settings from the `prepare_callbacks` section
        """
        logger.info("getting configuration for callbacks")

        prepare_callbacks = self.config.prepare_callbacks
        tensorboard_dir = Path(prepare_callbacks.tensorboard_root_log_dir)
        checkpoint_file = Path(prepare_callbacks.checkpoint_model_filepath)

        # The checkpoint path names a file, so only its parent directory is created.
        create_directories([
            tensorboard_dir,
            Path(os.path.dirname(prepare_callbacks.checkpoint_model_filepath))
        ])

        return PrepareCallbacksConfig(
            root_dir=Path(prepare_callbacks.root_dir),
            tensorboard_root_log_dir=tensorboard_dir,
            checkpoint_model_filepath=checkpoint_file
        )

    def get_training_config(self) -> TrainingConfig:
        """Build the training settings and create the stage directory.

        Returns:
            TrainingConfig: settings from the `training` section combined with
                paths from earlier stages and values from the params file
        """
        logger.info("getting configuration for model training")

        training = self.config.training
        params = self.params
        updated_base_model = self.config.prepare_base_model.updated_base_model_path
        # Training images are read from the "PetImages" folder of the unzipped data.
        training_data = os.path.join(self.config.data_ingestion.unzip_dir, "PetImages")

        root_dir = Path(training.root_dir)
        create_directories([root_dir])

        return TrainingConfig(
            root_dir=root_dir,
            updated_base_model_path=Path(updated_base_model),
            training_data=Path(training_data),
            params_epochs=params.EPOCHS,
            params_batch_size=params.BATCH_SIZE,
            params_is_augmentating=params.AUGMENTATION,
            params_image_size=params.IMAGE_SIZE,
            trained_model_path=Path(training.trained_model_path)
        )

In [8]:


from tqdm import tqdm
import os
from zipfile import ZipFile
import urllib.request as request


class DataIngestion:
    """Download the dataset archive and extract the usable images from it."""

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings.

        Args:
            config (DataIngestionConfig): URL and paths used by this stage
        """
        self.config = config

    def download_file(self):
        """Download the archive unless the local file already exists.

        Exactly one outcome is logged: either the size of the file that was
        already present, or the result of the fresh download.
        """
        logger.info("Trying to download file...")
        local_data_file = self.config.local_data_file

        # Guard clause: report the existing file and skip the download.
        if os.path.exists(local_data_file):
            logger.info(f"Desired file already exists of size: {get_size(local_data_file)}")
            return

        logger.info("Downloading file...")
        filename, headers = request.urlretrieve(
            url=self.config.source_URL,
            filename=local_data_file
        )
        logger.info(f"{filename} downloaded! with following info: \n{headers}")

    def _get_updated_list(self, list_of_file: list) -> list:
        """Keep only archive members that are Cat/Dog `.jpg` files.

        Args:
            list_of_file (list): member names from the archive

        Returns:
            list: names ending in ".jpg" that contain "Cat" or "Dog"
        """
        return [
            f for f in list_of_file
            if f.endswith(".jpg") and ("Cat" in f or "Dog" in f)
        ]

    def _proccessing(self, zf: ZipFile, f: str, working_dir: str):
        """Extract one member if it is missing and delete it if it is empty.

        Args:
            zf (ZipFile): open archive to extract from
            f (str): member name inside the archive
            working_dir (str): directory to extract into
        """
        target_filepath = os.path.join(working_dir, f)
        if not os.path.exists(target_filepath):
            zf.extract(f, working_dir)

        # Zero-byte files are removed from disk.
        if os.path.getsize(target_filepath) == 0:
            os.remove(target_filepath)
            logger.info(f"removing file: {target_filepath}")

    def unzip_and_clean(self):
        """Extract the selected images and drop any zero-size files.

        Prints the total member count and the selected count, then processes
        the selected members with a progress bar.
        """
        logger.info("Unzipping file and checking for 0 size file...")
        with ZipFile(file=self.config.local_data_file, mode="r") as zf:
            list_of_file = zf.namelist()
            updated_list_of_files = self._get_updated_list(list_of_file)
            print(len(list_of_file), len(updated_list_of_files))

            for f in tqdm(updated_list_of_files):
                self._proccessing(zf, f, self.config.unzip_dir)

data already exists
