In [1]:
import os
from pathlib import Path
import tensorflow as tf

In [2]:
%%writefile logger.py
import os
import sys
import logging

# Record layout: timestamp, severity, originating module, then the message.
logging_str = "[%(asctime)s: %(levelname)s: %(module)s]: %(message)s"

# Records are written to logs/running_logs.log; the folder is created as a
# side effect of importing this module.
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_filepath = os.path.join(log_dir, 'running_logs.log')

# Root configuration: INFO and above, file output only (no console handler).
logging.basicConfig(
    format=logging_str,
    level=logging.INFO,
    handlers=[logging.FileHandler(log_filepath)],
)

# Shared application logger; other modules use `from logger import logger`.
logger = logging.getLogger("app")

Overwriting logger.py


In [3]:
%%writefile utils.py

import os
from box.exceptions import BoxValueError
import yaml
import json
import joblib
from ensure import ensure_annotations
from box import ConfigBox
from pathlib import Path
from typing import Any

from logger import logger

@ensure_annotations
def read_yaml(path_to_yaml: Path) -> ConfigBox:
    """Read a YAML file and return its content as a ConfigBox.

    Args:
        path_to_yaml (Path): location of the YAML file

    Raises:
        ValueError: if ConfigBox rejects the parsed content, which is what
            happens for an empty file (``yaml.safe_load`` returns None)

    Returns:
        ConfigBox: parsed content with attribute-style access
    """
    # I/O and parsing errors (missing file, malformed YAML) propagate unchanged.
    with open(path_to_yaml) as yaml_file:
        content = yaml.safe_load(yaml_file)

    try:
        config = ConfigBox(content)
    except BoxValueError as e:
        raise ValueError("yaml file is empty") from e

    # Log success only once the content has been validated, so an empty file
    # no longer produces a "loaded successfully" line right before failing.
    logger.info(f"yaml file: {path_to_yaml} loaded successfully")
    return config

@ensure_annotations
def create_directories(path_to_directories: list, verbose=True):
    """Make sure every directory in the given list exists.

    Args:
        path_to_directories (list): directory paths to create; paths that
            already exist are left as they are
        verbose (bool, optional): when truthy, log one line per directory.
            Defaults to True.
    """
    for directory in path_to_directories:
        os.makedirs(directory, exist_ok=True)
        if not verbose:
            continue
        logger.info(f"created directory at: {directory}")

@ensure_annotations
def save_json(path: Path, data: dict):
    """Serialize a dictionary to a JSON file.

    Args:
        path (Path): destination file; it is opened in write mode, so any
            existing content is replaced
        data (dict): content to write, indented by 4 spaces
    """
    with open(path, mode="w") as json_file:
        json.dump(obj=data, fp=json_file, indent=4)

    logger.info(f"json file saved at: {path}")

@ensure_annotations
def load_json(path: Path) -> ConfigBox:
    """Read a JSON file and wrap its content in a ConfigBox.

    Args:
        path (Path): JSON file to read

    Returns:
        ConfigBox: the parsed data, accessible as attributes as well as keys
    """
    with open(path) as json_file:
        parsed = json.load(json_file)

    logger.info(f"json file loaded succesfully from: {path}")
    return ConfigBox(parsed)

@ensure_annotations
def save_bin(data: Any, path: Path):
    """Persist an arbitrary Python object to disk with joblib.

    Args:
        data (Any): object to store
        path (Path): destination file
    """
    joblib.dump(data, path)
    logger.info(f"binary file saved at: {path}")

@ensure_annotations
def load_bin(path: Path) -> Any:
    """Restore an object previously written with joblib.

    Args:
        path (Path): file to read

    Returns:
        Any: whatever object the file contains
    """
    # NOTE(review): joblib.load unpickles the file, so it should only be
    # pointed at files from a trusted source.
    loaded = joblib.load(path)
    logger.info(f"binary file loaded from: {path}")
    return loaded

@ensure_annotations
def get_size(path: Path) -> str:
    """Report a file's size rounded to whole kilobytes.

    Args:
        path (Path): file to measure

    Returns:
        str: human-readable size of the form "~ <n> KB"
    """
    size_in_bytes = os.path.getsize(path)
    size_in_kb = round(size_in_bytes / 1024)
    return f"~ {size_in_kb} KB"


Overwriting utils.py


In [7]:
## Configs
from utils import create_directories

# Where the archive is fetched from and where it is cached locally.
source_URL = "https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_5340.zip"
local_data_file = Path("artifacts/data_ingestion/data.zip")

# The archive is unpacked next to itself, so its parent folder doubles as the
# extraction directory. Both pieces are plain strings, as os.path returns them.
unzip_dir = os.path.dirname(local_data_file)
zipfile_name = os.path.basename(local_data_file)
create_directories([Path(unzip_dir)])

from zipfile import ZipFile
import urllib.request as request
from logger import logger
from utils import get_size
from tqdm import tqdm

# Fetch the archive only when it is not already cached at local_data_file.
# Any download error propagates unchanged (the former `except ... raise e`
# wrapper added nothing).
logger.info("Trying to download file...")
if not os.path.exists(local_data_file):
    logger.info("Downloading file...")
    filename, headers = request.urlretrieve(
        url=source_URL,
        filename=local_data_file
        )
    logger.info(f"{filename} downloaded! with following info: \n{headers}")
else:
    # Only report "already exists" when the download was actually skipped;
    # previously this line was also logged right after a fresh download.
    logger.info(f"Desired file already exists of size: {get_size(local_data_file)}")

def _get_updated_list(list_of_file: list) -> list:
    return [
        f for f in list_of_file \
        if f.endswith(".jpg") and \
        ("Cat" in f or "Dog" in f)
        ]

def _proccessing(zf: ZipFile, f: str, working_dir: str):
    target_filepath = os.path.join(working_dir, f)
    if not os.path.exists(target_filepath):
        zf.extract(f, working_dir)

    if os.path.getsize(target_filepath) == 0:
        os.remove(target_filepath)
        logger.info(f"removing file: {target_filepath}") 


# Extract the Cat/Dog .jpg members of the archive into unzip_dir. Each member
# goes through _proccessing, which deletes files that are 0 bytes on disk.
logger.info("Unzipping file and checking for 0 size file...")
with ZipFile(file=local_data_file, mode="r") as zf:
    list_of_file = zf.namelist()
    updated_list_of_files = _get_updated_list(list_of_file)
    # Total members in the archive vs. members kept by the filter.
    print(len(list_of_file), len(updated_list_of_files))

    for f in tqdm(updated_list_of_files):
        _proccessing(zf, f, unzip_dir)

25006 25000


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25000/25000 [00:28<00:00, 883.08it/s]


In [None]:
from pathlib import Path

# Settings for the base-model step. They were left without values, which made
# the whole cell a SyntaxError (`name =` with nothing after it) and the bare
# names a NameError. They are explicit None placeholders for now.
# TODO: assign the real values before calling the functions in this cell.
# The names mirror the config attributes the functions in this cell read.
base_model_filepath = None       # save location of the base model (a Path)
updated_base_model_path = None   # save location of the customised model (a Path)
param_image_size = None          # used as VGG16 `input_shape`
param_classes = None             # number of units of the final Dense layer
param_learning_rate = None       # learning rate of the SGD optimizer
param_include_top = None         # used as VGG16 `include_top`
param_weights = None             # used as VGG16 `weights`

from utils import create_directories
from logger import logger
import io

def save_model(path: Path, model: tf.keras.Model):
    """Write a Keras model to disk and log where it went.

    Args:
        path (Path): destination handed to ``model.save``
        model (tf.keras.Model): model to persist
    """
    model.save(filepath=path)
    logger.info(f"model saved at: {path}")

def get_base_model(model_name: str="VGG16"):
    """Build the VGG16 base network, save it, and return it.

    The function used to reference ``self`` although it takes no ``self``
    parameter, so every call raised NameError. It now reads the module-level
    settings of this cell (``param_image_size``, ``param_weights``,
    ``param_include_top``, ``base_model_filepath``) and calls the module-level
    ``save_model``.

    Args:
        model_name (str, optional): label used in the log message only; the
            network that is built is always VGG16. Defaults to "VGG16".

    Returns:
        tf.keras.Model: the newly created base model
    """
    logger.info("creating base model for transfer learning...")
    model = tf.keras.applications.vgg16.VGG16(
        input_shape=param_image_size,
        weights=param_weights,
        include_top=param_include_top
    )

    save_model(path=base_model_filepath, model=model)
    logger.info(f"base model: {model_name} saved!")
    # Returned so the caller can pass it on to the next step.
    return model


# NOTE(review): @staticmethod on a module-level function looks like a leftover
# from a class body; confirm whether this is meant to live in a class.
@staticmethod
def _prepare_full_model(model, classes, freeze_all, freeze_till, learning_rate):
    """Freeze base layers, add a softmax classification head, and compile.

    Args:
        model: base Keras model whose ``output`` feeds the new head
        classes: number of units of the final Dense layer
        freeze_all: when truthy, every layer of ``model`` is made non-trainable
        freeze_till: used only when ``freeze_all`` is falsy; if it is a
            positive number, every layer except the last ``freeze_till`` ones
            is made non-trainable
        learning_rate: learning rate given to the SGD optimizer

    Returns:
        The compiled model mapping ``model.input`` to the new softmax output.
    """
    if freeze_all:
        logger.info("freeze all the layers of base CNN layer")
        layers_to_freeze = model.layers
    elif (freeze_till is not None) and (freeze_till > 0):
        logger.info(f"freeze the layers of base CNN layer till {freeze_till}")
        layers_to_freeze = model.layers[:-freeze_till]
    else:
        layers_to_freeze = []

    for layer in layers_to_freeze:
        layer.trainable = False

    # Classification head: flatten the base output, then one softmax layer.
    flattened = tf.keras.layers.Flatten()(model.output)
    head = tf.keras.layers.Dense(units=classes, activation="softmax")
    prediction = head(flattened)

    full_model = tf.keras.models.Model(inputs=model.input, outputs=prediction)
    full_model.compile(
        optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=["accuracy"],
    )

    logger.info("custom model is compiled and ready to be trained")
    full_model.summary()
    return full_model

@staticmethod
def _log_model_summary(full_model):
    with io.StringIO() as stream:
        full_model.summary(print_fn=lambda x: stream.write(f"{x}\n"))
        summary_str = stream.getvalue()
    return summary_str

def update_base_model(self):
    """Build the customised model from ``self.model``, log it, and save it.

    Every layer of the base model is frozen (``freeze_all=True``). The result
    is stored on ``self.full_model`` and saved to
    ``self.config.updated_base_model_path``.
    """
    logger.info(f"creating custom model for transfer learning")
    config = self.config
    self.full_model = self._prepare_full_model(
        model=self.model,
        classes=config.param_classes,
        freeze_all=True,
        freeze_till=None,
        learning_rate=config.param_learning_rate,
    )

    summary_text = self._log_model_summary(self.full_model)
    logger.info(f"full model summary: \n{summary_text}")

    self.save_model(path=config.updated_base_model_path, model=self.full_model)
    logger.info(f"custom model saved!")

