## Data Ingestion

In [None]:
import os # 'os' module provides a way to interact with the operating system and perform various tasks related to file and directory operations

from collections import namedtuple


In [None]:
from dataclasses import dataclass 

"""When you use the @dataclass decorator, Python automatically generates the following methods for you:

__init__(): A constructor that initializes the attributes."""

from pathlib import Path

# frozen=True makes every instance immutable: assigning to a field after
# construction raises dataclasses.FrozenInstanceError.
@dataclass(frozen=True)
class DataIngestionConfig:
    """Read-only bundle of the settings consumed by the data-ingestion stage."""

    root_dir: Path  # directory created for this stage's artifacts
    source_URL: str  # URL the dataset archive is downloaded from
    local_data_file: Path  # where the downloaded archive is stored
    unzip_dir: Path  # directory the archive members are extracted into




In [None]:
from deeplearningpractice.constants import * 
# Values that never change are kept in the constants package and imported from there when required.

from deeplearningpractice.utils import read_yaml,create_directories 
# importing the read_yaml method from utils for reading the yaml file and storing in the specific variable.
# importing the create_directories from utils for creating the directories.

In [None]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path handed to ``read_yaml`` for the main config.
            params_filepath: Path handed to ``read_yaml`` for the parameters.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The folder named by `artifacts_root` in the config must exist
        # before any stage writes into it.
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the settings object for the data-ingestion stage.

        Reads the ``data_ingestion`` section of the loaded config, creates its
        ``root_dir`` and copies the four values into a ``DataIngestionConfig``.

        Returns:
            DataIngestionConfig: ``root_dir``, ``local_data_file`` and
            ``unzip_dir`` as ``Path`` objects, ``source_URL`` as read.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])

        # The entity declares these three fields as Path, while the values read
        # from YAML are plain text; convert so the object matches its own
        # declared types.
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_URL=section.source_URL,
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )

In [None]:
import os
import urllib.request as request

"""
import urllib.request as request is used to import the request submodule from the urllib package in Python. The urllib package is a library 
for working with URLs and performing various network-related tasks, such as making HTTP requests, fetching web content, and more.
"""

from zipfile import ZipFile

"""
from zipfile import ZipFile is used to import the ZipFile class from the zipfile module in Python. The zipfile module is part of Python's 
standard library and provides functionality for creating, reading, and manipulating ZIP archives.

The ZipFile class in the zipfile module allows you to work with ZIP archives. You can use it to create new ZIP files, extract files from existing ZIP files, 
add files to existing ZIP files, and perform various operations on the contents of ZIP archives.

"""

In [None]:
class DataIngestion:
    """Download the dataset archive and extract the usable images from it."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings.

        Args:
            config: Provides ``source_URL``, ``local_data_file`` and
                ``unzip_dir`` (the attributes this class reads).
        """
        self.config = config

    def download_file(self):
        """Fetch the archive to ``local_data_file`` unless it already exists.

        The data is first written to a sibling ``.part`` file and renamed into
        place only after ``urlretrieve`` returns successfully. Writing straight
        to the final path would leave a truncated archive behind on a failed
        download, and the existence check would then skip the download forever.

        Raises:
            Whatever ``urllib.request.urlretrieve`` raises (for example
            ``URLError`` or ``ContentTooShortError``); the ``.part`` file is
            removed before the exception propagates.
        """
        destination = self.config.local_data_file
        if os.path.exists(destination):
            return

        partial_path = f"{destination}.part"
        try:
            request.urlretrieve(url=self.config.source_URL, filename=partial_path)
            os.replace(partial_path, destination)
        finally:
            # After a successful rename the .part file is gone, so this only
            # cleans up after a failure.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    def _get_updated_list_of_files(self, list_of_files):
        """Return the archive members that are ``.jpg`` Cat or Dog images.

        Args:
            list_of_files: Member names as returned by ``ZipFile.namelist()``.

        Returns:
            list: Names ending in ``.jpg`` that contain ``"Cat"`` or ``"Dog"``.
        """
        # Project-specific filter: adjust it when the dataset layout changes.
        return [
            name
            for name in list_of_files
            if name.endswith(".jpg") and ("Cat" in name or "Dog" in name)
        ]

    def _preprocess(self, zf: ZipFile, f: str, working_dir: str):
        """Extract one member if it is not on disk yet and drop it when empty.

        Args:
            zf: Open archive to extract from.
            f: Member name inside the archive.
            working_dir: Directory the member is extracted into.
        """
        target_filepath = os.path.join(working_dir, f)
        if not os.path.exists(target_filepath):
            zf.extract(f, working_dir)

        # Zero-byte images are useless downstream, so they are deleted.
        if os.path.getsize(target_filepath) == 0:
            os.remove(target_filepath)

    def unzip_and_clean(self):
        """Extract every wanted image from the archive into ``unzip_dir``."""
        with ZipFile(file=self.config.local_data_file, mode="r") as zf:
            for member in self._get_updated_list_of_files(zf.namelist()):
                self._preprocess(zf, member, self.config.unzip_dir)


In [None]:
%pwd

In [None]:
# Move the working directory one level up (presumably to the project root —
# confirm) so the relative paths used by the following cells resolve from there.
os.chdir("../")

In [None]:
%pwd

In [None]:
# Run the whole ingestion stage; any failure is propagated unchanged.
try:
    # Load the YAML settings (this also creates the artifacts directory).
    config = ConfigurationManager()
    # Build the settings object for the data-ingestion stage.
    data_ingestion_config = config.get_data_ingestion_config()
    # Hand those settings to the component that does the actual work.
    data_ingestion = DataIngestion(config=data_ingestion_config)
    # Step 1: fetch the archive.
    data_ingestion.download_file()
    # Step 2: extract the wanted images and drop empty ones.
    data_ingestion.unzip_and_clean()
except Exception:
    raise

In [None]:
%pwd

# Now update the above code in python files


In [None]:
# update in config.yaml file

artifacts_root: artifacts


data_ingestion:
  root_dir: artifacts/data_ingestion
  source_URL: https://github.com/entbappy/Branching-tutorial/raw/master/cat-dog-data.zip
  local_data_file: artifacts/data_ingestion/data.zip
  unzip_dir: artifacts/data_ingestion

In [None]:
# Update In entity/config_entity.py

from dataclasses import dataclass
from pathlib import Path


# Immutable after construction because of frozen=True.
@dataclass(frozen=True)
class DataIngestionConfig:
    """Settings for the data-ingestion stage, grouped into one value object."""

    root_dir: Path  # stage artifact directory
    source_URL: str  # download location of the dataset archive
    local_data_file: Path  # target path of the downloaded archive
    unzip_dir: Path  # extraction directory


In [None]:
# Update In config/configuration.py


from cnnClassifier.utils import read_yaml, create_directories
from cnnClassifier.constants import *
from pathlib import Path
import os
from cnnClassifier.entity import DataIngestionConfig
                                  

class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path handed to ``read_yaml`` for the main config.
            params_filepath: Path handed to ``read_yaml`` for the parameters.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the settings object for the data-ingestion stage.

        Creates the section's ``root_dir`` as a side effect.

        Returns:
            DataIngestionConfig: ``root_dir``, ``local_data_file`` and
            ``unzip_dir`` as ``Path`` objects, ``source_URL`` as read.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])

        # Convert the path-like values so the instance honours the Path
        # annotations declared on DataIngestionConfig.
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_URL=section.source_URL,
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )
    

    

In [None]:
# In entity/__init__.py file


from cnnClassifier.entity.config_entity import DataIngestionConfig

In [None]:
# In config/__init__.py file


from cnnClassifier.config.configuration import ConfigurationManager

In [None]:
# components/data_ingestion.py

import os
import urllib.request as request
from zipfile import ZipFile
from tqdm import tqdm
from pathlib import Path
from cnnClassifier.entity import DataIngestionConfig
from cnnClassifier import logger
from cnnClassifier.utils import get_size


class DataIngestion:
    """Download the dataset archive and extract the usable images from it."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings.

        Args:
            config: Provides ``source_URL``, ``local_data_file`` and
                ``unzip_dir`` (the attributes this class reads).
        """
        self.config = config

    def download_file(self):
        """Fetch the archive to ``local_data_file`` unless it already exists.

        The data is first written to a sibling ``.part`` file and renamed into
        place only after ``urlretrieve`` returns successfully. Writing straight
        to the final path would leave a truncated archive behind on a failed
        download, and the existence check would then skip the download forever.

        Raises:
            Whatever ``urllib.request.urlretrieve`` raises; the ``.part`` file
            is removed before the exception propagates.
        """
        logger.info("Trying to download file...")
        destination = self.config.local_data_file
        if os.path.exists(destination):
            logger.info(f"File already exists of size: {get_size(Path(destination))}")
            return

        logger.info("Download started...")
        partial_path = f"{destination}.part"
        try:
            _, headers = request.urlretrieve(
                url=self.config.source_URL,
                filename=partial_path,
            )
            os.replace(partial_path, destination)
        finally:
            # After a successful rename the .part file is gone, so this only
            # cleans up after a failure.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        logger.info(f"{destination} download! with following info: \n{headers}")

    def _get_updated_list_of_files(self, list_of_files):
        """Return the archive members that are ``.jpg`` Cat or Dog images.

        Args:
            list_of_files: Member names as returned by ``ZipFile.namelist()``.

        Returns:
            list: Names ending in ``.jpg`` that contain ``"Cat"`` or ``"Dog"``.
        """
        return [
            name
            for name in list_of_files
            if name.endswith(".jpg") and ("Cat" in name or "Dog" in name)
        ]

    def _preprocess(self, zf: ZipFile, f: str, working_dir: str):
        """Extract one member if it is not on disk yet and drop it when empty.

        Args:
            zf: Open archive to extract from.
            f: Member name inside the archive.
            working_dir: Directory the member is extracted into.
        """
        target_filepath = os.path.join(working_dir, f)
        if not os.path.exists(target_filepath):
            zf.extract(f, working_dir)

        # Zero-byte images are deleted, with a log line recording the removal.
        if os.path.getsize(target_filepath) == 0:
            logger.info(f"removing file:{target_filepath} of size: {get_size(Path(target_filepath))}")
            os.remove(target_filepath)

    def unzip_and_clean(self):
        """Extract every wanted image from the archive into ``unzip_dir``."""
        logger.info("unzipping file and removing unawanted files")
        with ZipFile(file=self.config.local_data_file, mode="r") as zf:
            wanted_members = self._get_updated_list_of_files(zf.namelist())
            # tqdm only wraps the iterable to display progress.
            for member in tqdm(wanted_members):
                self._preprocess(zf, member, self.config.unzip_dir)

In [None]:
# pipeline/stage_01_data_ingestion.py


from cnnClassifier.config import ConfigurationManager
from cnnClassifier.components import DataIngestion
from cnnClassifier import logger

class DataIngestionTrainingPipeline:
    """Pipeline stage that runs data ingestion from start to finish."""

    def __init__(self):
        """Nothing to set up; the stage holds no state."""

    def main(self):
        """Load the settings, then download and unpack the dataset archive."""
        ingestion_settings = ConfigurationManager().get_data_ingestion_config()
        ingestion = DataIngestion(config=ingestion_settings)
        ingestion.download_file()
        ingestion.unzip_and_clean()

In [None]:
# In components/__init__.py

from cnnClassifier.components.data_ingestion import DataIngestion


In [None]:
# In main.py file

# `logger` is used below, so it has to be imported here; without it both the
# try body and the except handler fail with NameError.
from cnnClassifier import logger
from cnnClassifier.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline

STAGE_NAME = "Data Ingestion stage"

# Run the stage, log its start and end, and log any failure with its
# traceback before re-raising it.
try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    data_ingestion = DataIngestionTrainingPipeline()
    data_ingestion.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
    logger.exception(e)
    raise