In [1]:
from collections import namedtuple
import os
import tqdm
# Move the working directory up one level; the relative paths used further down
# (e.g. "artifacts/data_ingestion/data.zip") are resolved against it.
# NOTE(review): not idempotent - re-running this cell climbs one more level each time.
os.chdir("../")

## workflow for data ingestion component

### 1. update the entity

In [2]:
from collections import namedtuple
from classifier_app.constants import *
from classifier_app.utils import read_yaml,create_directories
import os
import urllib.request as request
from zipfile import ZipFile

In [3]:
# Immutable record holding the settings of the data-ingestion step.
# NOTE: the second field is spelled "soruce_url" (sic); ConfigurationManager and
# DataIngestion both use that exact spelling, so it must not be "corrected" here alone.
DataIngestionConfig = namedtuple(
    "DataIngestionConfig",
    (
        "root_dir",         # directory created by ConfigurationManager.get_data_ingestion_config
        "soruce_url",       # URL handed to urlretrieve by DataIngestion.download_file
        "local_data_file",  # path the archive is downloaded to and later opened from
        "unzip_dir",        # directory the archive members are extracted into
    ),
)

### update the configuration class

In [4]:
class ConfigurationManager:
    """Loads the project configuration and builds per-step config records.

    The two files are loaded with the project helper ``read_yaml`` and kept on
    the instance as ``self.config`` and ``self.params``.
    """

    def __init__(self,
                 config_file_path = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        """Load both files and create the artifacts root directory.

        Args:
            config_file_path: location of the main configuration file.
            params_filepath: location of the parameters file.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_filepath)

        # Every step writes below this directory, so create it up front.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion root directory and return the ingestion settings.

        Returns:
            A ``DataIngestionConfig`` filled from the ``data_ingestion`` section
            of the loaded configuration. The section's ``source_url`` value is
            stored in the record's ``soruce_url`` (sic) field.
        """
        section = self.config.data_ingestion
        ingestion_root = section.root_dir

        create_directories([ingestion_root])

        return DataIngestionConfig(
            root_dir=ingestion_root,
            soruce_url=section.source_url,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )

### update the DataIngestion component

In [5]:
from fileinput import filename
import zipfile


class DataIngestion:
    """Download the dataset archive and extract the cat/dog images from it.

    Every location comes from ``config``; the attributes read are
    ``soruce_url`` (sic), ``local_data_file`` and ``unzip_dir``.
    """

    def __init__(self, config: "DataIngestionConfig"):
        """Keep ``config`` on the instance and echo it to stdout.

        The annotation is written as a string so that defining this class does
        not require ``DataIngestionConfig`` to exist yet.
        """
        self.config = config
        print(config)

    def download_file(self):
        """Download the archive to ``config.local_data_file`` if it is not there yet.

        The payload is written to a sibling ``<local_data_file>.part`` file and
        renamed to its final name only after the transfer completed. An
        interrupted download therefore never leaves a truncated archive behind
        that a later run would mistake for a finished one.

        Raises:
            Whatever ``urllib.request.urlretrieve`` or the rename raises; the
            partial file is removed before the exception propagates.
        """
        destination = self.config.local_data_file
        if os.path.exists(destination):
            return

        partial_path = f"{destination}.part"
        try:
            request.urlretrieve(self.config.soruce_url, filename=partial_path)
            os.replace(partial_path, destination)
        finally:
            # Still present only when the transfer or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    def _get_updated_list_of_file(self, list_of_files):
        """Return the names that end with ".jpg" and contain "Dog" or "Cat".

        The match is case-sensitive and looks at the whole archive member name,
        so directory entries and non-image files are dropped.
        """
        return [
            name
            for name in list_of_files
            if name.endswith(".jpg") and ("Dog" in name or "Cat" in name)
        ]

    def _preprocess(self, zf: ZipFile, f: str, working_dir: str):
        """Extract member ``f`` into ``working_dir`` and drop it if it is empty.

        Args:
            zf: the open archive to extract from.
            f: name of the archive member.
            working_dir: directory the member is extracted into.
        """
        target_filepath = os.path.join(working_dir, f)

        # Files already on disk from an earlier run are not extracted again.
        if not os.path.exists(target_filepath):
            zf.extract(f, working_dir)

        # A zero-byte file carries no image data, so remove it.
        if os.path.getsize(target_filepath) == 0:
            os.remove(target_filepath)

    def unzip_and_clean(self):
        """Extract the wanted images from the archive into ``config.unzip_dir``.

        Opens ``config.local_data_file``, filters its member names with
        ``_get_updated_list_of_file`` and runs ``_preprocess`` on each of them.
        """
        with ZipFile(file=self.config.local_data_file, mode="r") as zf:
            wanted_members = self._get_updated_list_of_file(zf.namelist())

            for member in wanted_members:
                self._preprocess(zf, member, self.config.unzip_dir)

### update the pipeline (here I am just checking the data ingestion component)

In [6]:
# Smoke test of the data-ingestion step: build the config, then download and extract.
config = ConfigurationManager()  # loads both config files and creates the artifacts root directory
data_ingestion_config = config.get_data_ingestion_config()  # also creates the ingestion root_dir
data_ingestion = DataIngestion(config=data_ingestion_config)  # the constructor prints the config it receives
print(data_ingestion)  # DataIngestion defines no __repr__/__str__, so this shows the default object repr
data_ingestion.download_file()  # does nothing when local_data_file already exists
data_ingestion.unzip_and_clean()  # extracts only ".jpg" members naming "Dog"/"Cat"; deletes 0-byte files

{'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_url': 'https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_5340.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}}
{'key': 'value'}
[2022-10-26 19:58:09,175: INFO: common:] Created the directory at : artifacts
[2022-10-26 19:58:09,178: INFO: common:] Created the directory at : artifacts/data_ingestion
DataIngestionConfig(root_dir='artifacts/data_ingestion', soruce_url='https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_5340.zip', local_data_file='artifacts/data_ingestion/data.zip', unzip_dir='artifacts/data_ingestion')
<__main__.DataIngestion object at 0x000001DB32F674F0>


In [9]:
# zf.namelist() returns the names of all members of the zip archive.
# DataIngestion._get_updated_list_of_file filters that list, dropping every name
# that does not end with ".jpg" or does not contain "Dog" or "Cat".
# The first 50 names are printed here to show what the archive layout looks like.
#
# The path is built with os.path.join instead of a backslash-separated literal:
# "\d" is not a valid string escape, and backslashes only work as separators on Windows.
with ZipFile(os.path.join("artifacts", "data_ingestion", "data.zip"), mode="r") as zf:
    print(zf.namelist()[:50])

['PetImages/Cat/', 'PetImages/Cat/0.jpg', 'PetImages/Cat/1.jpg', 'PetImages/Cat/10.jpg', 'PetImages/Cat/100.jpg', 'PetImages/Cat/1000.jpg', 'PetImages/Cat/10000.jpg', 'PetImages/Cat/10001.jpg', 'PetImages/Cat/10002.jpg', 'PetImages/Cat/10003.jpg', 'PetImages/Cat/10004.jpg', 'PetImages/Cat/10005.jpg', 'PetImages/Cat/10006.jpg', 'PetImages/Cat/10007.jpg', 'PetImages/Cat/10008.jpg', 'PetImages/Cat/10009.jpg', 'PetImages/Cat/1001.jpg', 'PetImages/Cat/10010.jpg', 'PetImages/Cat/10011.jpg', 'PetImages/Cat/10012.jpg', 'PetImages/Cat/10013.jpg', 'PetImages/Cat/10014.jpg', 'PetImages/Cat/10015.jpg', 'PetImages/Cat/10016.jpg', 'PetImages/Cat/10017.jpg', 'PetImages/Cat/10018.jpg', 'PetImages/Cat/10019.jpg', 'PetImages/Cat/1002.jpg', 'PetImages/Cat/10020.jpg', 'PetImages/Cat/10021.jpg', 'PetImages/Cat/10022.jpg', 'PetImages/Cat/10023.jpg', 'PetImages/Cat/10024.jpg', 'PetImages/Cat/10025.jpg', 'PetImages/Cat/10026.jpg', 'PetImages/Cat/10027.jpg', 'PetImages/Cat/10028.jpg', 'PetImages/Cat/10029.jpg'