## 1. Data Ingestion

In [1]:
# Import necessary libraries.
import os

In [2]:
# Show the kernel's current working directory before any directory change.
%pwd

'd:\\ML Projects\\bird_type_classification_project\\research'

In [3]:
# Move from the notebook folder (research/) up to the project parent folder,
# i.e. bird_type_classification_project, so that relative project paths resolve.
# The guard makes this cell idempotent: re-running it after the move must not
# climb another level above the project root.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [4]:
# Confirm that the working directory is now the project root
# (the parent of the notebook folder).
%pwd

'd:\\ML Projects\\bird_type_classification_project'

In [5]:
# Code for entity
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """
    Immutable bundle of settings for the data ingestion step.

    Instances are populated from the ``data_ingestion`` section of the project
    configuration by ``ConfigurationManager.get_data_ingestion_config``.
    Because the dataclass is frozen, fields cannot be reassigned after creation.
    """
    # Folder that holds the data ingestion artifacts.
    root_dir: Path
    # Dataset identifier handed to the Kaggle download call.
    source_url: str
    # Path of the local data file (not read by the code visible in this notebook).
    local_data_file: Path
    # Destination folder into which the dataset is downloaded and unzipped.
    unzip_dir: Path


In [18]:
# Code for configuration 
from cnnClassifier.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from cnnClassifier.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Loads the project configuration and builds per-step configuration objects."""

    def __init__(self, 
                 config_file_path = CONFIG_FILE_PATH, 
                 params_file_path = PARAMS_FILE_PATH) -> None:
        """Read the configuration file and create the artifacts root directory.

        Args:
            config_file_path: Path of the configuration YAML file; defaults to
                CONFIG_FILE_PATH.
            params_file_path: Path of the parameters YAML file; defaults to
                PARAMS_FILE_PATH. It is accepted to keep the interface stable
                but is not read by this class yet.
        """
        self.config = read_yaml(config_file_path)

        # Make sure the top-level artifacts folder exists before any step runs.
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Encapsulate the data ingestion settings in a DataIngestionConfig object.

        Reads the ``data_ingestion`` section of the loaded configuration and
        creates its ``root_dir`` folder as a side effect.

        Returns:
            DataIngestionConfig holding root_dir, source_url, local_data_file
            and unzip_dir exactly as they appear in the configuration.
        """
        ingestion_section = self.config.data_ingestion

        # Create the data ingestion folder inside artifacts.
        create_directories([ingestion_section.root_dir])

        # The keyword must be `source_url` to match the dataclass field name;
        # `source_URL` is not a field and makes the constructor raise TypeError.
        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            source_url=ingestion_section.source_url,
            local_data_file=ingestion_section.local_data_file,
            unzip_dir=ingestion_section.unzip_dir,
        )
    

In [16]:
# Create a project-local .kaggle folder (relative to the current working directory).
# A later cell points KAGGLE_CONFIG_DIR at this folder; presumably kaggle.json must be
# placed in it by hand before authenticating — confirm against your setup.
create_directories(['.kaggle'])

[18-Nov-23 15:29:11: INFO: common: Created directory at: .kaggle]


In [62]:
# Code for component
# Point the Kaggle client at the project-local .kaggle folder under the current
# working directory. Keep this assignment ABOVE `import kaggle`: the package is
# understood to look up its credentials when it is imported/authenticated, so the
# variable has to be set first — NOTE(review): confirm for the installed kaggle version.
os.environ['KAGGLE_CONFIG_DIR'] = os.path.join(os.getcwd(), '.kaggle')
import kaggle
from cnnClassifier import logger

class DataIngestion:
    """Data ingestion component: downloads the configured dataset from Kaggle."""

    def __init__(self, ingestion_config: DataIngestionConfig):
        """Keep the ingestion settings for later use by ``download_data``.

        Args:
            ingestion_config: Supplies ``source_url`` (the dataset identifier passed
                to Kaggle) and ``unzip_dir`` (the download destination).
        """
        self.ingestion_config = ingestion_config

    def download_data(self):
        """Authenticate with Kaggle, then download and unzip the dataset.

        The dataset named by ``ingestion_config.source_url`` is downloaded into
        ``ingestion_config.unzip_dir`` with ``unzip=True``. Progress is reported
        through the project logger. Returns None.
        """
        settings = self.ingestion_config

        # Authentication using kaggle.json
        kaggle.api.authenticate()
        logger.info("Kaggle Authentication successful.")

        # Download the dataset.
        logger.info("Downloading data from kaggle...")
        kaggle.api.dataset_download_files(settings.source_url,
                                          settings.unzip_dir,
                                          unzip=True)

        # Use the real field name `source_url` (there is no `source_URL` attribute, so
        # the old log call raised AttributeError after the download had finished), and
        # report `unzip_dir`, the folder the files were actually written to.
        logger.info('%s dataset downloaded and saved at %s',
                    settings.source_url, settings.unzip_dir)
    

In [63]:
# Code for Pipeline
# Runs the data ingestion step end to end:
# 1. load the project configuration (this also creates the artifacts root folder),
# 2. build the data ingestion settings (this also creates the data ingestion folder),
# 3. download the dataset from Kaggle using those settings.
# Note: `config` holds a ConfigurationManager instance, not the raw configuration.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(ingestion_config=data_ingestion_config)
# The download is the slow part of this cell; the saved output below shows it taking several minutes.
data_ingestion.download_data()

[18-Nov-23 17:12:35: INFO: common: config\config.yaml loaded successfully.]
[18-Nov-23 17:12:35: INFO: common: Created directory at: artifacts]
[18-Nov-23 17:12:35: INFO: common: Created directory at: artifacts/data_ingestion]
[18-Nov-23 17:12:35: INFO: 2766737115: Kaggle Authentication successful.]
[18-Nov-23 17:12:35: INFO: 2766737115: Downloading data from kaggle...]
[18-Nov-23 17:17:04: INFO: 2766737115: gpiosenka/100-bird-species dataset downloaded and saved at artifacts/data_ingestion]
