In [1]:
import os  # Import the os module to interact with the operating system


In [2]:
%pwd  # This is a Jupyter Notebook magic command to display the current working directory


'c:\\Users\\ayupt\\Desktop\\Data Science Projects\\End to End Deployment\\Kidney-Disease-Classificaion-End-to-End-MLflow-DVC\\research'

In [3]:
# Move from the notebook folder to its parent directory.
# Guarded so that re-running this cell does not keep climbing the directory tree.
# NOTE(review): assumes the notebook lives in a folder named "research", as the
# recorded %pwd output shows -- adjust the guard if the folder is renamed.
if os.path.basename(os.getcwd()).lower() == "research":
    os.chdir("../")


In [4]:
%pwd  # Again, display the updated working directory to confirm the change


'c:\\Users\\ayupt\\Desktop\\Data Science Projects\\End to End Deployment\\Kidney-Disease-Classificaion-End-to-End-MLflow-DVC'

In [5]:
# Import necessary modules
from dataclasses import dataclass  # Provides a decorator to create data classes
from pathlib import Path  # Provides an object-oriented way to work with file system paths

# Immutable settings bundle consumed by the data ingestion stage
@dataclass(frozen=True)
class DataIngestionConfig:
    """
    Read-only container for the data ingestion settings.

    ``frozen=True`` means attributes cannot be reassigned after construction.

    Attributes:
        root_dir: Root directory where ingestion data is stored.
        source_URL: URL the dataset is downloaded from.
        local_data_file: Local file path the downloaded data is written to.
        unzip_dir: Directory the dataset is extracted into after unzipping.
    """

    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path


In [6]:
# Import all constants from the cnnClassifier.constants module
# This may include paths, URLs, or other predefined constant values
from cnnClassifier.constants import *  

# Import utility functions from the cnnClassifier.utils.common module
from cnnClassifier.utils.common import read_yaml, create_directories  

In [7]:
# Central access point for the YAML-driven project settings
class ConfigurationManager:
    """
    Loads the configuration and parameter YAML files and hands out
    stage-specific configuration objects built from them.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH
    ):
        """
        Load both YAML files and make sure the artifacts root directory exists.

        :param config_filepath: Path of the configuration YAML file
                                (defaults to CONFIG_FILE_PATH).
        :param params_filepath: Path of the parameters YAML file
                                (defaults to PARAMS_FILE_PATH).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The artifacts root is created up front, before any stage asks for its config
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Build the data ingestion settings from the ``data_ingestion`` config section.

        The section's ``root_dir`` is created as a side effect.

        :return: DataIngestionConfig populated from the loaded configuration.
        """
        ingestion_section = self.config.data_ingestion

        # The stage's root directory is created before the config object is built
        create_directories([ingestion_section.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            source_URL=ingestion_section.source_URL,
            local_data_file=ingestion_section.local_data_file,
            unzip_dir=ingestion_section.unzip_dir,
        )


In [8]:
# Import necessary libraries
import os  # Provides functions to interact with the operating system
import zipfile  # Allows working with ZIP archives
import gdown  # Used to download files from Google Drive

# Import custom logger for logging messages
from cnnClassifier import logger  

# Import a utility function to get file sizes
from cnnClassifier.utils.common import get_size  


In [9]:
import os  # Importing os for file and directory operations
import zipfile  # Importing zipfile to handle zip file extraction
import gdown  # Importing gdown to download files from Google Drive
import logging  # Importing logging to log messages and errors

# Configuring the logger to track the execution process.
# NOTE(review): this rebinds the name `logger`, replacing the logger imported from
# `cnnClassifier` in the previous cell with `logging.getLogger(__name__)` -- confirm
# which of the two is meant to be used by the class below.
logger = logging.getLogger(__name__)

class DataIngestion:
    """
    Downloads a zipped dataset from Google Drive and extracts it locally.

    The source URL, the download target and the extraction directory are all
    read from the configuration object passed to the constructor.
    """

    # Direct-download endpoint; the Drive file id is appended to this prefix.
    _DRIVE_DOWNLOAD_PREFIX = "https://drive.google.com/uc?export=download&id="

    # The annotation is quoted so that defining the class does not require the
    # name DataIngestionConfig to be resolvable at definition time.
    def __init__(self, config: "DataIngestionConfig"):
        """
        Store the ingestion configuration for later use.

        :param config: DataIngestionConfig object providing ``source_URL``,
                       ``local_data_file`` and ``unzip_dir``.
        """
        self.config = config

    @staticmethod
    def _extract_drive_file_id(url: str) -> str:
        """
        Pull the file id out of a Google Drive URL.

        Supports share links of the form ``.../d/<id>/...`` and links that carry
        the id as an ``id=<id>`` query parameter.

        :param url: Google Drive URL.
        :return: The file id.
        :raises ValueError: If no file id can be found in ``url``.
        """
        # Local import so the helper works without touching the notebook's import cells
        from urllib.parse import parse_qs, urlparse

        parsed = urlparse(url)
        segments = [segment for segment in parsed.path.split("/") if segment]

        # Share link: the id is the path segment right after "d"
        if "d" in segments:
            id_position = segments.index("d") + 1
            if id_position < len(segments):
                return segments[id_position]

        # Direct link: the id travels as a query parameter
        query_ids = parse_qs(parsed.query).get("id")
        if query_ids:
            return query_ids[0]

        raise ValueError(f"Could not extract a Google Drive file id from URL: {url}")

    def download_file(self) -> str:
        """
        Downloads the dataset from the configured URL and saves it locally.

        :return: The configured local file path the data was downloaded to.
        :raises Exception: Any error raised while downloading is logged and re-raised.
        """
        try:
            dataset_url = self.config.source_URL
            zip_download_dir = self.config.local_data_file

            # Create the parent directory only when there is one; a bare file name
            # has an empty dirname and os.makedirs("") would raise.
            parent_dir = os.path.dirname(zip_download_dir)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            logger.info(f"Downloading data from {dataset_url} into file {zip_download_dir}")

            file_id = self._extract_drive_file_id(dataset_url)
            downloaded = gdown.download(self._DRIVE_DOWNLOAD_PREFIX + file_id, zip_download_dir)

            # Some gdown versions report a failed download by returning None instead of
            # raising; treat that as an error rather than logging a false success.
            if downloaded is None:
                raise RuntimeError(f"gdown could not download {dataset_url}")

            logger.info(f"Downloaded data from {dataset_url} into file {zip_download_dir}")

            return zip_download_dir

        except Exception as e:
            logger.error(f"Error in downloading file: {e}")
            raise  # bare raise keeps the original traceback intact

    def extract_zip_file(self):
        """
        Extracts the downloaded zip file into the configured directory.

        :return: None
        :raises zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
        :raises Exception: Any other extraction error is logged and re-raised.
        """
        try:
            unzip_path = self.config.unzip_dir

            # Make sure the extraction target exists before opening the archive
            os.makedirs(unzip_path, exist_ok=True)

            logger.info(f"Extracting files to {unzip_path}")

            with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
                zip_ref.extractall(unzip_path)

            logger.info(f"Extracted files to {unzip_path}")

        except zipfile.BadZipFile:
            logger.error("Failed to extract zip file. The file might be corrupted.")
            raise

        except Exception as e:
            logger.error(f"Error in extracting file: {e}")
            raise


In [10]:
# Run the data ingestion stage end to end.
# No try/except wrapper: catching an exception only to re-raise it adds nothing,
# so errors are left to propagate with their original traceback.

# Initialize the configuration manager to fetch ingestion settings
config = ConfigurationManager()

# Retrieve the data ingestion configuration settings
data_ingestion_config = config.get_data_ingestion_config()

# Initialize the DataIngestion class with the retrieved configuration
data_ingestion = DataIngestion(config=data_ingestion_config)

# Download the required file
data_ingestion.download_file()

# Extract the contents of the downloaded zip file
data_ingestion.extract_zip_file()


[2025-02-15 12:02:56,636: INFO: common: YAML file: config\config.yaml loaded successfully]
[2025-02-15 12:02:56,641: INFO: common: YAML file: params.yaml loaded successfully]
[2025-02-15 12:02:56,644: INFO: common: Created directory at: artifacts]
[2025-02-15 12:02:56,648: INFO: common: Created directory at: artifacts/data_ingestion]
[2025-02-15 12:02:56,651: INFO: 2013711394: Downloading data from https://drive.google.com/file/d/1vlhZ5c7abUKF8xXERIw6m9Te8fW7ohw3/view?usp=sharing into file artifacts/data_ingestion/data.zip]


Downloading...
From (original): https://drive.google.com/uc?/export=download&id=1vlhZ5c7abUKF8xXERIw6m9Te8fW7ohw3
From (redirected): https://drive.google.com/uc?%2Fexport=download&id=1vlhZ5c7abUKF8xXERIw6m9Te8fW7ohw3&confirm=t&uuid=80e0e714-b106-4854-ad46-a456a10f8ed1
To: c:\Users\ayupt\Desktop\Data Science Projects\End to End Deployment\Kidney-Disease-Classificaion-End-to-End-MLflow-DVC\artifacts\data_ingestion\data.zip
100%|██████████| 57.7M/57.7M [00:11<00:00, 4.86MB/s]

[2025-02-15 12:03:13,274: INFO: 2013711394: Downloaded data from https://drive.google.com/file/d/1vlhZ5c7abUKF8xXERIw6m9Te8fW7ohw3/view?usp=sharing into file artifacts/data_ingestion/data.zip]
[2025-02-15 12:03:13,279: INFO: 2013711394: Extracting files to artifacts/data_ingestion]





[2025-02-15 12:03:14,873: INFO: 2013711394: Extracted files to artifacts/data_ingestion]
