In [1]:
import os

In [2]:
!pwd

/home/cesar/Desktop/Tools-update/projects/1.AI/Projects/MLOps/research


In [3]:
# Move the working directory up one level, to the directory that holds main.py,
# so that relative paths such as config/config.yaml and artifacts/ resolve.
# The guard makes the cell safe to re-run: once main.py is visible from the
# current directory we are already in the right place and must not climb again.
if not os.path.exists("main.py"):
    os.chdir("../")

In [4]:
!pwd

/home/cesar/Desktop/Tools-update/projects/1.AI/Projects/MLOps


In [5]:

# Create a data class: it encapsulates the ingestion settings so they can be passed around as a single object to wherever they are needed
# should be under :  =======> mlopsProject/entity/entity.py
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of the settings used by the data-ingestion step.

    Instances are built by ``ConfigurationManager.get_data_ingestion_config``
    and consumed by ``DataIngestion``. ``frozen=True`` means the fields cannot
    be reassigned after construction.
    """
    # Directory for this step; it is created before the config object is built.
    root_dir: Path
    # URL the data archive is downloaded from.
    source_URL: str
    # Local path the downloaded archive is written to and later opened as a zip file.
    local_data_file: Path
    # Directory the archive contents are extracted into.
    unzip_dir: Path

In [6]:
# Creating a ConfigurationManager class: it reads the YAML files (their paths come from the constants module) and packages the values into the data class created above
# should be under :  =======> mlopsProject/config/configuration.py

from mlopsProject.constants import *
from mlopsProject.utils.common import read_yaml, create_directories


class ConfigurationManager:
    """Reads the project's YAML files and packages their values into config objects.

    On construction the three YAML files (config, params, schema) are loaded and
    the top-level artifacts directory named in the config file is created.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML file.
            params_filepath: Path of the parameters YAML file.
            schema_filepath: Path of the schema YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Every step writes below this directory, so make sure it exists up front.
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the ``data_ingestion`` config section.

        The step's ``root_dir`` is created as a side effect.

        Returns:
            DataIngestionConfig: the packaged settings. The three path fields are
            converted to ``pathlib.Path`` so the values match the types the
            dataclass declares; ``source_URL`` is passed through unchanged.
        """
        ingestion_section = self.config.data_ingestion

        create_directories([ingestion_section.root_dir])

        # Packaging happens here: raw YAML values become a typed, frozen object.
        return DataIngestionConfig(
            root_dir=Path(ingestion_section.root_dir),
            source_URL=ingestion_section.source_URL,
            local_data_file=Path(ingestion_section.local_data_file),
            unzip_dir=Path(ingestion_section.unzip_dir),
        )
    
# ingestion_config = ConfigurationManager()
# print(ingestion_config.get_data_ingestion_config())  #===> return data class with data packaged



In [7]:
# Actual data ingestion: using the packaged data
# should be under :  =======> mlopsProject/components/data_ingestion.py

import os
import urllib.request as request
import zipfile
from mlopsProject import logger
from mlopsProject.utils.common import get_size


class DataIngestion:
    """Downloads the source archive and extracts it, driven by a ``DataIngestionConfig``."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings for use by the other methods.

        Args:
            config: Packaged settings (source URL, archive path, extraction directory).
        """
        self.config = config

    def download_file(self):
        """Download ``source_URL`` to ``local_data_file`` unless that file already exists.

        The archive is first written to a sibling ``<local_data_file>.part`` file and
        moved into place only after the transfer has finished. An interrupted or
        failed download therefore never leaves a truncated ``local_data_file`` behind
        that a later run would mistake for a complete archive.

        Raises:
            Any exception raised by ``urllib.request.urlretrieve`` or by the final
            rename is propagated to the caller.
        """
        target_file = self.config.local_data_file

        if os.path.exists(target_file):
            logger.info(f"File already exists of size: {get_size(Path(target_file))}")
            return

        # os.fspath accepts both str and Path values for the configured location.
        partial_file = f"{os.fspath(target_file)}.part"
        try:
            _, headers = request.urlretrieve(
                url = self.config.source_URL,
                filename = partial_file
            )
            # Atomic move: the final name only ever refers to a complete download.
            os.replace(partial_file, target_file)
        finally:
            # After a successful replace the partial file is gone; this only
            # cleans up the leftovers of a failed or interrupted transfer.
            if os.path.exists(partial_file):
                os.remove(partial_file)

        logger.info(f"{target_file} download! with following info: \n{headers}")

    def extract_zip_file(self):
        """Extract the archive at ``local_data_file`` into ``unzip_dir``.

        The target directory is created if it does not exist yet.

        Returns:
            None
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)
  

# The code below will later be wrapped in a pipeline class that
# should be under :  =======> pipeline/01_data_ingestion.py ; that class will then be called from main.py
try:
    config = ConfigurationManager()                              # reads the YAML files and prepares the packaging
    data_ingestion_config = config.get_data_ingestion_config()   # returns the packaged ingestion settings
    data_ingestion = DataIngestion(config=data_ingestion_config) # receives the packaged settings for use in its methods
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
except Exception as e:
    # Record the failure (with traceback) in the project log before propagating it;
    # a handler that only re-raises would add nothing over having no handler at all.
    logger.exception(e)
    raise

[2023-10-30 08:29:21,283: INFO: common: yaml file: config/config.yaml loaded successfully]
[2023-10-30 08:29:21,285: INFO: common: yaml file: config/params.yaml loaded successfully]
[2023-10-30 08:29:21,289: INFO: common: yaml file: config/schema.yaml loaded successfully]
[2023-10-30 08:29:21,289: INFO: common: created directory at: artifacts]
[2023-10-30 08:29:21,290: INFO: common: created directory at: artifacts/data_ingestion]
[2023-10-30 08:29:21,291: INFO: 1966143079: File already exists of size: ~ 23 KB]


In [None]:

         
# 1       update config/params.yaml      (a way of providing the constants to be used on each step using a yaml file: parameter used for model training) 
# 2       update config/schema.yaml      (a way of providing the constants to be used on each step using a yaml file: define the data schema)
# 3       update config/config.yaml      (a way of providing the constants to be used on each step using a yaml file: artifacts paths, file paths and more...)
         
         
# 4       update mlProject/constants/__init__.py    (a way of passing the constants to be used on each step)   
# 		from pathlib import Path
# 		CONFIG_FILE_PATH = Path("config/config.yaml")
# 		PARAMS_FILE_PATH = Path("config/params.yaml")
# 		SCHEMA_FILE_PATH = Path("config/schema.yaml")
		        
         
# 5       update mlProject/entity/config_entity.py (a way of building the constants types to make sure the correct ones are returned: using @dataclass)
# 		from dataclasses import dataclass
# 		from pathlib import Path

# 		@dataclass(frozen=True)
# 		class DataIngestionConfig:
# 		    root_dir: Path
# 		    source_URL: str
# 		    local_data_file: Path
# 		    unzip_dir: Path
                        
#                                  ..........  
                        
# 6       update mlProject/config/configuration.py (a configuration_manager used to read yaml,create directories and return configurations)    
# 		from mlProject.constants import *
# 		from mlProject.utils.common import read_yaml, create_directories
# 		from mlProject.entity.config_entity import (DataIngestionConfig,
# 				                            DataValidationConfig,
# 				                            DataTransformationConfig,
# 				                            ModelTrainerConfig,
# 				                            ModelEvaluationConfig)

# 		class ConfigurationManager:                                             creating a template**************class************** to be used in pipeline
# 		    def __init__(                                                       functions here return configuration and make sure are of desired build types
# 			self,
# 			config_filepath = CONFIG_FILE_PATH,
# 			params_filepath = PARAMS_FILE_PATH,
# 			schema_filepath = SCHEMA_FILE_PATH):

# 			self.config = read_yaml(config_filepath)
# 			self.params = read_yaml(params_filepath)
# 			self.schema = read_yaml(schema_filepath)

# 			create_directories([self.config.artifacts_root])


		    
# 		    def get_data_ingestion_config(self) -> DataIngestionConfig:
# 			config = self.config.data_ingestion

# 			create_directories([config.root_dir])

# 			data_ingestion_config = DataIngestionConfig(
# 			    root_dir=config.root_dir,
# 			    source_URL=config.source_URL,
# 			    local_data_file=config.local_data_file,
# 			    unzip_dir=config.unzip_dir 
# 			)

# 			return data_ingestion_config

#                                       ..........  


# 7       update mlProject/components
# 		import os
# 		import urllib.request as request
# 		import zipfile
# 		from mlProject import logger
# 		from mlProject.utils.common import get_size
# 		from pathlib import Path
# 		from mlProject.entity.config_entity import (DataIngestionConfig)


# 		class DataIngestion:                                                    creating a template**************class************** to be used in pipeline
# 		    def __init__(self, config: DataIngestionConfig):                    functions here do the actual jobs
# 			self.config = config
		    
# 		    def download_file(self):
# 			if not os.path.exists(self.config.local_data_file):
# 			    filename, headers = request.urlretrieve(
# 				url = self.config.source_URL,
# 				filename = self.config.local_data_file
# 			    )
# 			    logger.info(f"{filename} download! with following info: \n{headers}")
# 			else:
# 			    logger.info(f"File already exists of size: {get_size(Path(self.config.local_data_file))}")


# 		    def extract_zip_file(self):
# 			"""
# 			zip_file_path: str
# 			Extracts the zip file into the data directory
# 			Function returns None
# 			"""
# 			unzip_path = self.config.unzip_dir
# 			os.makedirs(unzip_path, exist_ok=True)
# 			with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
# 			    zip_ref.extractall(unzip_path)
           
         
# 8       update mlProject/pipelines
# 		from mlProject.config.configuration import ConfigurationManager
# 		from mlProject.components.data_ingestion import DataIngestion
# 		from mlProject import logger



# 		STAGE_NAME = "Data Ingestion stage"

# 		class DataIngestionTrainingPipeline:
# 		    def __init__(self):
# 			pass

# 		    def main(self):
# 			config = ConfigurationManager()
# 			data_ingestion_config = config.get_data_ingestion_config()
# 			data_ingestion = DataIngestion(config=data_ingestion_config)
# 			data_ingestion.download_file()
# 			data_ingestion.extract_zip_file()


		    
# 		if __name__ == '__main__':
# 		    try:
# 			logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
# 			obj = DataIngestionTrainingPipeline()
# 			obj.main()
# 			logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
# 		    except Exception as e:
# 			logger.exception(e)
# 			raise e     
			    
# 9       update the pipeline_build.py
# 		from mlProject import logger
# 		from mlProject.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline
# 		from mlProject.pipeline.stage_02_data_validation import DataValidationTrainingPipeline
# 		from mlProject.pipeline.stage_03_data_transformation import DataTransformationTrainingPipeline
# 		from mlProject.pipeline.stage_04_model_trainer import ModelTrainerTrainingPipeline
# 		from mlProject.pipeline.stage_05_model_evaluation import ModelEvaluationTrainingPipeline


# 		STAGE_NAME = "Data Ingestion stage"
# 		try:
# 		   logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<") 
# 		   data_ingestion = DataIngestionTrainingPipeline()
# 		   data_ingestion.main()
# 		   logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
# 		except Exception as e:
# 			logger.exception(e)
# 			raise e



#        NOTE: constants and entity are imported into =======> config
#              config   and  utils are imported into  =======> components
# 			 components are imported into =========> pipelines
# 			 pipelines  are imported into =========> main.py



# 10       update the pipeline_inference.py
     
     
# ================================================================================================================================================     
     