In [1]:
import os
# Show the kernel's current working directory.
%pwd

'/Users/whysocurious/Documents/MLDSAIProjects/e2e-mlops-gcp/research'

In [2]:
os.chdir("../")
%pwd

'/Users/whysocurious/Documents/MLDSAIProjects/e2e-mlops-gcp'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings for the data-ingestion step.

    Instances are frozen: assigning to a field after construction raises.
    Field order is part of the positional constructor signature.
    """

    # Dataset prefix used at the start of each parquet filename.
    color: str

    # Year and month selecting the training file.
    year_train: int
    month_train: int

    # Year and month selecting the validation file.
    year_val: int
    month_val: int

    # Year and month selecting the test file.
    year_test: int
    month_test: int

    # Local directory that receives the downloaded files.
    root_dir: Path
    # Remote prefix to which each filename is appended.
    source_URL: str

In [4]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories


class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the config, params and schema files, then create the artifacts root.

        Args:
            config_filepath: Path of the YAML file exposed as ``self.config``.
            params_filepath: Path of the YAML file exposed as ``self.params``.
            schema_filepath: Path of the YAML file exposed as ``self.schema``.
        """
        # Loaded in the same order as the attributes are listed here.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(filepath))

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion directory and return the ingestion settings.

        Dataset selectors (colour plus year/month per split) come from the
        ``dataDetails`` section of the params file; ``root_dir`` and
        ``source_URL`` come from the ``data_ingestion`` section of the config.
        """
        ingestion_section = self.config.data_ingestion
        data_details = self.params.dataDetails

        create_directories([ingestion_section.root_dir])

        # Field names are identical on the params section and the dataclass.
        selector_fields = (
            "color",
            "year_train",
            "month_train",
            "year_val",
            "month_val",
            "year_test",
            "month_test",
        )
        selectors = {name: getattr(data_details, name) for name in selector_fields}

        return DataIngestionConfig(
            **selectors,
            root_dir=ingestion_section.root_dir,
            source_URL=ingestion_section.source_URL,
        )

In [5]:
import os
from mlProject import logger
from mlProject.utils.common import get_size
import pandas as pd

class DataIngestion:
    """Download the train, validation and test parquet files named by a config.

    Each file is read with pandas from ``config.source_URL + <filename>`` and
    written under ``config.root_dir`` with the same filename.
    """

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings used by ``download_file``."""
        self.config = config

    def _download_split(self, year: int, month: int) -> str:
        """Fetch one monthly file, store it under ``root_dir``, return its filename."""
        filename = f'{self.config.color}_tripdata_{year:04d}-{month:02d}.parquet'

        # NOTE(review): plain concatenation assumes source_URL already ends
        # with "/" — confirm against config.yaml.
        source = self.config.source_URL + filename

        # os.path.join accepts both str and Path. The previous
        # `root_dir + "/" + filename` raised TypeError when root_dir was a
        # Path, which is the type DataIngestionConfig annotates.
        destination = os.path.join(self.config.root_dir, filename)

        pd.read_parquet(source).to_parquet(destination)
        logger.info(f"{filename} download! \n")
        return filename

    def download_file(self):
        """Download the train, validation and test files, in that order."""
        splits = (
            (self.config.year_train, self.config.month_train),
            (self.config.year_val, self.config.month_val),
            (self.config.year_test, self.config.month_test),
        )
        for year, month in splits:
            self._download_split(year, month)


In [6]:
# Run the ingestion stage: load configuration, build the ingestion settings,
# then download the files. Any exception propagates unchanged; the former
# `except Exception as e: raise e` wrapper only re-raised it.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_file()

[2024-07-04 10:55:46,248: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-04 10:55:46,250: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-04 10:55:46,251: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-07-04 10:55:46,252: INFO: common: created directory at: artifacts]
[2024-07-04 10:55:46,252: INFO: common: created directory at: artifacts/data_ingestion]
[2024-07-04 10:56:11,990: INFO: 3409143912: yellow_tripdata_2023-01.parquet download! 
]
[2024-07-04 10:56:39,984: INFO: 3409143912: yellow_tripdata_2023-02.parquet download! 
]
[2024-07-04 10:57:06,419: INFO: 3409143912: yellow_tripdata_2023-03.parquet download! 
]
