In [1]:
import os

In [2]:
# Show the directory the kernel starts in (the notebook's own folder).
%pwd

'/Users/mark42/Documents/ML-Pipeline/research'

In [3]:
# Step up from the notebook folder to the project root so that relative
# paths used by later cells resolve. The guard keeps the cell idempotent:
# re-running it no longer climbs one more directory level each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
# Confirm the working directory after the chdir above.
%pwd

'/Users/mark42/Documents/ML-Pipeline'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class WorkingWithDataConfig:
    """Immutable settings for the "working with data" stage."""
    # Stage directory; created by ConfigurationManager before the stage runs.
    root_dir: Path
    # Passed as the dataset ``root`` when FashionMNIST is downloaded.
    local_data_file: Path
    # Destination file for the saved training DataLoader.
    train_data_loader_path: Path
    # Destination file for the saved test DataLoader.
    test_data_loader_path: Path

In [6]:
from cnnClassifier.constants import *
from cnnClassifier.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Read the project's YAML files and build per-stage config objects.

    Args:
        config_filepath: Path of the main configuration YAML file.
        params_filepath: Path of the parameters YAML file.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Create the artifacts root up front; stage directories live below it.
        create_directories([self.config.artifacts_root])

    def get_working_with_data_config(self) -> WorkingWithDataConfig:
        """Build the config for the "working with data" stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``WorkingWithDataConfig`` populated from the
            ``working_with_data`` section of the main config file.
        """
        stage = self.config.working_with_data

        create_directories([stage.root_dir])

        # The dataclass declares Path fields, so convert the raw config values
        # instead of passing them through as whatever the YAML loader produced.
        return WorkingWithDataConfig(
            root_dir=Path(stage.root_dir),
            local_data_file=Path(stage.local_data_file),
            train_data_loader_path=Path(stage.train_data_loader_path),
            test_data_loader_path=Path(stage.test_data_loader_path),
        )

In [8]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from cnnClassifier import logger

In [9]:
class WorkingWithData:
    """Download FashionMNIST, wrap it in DataLoaders and save them to disk.

    Args:
        config: Stage settings; ``root_dir``, ``local_data_file``,
            ``train_data_loader_path`` and ``test_data_loader_path`` are read.
    """

    def __init__(self, config: "WorkingWithDataConfig"):
        self.config = config

    def download_data(self, batch_size: int = 64) -> None:
        """Download the train/test splits and persist a DataLoader for each.

        Args:
            batch_size: Batch size used for both DataLoaders. Defaults to 64,
                the value that used to be hard-coded.

        Raises:
            Any exception from the download or save steps propagates unchanged.
        """
        download_dir = self.config.local_data_file
        # Use the configured stage directory rather than a hard-coded path.
        os.makedirs(self.config.root_dir, exist_ok=True)
        logger.info(f"Downloading data from open datasets into {download_dir}")

        # Download training data from open datasets.
        training_data = datasets.FashionMNIST(
            root=download_dir,
            train=True,
            download=True,
            transform=ToTensor(),
        )
        # Download test data from open datasets.
        test_data = datasets.FashionMNIST(
            root=download_dir,
            train=False,
            download=True,
            transform=ToTensor(),
        )

        logger.info(f"Downloaded data from open datasets into {download_dir}")

        logger.info("Loading data")

        # Create data loaders.
        train_dataloader = DataLoader(training_data, batch_size=batch_size)
        test_dataloader = DataLoader(test_data, batch_size=batch_size)

        # Sanity check: show the shapes of the first test batch only.
        for X, y in test_dataloader:
            print(f"Shape of X [N, C, H, W]: {X.shape}")
            print(f"Shape of y: {y.shape} {y.dtype}")
            break

        logger.info("Loaded data")
        self.save_data_loader(train_dataloader, self.config.train_data_loader_path)
        self.save_data_loader(test_dataloader, self.config.test_data_loader_path)

    @staticmethod
    def save_data_loader(dataloader, path: Path):
        """Serialize ``dataloader`` to ``path`` with ``torch.save``.

        Args:
            dataloader: The DataLoader object to persist.
            path: Destination file. Missing parent directories are created.
        """
        # NOTE(review): this pickles the whole DataLoader object; loading it
        # back on recent PyTorch presumably needs
        # torch.load(..., weights_only=False) - confirm in the consuming stage.
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        torch.save(dataloader, path)

In [10]:
# Run the stage end to end. The former try/except only re-raised the same
# exception, so failures now simply propagate with their original traceback.
config = ConfigurationManager()
working_with_data_config = config.get_working_with_data_config()
working_with_data = WorkingWithData(config=working_with_data_config)
working_with_data.download_data()

[2024-08-12 18:58:53,506: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-08-12 18:58:53,508: INFO: common: yaml file: params.yaml loaded successfully]
[2024-08-12 18:58:53,509: INFO: common: created directory at: artifacts]
[2024-08-12 18:58:53,509: INFO: common: created directory at: artifacts/working_with_data]
[2024-08-12 18:58:53,510: INFO: 1132727256: Downloading data from open datasets into artifacts/working_with_data]
[2024-08-12 18:58:53,564: INFO: 1132727256: Downloaded data from open datasets into artifacts/working_with_data]
[2024-08-12 18:58:53,565: INFO: 1132727256: Loading data]
Shape of X [N, C, H, W]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64]) torch.int64
[2024-08-12 18:58:53,570: INFO: 1132727256: Loaded data]
