# In [None]:
import os
import zipfile
import gdown
from sklearn.model_selection import train_test_split
from cnnClassifier import logger
from cnnClassifier.utils.common import get_size
from cnnClassifier.entity.config_entity import DataIngestionConfig

class DataIngestion:
    """Download, unpack and split the dataset described by a config object.

    The methods below read three attributes from ``config``:
    ``source_URL``, ``local_data_file`` and ``unzip_dir``.
    """

    def __init__(self, config: DataIngestionConfig):
        self.config = config

    def download_file(self) -> str:
        """
        Fetch the archive from ``config.source_URL`` into ``config.local_data_file``.

        The Google Drive file id is taken from the second-to-last ``/``-separated
        segment of the URL and downloaded through ``gdown``.

        Returns:
            The local path of the archive, as a string.

        Raises:
            RuntimeError: if ``gdown.download`` returns ``None``.
        """
        dataset_url = self.config.source_URL
        zip_download_dir = self.config.local_data_file

        # Create the folder the archive is actually written to, rather than a
        # hard-coded path that may not match the configured location.
        parent_dir = os.path.dirname(os.fspath(zip_download_dir))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        logger.info(f"Downloading data from {dataset_url} into file {zip_download_dir}")

        file_id = dataset_url.split("/")[-2]
        # NOTE(review): the "?/export" form is unusual but kept unchanged.
        prefix = 'https://drive.google.com/uc?/export=download&id='
        downloaded = gdown.download(prefix + file_id, zip_download_dir)
        if downloaded is None:
            # Some gdown releases appear to report a failed download by
            # returning None instead of raising; do not log success then.
            raise RuntimeError(
                f"Download of {dataset_url} into {zip_download_dir} failed"
            )

        logger.info(f"Downloaded data from {dataset_url} into file {zip_download_dir}")
        return str(zip_download_dir)

    def extract_zip_file(self) -> None:
        """
        Extract ``config.local_data_file`` into ``config.unzip_dir``.

        The target directory is created first if it does not exist.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

    def split_data_and_save(self, train_size: float = 0.8, random_state: int = 42) -> None:
        """
        Split the top-level entries of ``config.unzip_dir`` into train and test sets.

        Every entry (file or directory) directly inside ``unzip_dir`` is moved
        into either ``train_data`` or ``test_data`` beneath that directory.
        The ``train_data`` / ``test_data`` folders themselves and the
        downloaded archive are never treated as data.

        Args:
            train_size: forwarded to ``train_test_split``.
            random_state: forwarded to ``train_test_split``.

        Raises:
            ValueError: if there is nothing to split. ``train_test_split`` may
                raise as well when a split cannot be formed from the entries.
        """
        data_directory = self.config.unzip_dir
        train_dir = os.path.join(data_directory, "train_data")
        test_dir = os.path.join(data_directory, "test_data")

        def _normalised(path) -> str:
            return os.path.normcase(os.path.abspath(os.fspath(path)))

        # The archive may live inside unzip_dir; it must not end up in a split.
        archive_path = _normalised(self.config.local_data_file)
        reserved = {"train_data", "test_data"}

        # Sorted because os.listdir order is arbitrary; without it the same
        # random_state would not reproduce the same split.
        entries = sorted(
            name
            for name in os.listdir(data_directory)
            if name not in reserved
            and _normalised(os.path.join(data_directory, name)) != archive_path
        )
        if not entries:
            raise ValueError(f"No data found in {data_directory} to split")

        train_files, test_files = train_test_split(
            entries, train_size=train_size, random_state=random_state
        )

        os.makedirs(train_dir, exist_ok=True)
        os.makedirs(test_dir, exist_ok=True)

        for names, target_dir in ((train_files, train_dir), (test_files, test_dir)):
            for name in names:
                os.replace(
                    os.path.join(data_directory, name),
                    os.path.join(target_dir, name),
                )
                logger.info(f"Moved {name} to {target_dir}")

        logger.info("Split and saved data into train and test sets.")
