In [26]:
import os

In [27]:
# Make the project root the process's working directory.
# NOTE(review): this absolute path is specific to one machine and must be edited
# before running anywhere else; presumably later cells resolve relative paths
# (e.g. the YAML config files) against it — confirm.
abs_path = '/home/Akshath/PycharmProjects/Summarize4Me'
os.chdir(abs_path)
%pwd

'/home/Akshath/PycharmProjects/Summarize4Me'

In [28]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Read-only bundle of the settings used by the data-ingestion step.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``. Positional order of the fields is
    ``root_dir``, ``source_url``, ``local_data_file``, ``unzip_dir``.
    """

    # Directory owned by the ingestion step.
    root_dir: Path
    # Address the archive is fetched from.
    source_url: str
    # Location on disk where the fetched archive is stored.
    local_data_file: Path
    # Directory the archive is unpacked into.
    unzip_dir: Path


In [29]:
from Summarize4Me.constants import *
from Summarize4Me.utils.common import read_yaml, create_dirs

In [30]:
class ConfigManager:
    """Load the project's YAML settings and build per-stage config objects.

    Both YAML files are read once, at construction time, through the project
    helper ``read_yaml``; the results are kept on ``self.config`` and
    ``self.params``. The directory named by ``config.artifacts_root`` is passed
    to the project helper ``create_dirs`` as a side effect of construction.
    """

    def __init__(
        self,
        config_fpath = CONFIG_FILE_PATH,
        params_fpath = PARAMS_FILE_PATH
    ):
        """Read the config and params files and prepare the artifacts root.

        Args:
            config_fpath: Location of the main YAML config file.
            params_fpath: Location of the YAML params file.
        """
        self.config = read_yaml(config_fpath)
        self.params = read_yaml(params_fpath)

        create_dirs([self.config.artifacts_root])

    def get_ingestion(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        The section's ``root_dir`` is handed to ``create_dirs`` before the
        config object is built.

        Returns:
            A ``DataIngestionConfig`` whose path-typed fields are real
            ``pathlib.Path`` objects.
        """
        section = self.config.data_ingestion

        create_dirs([section.root_dir])

        # The dataclass annotates these three fields as Path; coerce here so the
        # declared types hold no matter what type the YAML loader produced.
        # Path(...) accepts both str and Path, so this is safe either way.
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_url=section.source_url,
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )

In [31]:
import os
import urllib.request as request
import zipfile

from Summarize4Me.logging import logger
from Summarize4Me.utils.common import get_size

In [32]:
class DataIngestion:
    """Fetch the dataset archive described by a config object and unpack it.

    The config only needs to expose ``source_url``, ``local_data_file`` and
    ``unzip_dir`` attributes.
    """

    def __init__(self, config: "DataIngestionConfig"):
        """Keep a reference to the ingestion settings.

        Args:
            config: Settings object providing ``source_url``,
                ``local_data_file`` and ``unzip_dir``.
        """
        self.config = config

    def download_file(self):
        """Download ``source_url`` to ``local_data_file`` unless it already exists.

        The transfer is written to a ``.part`` file next to the destination and
        only moved into place once it has finished. An interrupted or failed
        download therefore never leaves a truncated file at
        ``local_data_file`` that a later run would mistake for a complete one.

        Raises:
            Whatever ``urllib.request.urlretrieve`` or the final rename raise;
            the partial file is removed before the exception propagates.
        """
        target = self.config.local_data_file

        if os.path.exists(target):
            logger.info(f"File already exists of size: {get_size(Path(target))}")
            return

        # os.fspath handles both str and Path destinations.
        partial = os.fspath(target) + ".part"
        try:
            _, headers = request.urlretrieve(
                url=self.config.source_url,
                filename=partial,
            )
            # Same-directory rename: the destination appears only when complete.
            os.replace(partial, target)
        finally:
            # No-op after a successful rename; cleans up after any failure,
            # including a KeyboardInterrupt from the notebook.
            if os.path.exists(partial):
                os.remove(partial)

        logger.info(f"{target} download with following \n{headers}")

    def extract_zip(self):
        """Unpack the archive at ``local_data_file`` into ``unzip_dir``.

        The destination directory is created first if it does not exist.
        """
        destination = self.config.unzip_dir
        os.makedirs(destination, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as archive:
            archive.extractall(destination)


In [34]:
# Run the ingestion stage end to end: load settings, download the archive
# (skipped when it is already on disk), then unpack it.
# Errors are deliberately left to propagate unchanged; wrapping these calls in
# `try/except Exception as e: raise e` handled nothing and only added a frame
# to the traceback.
config = ConfigManager()
data_ingestion_config = config.get_ingestion()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_file()
data_ingestion.extract_zip()

[2023-06-05 22:25:26,172: INFO: common: YAML File: config/config.yaml loaded]
[2023-06-05 22:25:26,175: INFO: common: YAML File: params.yaml loaded]
[2023-06-05 22:25:26,176: INFO: common: Created dir at: artifacts]
[2023-06-05 22:25:26,177: INFO: common: Created dir at: artifacts/data_ingestion]
[2023-06-05 22:30:33,175: INFO: 1428535947: artifacts/data_ingestion/data.zip download with following 
Connection: close
Content-Length: 7903594
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "dbc016a060da18070593b83afff580c9b300f0b6ea4147a7988433e04df246ca"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: 9BB6:25B2:118EDFA:1528BC0:647E137F
Accept-Ranges: bytes
Date: Mon, 05 Jun 2023 16:55:28 GMT
Via: 1.1 varnish
X-Served-By: cache-maa10238-MAA
X-Cache: MISS
X-Cache-Hits: 0
X-Timer: S1685984127.9403