In [1]:
import os

In [2]:
os.chdir("../")
%pwd



'd:\\Aiprojects\\Textsummarization\\text-summarization'

In [3]:
from dataclasses import dataclass
from pathlib import Path 

@dataclass(frozen=True)
class DataIngestionConfig:
    """Read-only settings record consumed by the data-ingestion step.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Directory that ConfigurationManager hands to create_directory before
    # building this record.
    root_dir: Path
    # Address the dataset archive is downloaded from.
    source_url: str
    # Location of the downloaded archive on disk (written by download_data,
    # opened by extract_zipfile).
    local_data_file: Path
    # Directory the archive contents are extracted into.
    unzip_dir: Path
    

In [4]:
from src.constants import *
from src.utils import read_yaml , create_directory

In [5]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config records.

    Both files are loaded with ``read_yaml``; the results are kept on
    ``self.config`` and ``self.params`` and are accessed by attribute.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Load both YAML files and register the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The artifacts root is passed to create_directory as a one-element list.
        create_directory([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the ``data_ingestion`` section.

        The section's ``root_dir`` is passed to ``create_directory`` first.
        Values are forwarded exactly as read from the YAML.

        NOTE(review): DataIngestionConfig annotates three of these fields as
        Path, but no conversion happens here - confirm whether callers rely on
        receiving Path objects.
        """
        section = self.config.data_ingestion
        create_directory([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_url=section.source_url,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )

In [20]:
import os
import urllib.request as request
import zipfile 
from src.logging.logger import get_logger

import requests , zipfile , io

In [24]:
# Notebook-wide logger object returned by the project's get_logger helper; later
# cells call .info(...) and .error(...) on it.
# NOTE(review): the name `logging` is also the name of the standard-library
# module. The visible cells never import that module explicitly, but a later
# `import logging` would silently rebind this name - consider renaming it
# (together with every use) to `logger`.
logging = get_logger(__name__)

class DataIngestion:
    """Fetches the dataset archive described by a config record and unpacks it.

    Two routes are offered:

    * ``download_data`` followed by ``extract_zipfile`` keeps a copy of the
      archive at ``config.local_data_file`` and skips the download when that
      file is already present.
    * ``download_and_extract`` unpacks straight from the HTTP response body and
      never writes the archive itself to disk.
    """

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, config: "DataIngestionConfig"):
        """Store the config; ``source_url``, ``local_data_file`` and
        ``unzip_dir`` are the attributes read by the methods below."""
        self.config = config

    def download_data(self):
        """Download ``config.source_url`` to ``config.local_data_file`` once.

        If the target file already exists nothing is downloaded. Otherwise the
        archive is first written to ``<local_data_file>.part`` and only renamed
        to its final name after the transfer succeeded, so an interrupted
        download can never be mistaken for a complete archive on the next run.

        Raises:
            Whatever ``urllib.request.urlretrieve`` or the rename raises; the
            partial file is removed before the exception propagates.
        """
        target = self.config.local_data_file
        if os.path.exists(target):
            logging.info("file already exists")
            return

        partial = f"{target}.part"
        try:
            _, headers = request.urlretrieve(url=self.config.source_url, filename=partial)
            os.replace(partial, target)
        except BaseException:
            # Also covers KeyboardInterrupt: never leave a truncated file behind.
            if os.path.exists(partial):
                os.remove(partial)
            raise

        logging.info(f"{target} has been downloaded ! : \n {headers}")

    def extract_zipfile(self):
        """Extract ``config.local_data_file`` into ``config.unzip_dir``.

        The destination directory is created when missing.
        """
        destination = self.config.unzip_dir
        os.makedirs(destination, exist_ok=True)

        with zipfile.ZipFile(self.config.local_data_file, "r") as archive:
            archive.extractall(destination)

    def download_and_extract(self, timeout=30):
        """Fetch ``config.source_url`` with ``requests`` and unpack it in memory.

        Args:
            timeout: Value forwarded to ``requests.get(timeout=...)``. Without
                a timeout an unresponsive server would block this call forever.

        Raises:
            The exception raised by ``raise_for_status()`` on an error status,
            or whatever ``requests`` / ``zipfile`` raise on failure.
        """
        destination = self.config.unzip_dir
        os.makedirs(destination, exist_ok=True)

        # stream=True defers the body transfer until .content is read, so an
        # error status is rejected before the archive is downloaded.
        with requests.get(self.config.source_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
                archive.extractall(destination)

            

In [25]:
from src.Exception import CustomException
import sys
# Run the ingestion stage end to end: load the configuration, build the
# DataIngestion helper and fetch + unpack the archive in one step.
# Alternative route: data_ingestion.download_data() followed by
# data_ingestion.extract_zipfile() keeps the archive at local_data_file.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_and_extract()
except Exception as e:
    # Wrap in the project's exception type, log it, and re-raise with the
    # original exception attached as the explicit cause.
    error = CustomException(e, sys)
    logging.error(error)
    raise error from e

In [17]:
import requests

# Sanity check of the dataset URL: print the HTTP status and declared content type.
url = "https://github.com/Zeyadelgabbas/datasets/raw/refs/heads/main/samsumdata.zip"

# Only the status and headers are needed, so the body is left unread
# (stream=True) and the connection is released by the context manager.
# The timeout keeps an unresponsive server from hanging the cell forever.
with requests.get(url, stream=True, timeout=30) as response:
    print("Status:", response.status_code)  # should be 200
    print("Content-Type:", response.headers.get("content-type"))

Status: 200
Content-Type: application/zip
