In [1]:
# Standard-library module; the next cell uses it to change the working directory.
import os
# IPython magic: display the kernel's current working directory.
%pwd

'e:\\project_ineuron\\Air_Quality_Index_Predictor\\research'

In [2]:
os.chdir("../")
%pwd

'e:\\project_ineuron\\Air_Quality_Index_Predictor'

In [3]:
from dataclasses import dataclass, field
from pathlib import Path
from typing import List

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for downloading data from AstraDB to a local file.

    The password is excluded from the generated ``repr`` so that printing or
    logging a config object does not expose the credential. Field order,
    names and equality semantics are unchanged.
    """
    root_dir: Path  # Root directory where data will be stored
    secure_connect_bundle: Path  # Path to the secure connect bundle for connecting to AstraDB
    username: str  # AstraDB username
    password: str = field(repr=False)  # AstraDB password (kept out of repr)
    keyspace: str  # Keyspace in AstraDB
    table_name: List[str]  # Table name(s) to read from in AstraDB
    region_name: str  # Region name in AstraDB
    output_file: Path  # Path to save the downloaded data

In [4]:
from Air_Quality_Index_Predictor.constants import *
from Air_Quality_Index_Predictor.utils.common import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    """Load the project's YAML files and build typed configuration objects.

    On construction both YAML files are read with ``read_yaml`` and the
    ``artifacts_root`` entry of the main config is passed to
    ``create_directories``. The parsed params are kept on ``self.params``;
    nothing in this class reads them.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read the config and params YAML files and prepare the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the ``data_ingestion`` section.

        The ingestion root directory is passed to ``create_directories``
        before the config object is built. Path-like entries are converted to
        ``Path`` so the values match the types declared on
        ``DataIngestionConfig``; all other entries are passed through as read
        from the YAML.

        Returns:
            DataIngestionConfig: settings for the data-ingestion stage.
        """
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        return DataIngestionConfig(
            root_dir=Path(config.root_dir),
            secure_connect_bundle=Path(config.secure_connect_bundle),
            username=config.username,
            password=config.password,
            keyspace=config.keyspace,
            table_name=config.table_name,
            region_name=config.region_name,
            output_file=Path(config.output_file),
        )

In [6]:
from cassandra.cluster import Cluster
import pandas as pd
from cassandra.auth import PlainTextAuthProvider
from Air_Quality_Index_Predictor.logging import logger


In [7]:
class DataIngestion:
    """Download table data from AstraDB and save it as a CSV file.

    Connection settings, the table(s) to read and the CSV destination all
    come from the ``DataIngestionConfig`` given to the constructor.
    """

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings; no connection is opened here."""
        self.config = config

    def download_data(self):
        """Read every row of the configured table(s) and write them to ``config.output_file``.

        ``config.table_name`` may be a single table name or a list of names.
        With several tables, each is queried in turn and the rows are
        concatenated into one CSV.

        Raises:
            ValueError: If no table name is configured.
            Exception: Any driver or pandas error is logged and re-raised
                unchanged.
        """
        # Create a connection to the AstraDB cluster
        protocol_version = 4
        cloud_config = {'secure_connect_bundle': str(self.config.secure_connect_bundle)}
        auth_provider = PlainTextAuthProvider(username=self.config.username, password=self.config.password)
        cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider, protocol_version=protocol_version)

        session = None
        try:
            # Connect inside the try block so that a failed connection attempt
            # still reaches the finally clause and releases the cluster.
            session = cluster.connect(self.config.keyspace)
            # NOTE(review): the driver documents this timeout in seconds, so
            # 5000 is roughly 83 minutes; confirm that is the intended limit.
            session.default_timeout = 5000

            # Accept either one table name or a list of table names.
            table_names = self.config.table_name
            if table_names is None:
                table_names = []
            elif isinstance(table_names, str):
                table_names = [table_names]
            else:
                table_names = list(table_names)
            if not table_names:
                raise ValueError("No table name configured for data ingestion")

            frames = []
            for table in table_names:
                # CQL cannot bind identifiers as parameters, so the table name
                # is interpolated; it comes from the project configuration.
                query = f"SELECT * FROM {table}"
                result_set = session.execute(query)
                frames.append(pd.DataFrame(list(result_set)))

            df = frames[0] if len(frames) == 1 else pd.concat(frames, ignore_index=True)

            df.to_csv(self.config.output_file, index=False)
            logger.info(f"Data downloaded and saved to {self.config.output_file}")

        except Exception as e:
            logger.error(f"Error downloading data: {e}")
            # Bare raise keeps the original traceback intact.
            raise

        finally:
            # Close the session (if one was opened) and the cluster
            if session is not None:
                session.shutdown()
            cluster.shutdown()


In [10]:
# Run the data-ingestion stage end to end: load the configuration, build the
# ingestion component, then download the table data to CSV.
# No try/except wrapper here: download_data already logs failures, and letting
# the exception propagate keeps its original traceback.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_data()

[2024-05-05 16:47:36,917 : INFO : common : yaml file: config\config.yaml loaded successfully]
[2024-05-05 16:47:36,922 : INFO : common : yaml file: params.yaml loaded successfully]
[2024-05-05 16:47:36,948 : INFO : common : Created directory at: artifacts]
[2024-05-05 16:47:36,958 : INFO : common : Created directory at: artifacts/data_ingestion]
[2024-05-05 16:47:39,900 : INFO : policies : Using datacenter 'eu-west-1' for DCAwareRoundRobinPolicy (via host '1719142e-d68a-4d8a-96b1-b28dda57b8a7-eu-west-1.db.astra.datastax.com:29042:7cf4a33d-de22-449c-af7e-d307eaf194b4'); if incorrect, please specify a local_dc to the constructor, or limit contact points to local cluster nodes]
[2024-05-05 16:54:12,140 : INFO : 244541548 : Data downloaded and saved to artifacts/data_ingestion/data.csv]
