In [13]:
import os
# Display the kernel's current working directory before any directory change.
%pwd

'e:\\project_ineuron\\Air_Quality_Index_Predictor'

In [2]:
os.chdir("../")
%pwd

'e:\\project_ineuron\\Air_Quality_Index_Predictor'

In [14]:
import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation/future warnings raised by pandas and
# NumPy — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [15]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""
    # Directory into which the stage writes train_dataset.csv and test_dataset.csv.
    root_dir: Path
    # Location of the input CSV that the stage reads with pandas.
    data_path: Path

In [16]:
from Air_Quality_Index_Predictor.constants import *
from Air_Quality_Index_Predictor.utils.common import read_yaml, create_directories

In [17]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage configuration objects.

    On construction both YAML files are read and the directory named by the
    ``artifacts_root`` entry of the main config is created.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read the config and params files and create the artifacts root.

        Args:
            config_filepath: Path of the main YAML config; defaults to
                ``CONFIG_FILE_PATH``.
            params_filepath: Path of the parameters YAML; defaults to
                ``PARAMS_FILE_PATH``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings and create its output directory.

        Returns:
            DataTransformationConfig: ``root_dir`` and ``data_path`` taken from the
            ``data_transformation`` section of the main config.
        """
        stage_settings = self.config.data_transformation

        create_directories([stage_settings.root_dir])

        # The dataclass declares both fields as Path; coerce the values read from
        # YAML so the declared type and the runtime type agree.
        return DataTransformationConfig(
            root_dir=Path(stage_settings.root_dir),
            data_path=Path(stage_settings.data_path),
        )

In [18]:
import os
from Air_Quality_Index_Predictor.utils.common import breakpoints_dict, get_subindex
from Air_Quality_Index_Predictor.logging import logger
import pandas as pd
import numpy as np 
from datetime import datetime,timedelta

In [25]:
class DataTransformation:
    """Clean the raw air-quality CSV, derive an AQI column and write train/test splits.

    ``convert`` is the entry point; the remaining methods are the individual
    steps it chains together and can also be called on their own.
    """

    # Pollutant column names as they appear after ``rename_pollutants_columns``.
    POLLUTANT_COLUMNS = ['CO', 'NO2', 'O3', 'PM10', 'PM2.5', 'SO2']

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage settings (``data_path`` is read, ``root_dir`` is written to)."""
        self.config = config

    def sort_and_set_index(self, df, column_name):
        """Return ``df`` sorted by ``column_name`` with that column as the index.

        Raises:
            KeyError: if ``column_name`` is not a column of ``df``.
        """
        if column_name not in df.columns:
            raise KeyError(f"Column '{column_name}' does not exist in the DataFrame.")

        return df.sort_values(by=column_name).set_index(column_name)

    def rename_pollutants_columns(self, df):
        """Rename the lower-case pollutant columns to their canonical names.

        The frame is modified in place and also returned.
        """
        canonical_names = {
            'co': 'CO',
            'no2': 'NO2',
            'o3': 'O3',
            'pm10': 'PM10',
            'pm2_5': 'PM2.5',
            'so2': 'SO2',
        }
        df.rename(columns=canonical_names, inplace=True)
        return df

    def convert_date_column(self, df, date_column, date_format):
        """Parse ``df[date_column]`` as datetimes using ``date_format``.

        The column is overwritten on ``df``, which is also returned.
        """
        df[date_column] = pd.to_datetime(df[date_column], format=date_format)
        return df

    def set_negative_to_zero(self, df, column_name):
        """Replace negative values in ``df[column_name]`` with 0.

        Non-negative values and missing values are left untouched. The column is
        overwritten on ``df``, which is also returned.
        """
        # The lower bound is 0: the previous ``max(x, 1)`` also rewrote every
        # valid reading in [0, 1) to 1, contradicting the method's contract.
        df[column_name] = df[column_name].clip(lower=0)
        return df

    def round_pollutant_values(self, df, pollutant_columns):
        """Round each of ``pollutant_columns`` to 3 decimal places.

        Raises:
            KeyError: if any requested column is absent. All columns are checked
                before any is rounded, so a failure leaves ``df`` unmodified.
        """
        absent = [column for column in pollutant_columns if column not in df.columns]
        if absent:
            raise KeyError(f"Column '{absent[0]}' does not exist in the DataFrame.")

        for column in pollutant_columns:
            df[column] = df[column].round(3)

        return df

    def fill_missing_dates_with_median(self, df, start_date='2018-01-01', end_date='2024-05-30'):
        """Add a row for every (city, date) missing from the daily range.

        For each city, every date in ``[start_date, end_date]`` without a row gets
        one whose pollutant values are the median of that city's rows in the same
        Monday-to-Sunday week. Dates whose week holds no data for the city stay
        missing.

        Args:
            df: Frame with ``city``, ``date`` and the pollutant columns.
            start_date: First day of the range that must be covered.
            end_date: Last day of the range that must be covered.

        Returns:
            A new frame with original and imputed rows, sorted by city and date
            with a fresh integer index.
        """
        full_date_range = pd.date_range(start=start_date, end=end_date)
        pollutants = list(self.POLLUTANT_COLUMNS)

        # Rows to append, collected as dicts so the frame is built only once.
        new_rows = []

        for city in df['city'].unique():
            city_df = df[df['city'] == city]
            missing_dates = full_date_range.difference(city_df['date'])

            # Every missing day of a week shares one median, so compute it once
            # per week instead of re-filtering the city frame for each day.
            medians_by_week = {}

            for missing_date in missing_dates:
                week_start = missing_date - pd.Timedelta(days=missing_date.weekday())

                if week_start not in medians_by_week:
                    week_end = week_start + pd.Timedelta(days=6)
                    in_week = city_df['date'].between(week_start, week_end)
                    week_data = city_df.loc[in_week, pollutants]
                    medians_by_week[week_start] = (
                        None if week_data.empty else week_data.median()
                    )

                median_values = medians_by_week[week_start]
                if median_values is None:
                    continue

                new_rows.append(
                    {'city': city, 'date': missing_date, **median_values.to_dict()}
                )

        # Skip the concat when nothing is missing: concatenating an empty frame
        # is pointless and triggers a FutureWarning on recent pandas.
        if new_rows:
            df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

        return df.sort_values(by=['city', 'date']).reset_index(drop=True)

    def calculate_subindices(self, df, breakpoints_dict):
        """Compute a sub-index Series for every column named in ``breakpoints_dict``.

        Returns:
            dict mapping each column name to the Series obtained by applying
            ``get_subindex(value, breakpoints)`` element-wise to that column.
        """
        return {
            column: df[column].apply(get_subindex, args=(breakpoints,))
            for column, breakpoints in breakpoints_dict.items()
        }

    def calculate_AQI(self, new_df):
        """Add ``AQI_calculated``: the rounded row-wise maximum of the six sub-indices.

        Rows whose PM2.5 and PM10 sub-indices sum to 0 or less get a missing AQI.
        The column is added to ``new_df``, which is also returned.
        """
        subindex_columns = ["PM2.5_SubIndex", "PM10_SubIndex", "SO2_SubIndex",
                            "NO2_SubIndex", "CO_SubIndex", "O3_SubIndex"]
        new_df["AQI_calculated"] = new_df[subindex_columns].max(axis=1).round()

        no_particulates = (new_df["PM2.5_SubIndex"] + new_df["PM10_SubIndex"]) <= 0
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        new_df.loc[no_particulates, "AQI_calculated"] = np.nan
        return new_df

    def select_columns(self, df, columns_to_keep):
        """Return an independent copy of ``df`` restricted to ``columns_to_keep``."""
        # Copy so that later modifications do not act on a slice of the input.
        return df[columns_to_keep].copy()

    def convert(self):
        """Run the whole transformation and write the train and test CSV files.

        Reads ``config.data_path``, cleans and imputes the data, derives
        ``AQI_calculated``, then splits on the ``date`` index: rows up to and
        including 2023-12-30 go to ``train_dataset.csv``, rows from 2023-12-31
        through 2024-05-31 go to ``test_dataset.csv``, both under
        ``config.root_dir``.
        """
        dataset = pd.read_csv(self.config.data_path)
        logger.info("Data read successfully")

        dataset = self.sort_and_set_index(dataset, 'id')
        logger.info("Data Sorted successfully")

        dataset = self.rename_pollutants_columns(dataset)
        logger.info("Data renamed successfully")

        dataset = self.convert_date_column(dataset, 'date', "%d-%m-%Y")
        logger.info("Date data adjusted successfully")

        dataset = self.set_negative_to_zero(dataset, 'O3')
        logger.info("O3 data adjusted successfully")

        dataset = self.fill_missing_dates_with_median(dataset)
        logger.info("Missing dates filled successfully")

        pollutant_columns = list(self.POLLUTANT_COLUMNS)
        dataset = self.round_pollutant_values(dataset, pollutant_columns)
        logger.info("Date values adjusted successfully")

        subindices = self.calculate_subindices(dataset, breakpoints_dict)
        logger.info("Data subindices calculated successfully")

        for column, subindex_values in subindices.items():
            dataset[f"{column}_SubIndex"] = subindex_values
        logger.info("Data subindex added  successfully")

        dataset = self.calculate_AQI(dataset)
        logger.info("Data AQI calculated successfully")

        dataset = self.select_columns(
            dataset, ['date', 'city', *pollutant_columns, 'AQI_calculated']
        )
        logger.info("Column selected successfully")

        # Label-based date slicing below relies on a sorted date index.
        dataset = dataset.set_index('date').sort_index(ascending=True)

        train_dataset_end = pd.Timestamp(datetime(2023, 12, 30))
        test_dataset_start = train_dataset_end + timedelta(days=1)
        test_dataset_end = pd.Timestamp(datetime(2024, 5, 31))

        train_data = dataset.loc[:train_dataset_end]
        test_data = dataset.loc[test_dataset_start:test_dataset_end]

        train_data.to_csv(os.path.join(self.config.root_dir, "train_dataset.csv"))
        test_data.to_csv(os.path.join(self.config.root_dir, "test_dataset.csv"))
        logger.info("Train and Test data made successfully")


In [27]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.convert()
except Exception:
    # Record the failure with its traceback, then re-raise the same exception.
    # A bare `raise` keeps the original traceback instead of adding a frame the
    # way `raise e` does.
    logger.exception("Data transformation stage failed")
    raise

[2024-06-07 22:20:17,372 : INFO : common : yaml file: config\config.yaml loaded successfully]
[2024-06-07 22:20:17,376 : INFO : common : yaml file: params.yaml loaded successfully]
[2024-06-07 22:20:17,380 : INFO : common : Created directory at: artifacts]
[2024-06-07 22:20:17,382 : INFO : common : Created directory at: artifacts/data_transformation]
[2024-06-07 22:20:17,472 : INFO : 4091047123 : Data read successfully]
[2024-06-07 22:20:17,482 : INFO : 4091047123 : Data Sorted successfully]
[2024-06-07 22:20:17,484 : INFO : 4091047123 : Data renamed successfully]
[2024-06-07 22:20:17,520 : INFO : 4091047123 : Date data adjusted successfully]
[2024-06-07 22:20:17,552 : INFO : 4091047123 : O3 data adjusted successfully]
[2024-06-07 22:20:24,001 : INFO : 4091047123 : Missing dates filled successfully]
[2024-06-07 22:20:24,007 : INFO : 4091047123 : Date values adjusted successfully]
[2024-06-07 22:20:24,231 : INFO : 4091047123 : Data subindices calculated successfully]
[2024-06-07 22:20:2