In [1]:
import os

In [2]:
%pwd

'/Users/main/Desktop/LSTM_Forecast/research'

In [3]:
# Step up one directory (from research/ to the project root, as the %pwd
# outputs around this cell show) so relative paths such as
# 'artifacts/data_ingestion/apple.csv' resolve.
# NOTE: the path is relative, so re-running this cell climbs one more level
# each time; run it once per kernel session.
os.chdir('../')

In [4]:
%pwd

'/Users/main/Desktop/LSTM_Forecast'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable (frozen) settings for the data-transformation stage."""

    # Stage directory; filled from the `root_dir` entry of the
    # `data_transformation` config section.
    root_dir: Path
    # Filled from the `data_path` entry of the same config section.
    data_path: Path

In [6]:
from TimeSeriesForecast.constants import *
from TimeSeriesForecast.config.configuration import read_yaml,create_directories

In [7]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Path passed to ``read_yaml``; result stored as ``self.config``.
            params_filepath: Path passed to ``read_yaml``; result stored as ``self.params``.
            schema_filepath: Path passed to ``read_yaml``; result stored as ``self.schema``.
        """
        # Loaded in this fixed order: config, params, schema.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute_name, yaml_path in yaml_sources:
            setattr(self, attribute_name, read_yaml(yaml_path))

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the stage directory and return the data-transformation settings.

        Returns:
            A ``DataTransformationConfig`` whose ``root_dir`` and ``data_path``
            come from the ``data_transformation`` section of ``self.config``.
        """
        stage_settings = self.config.data_transformation

        # The stage directory is created before the config object is handed out.
        create_directories([stage_settings.root_dir])

        return DataTransformationConfig(
            root_dir=stage_settings.root_dir,
            data_path=stage_settings.data_path,
        )
        

In [8]:
import pandas as pd

In [9]:
%pwd

'/Users/main/Desktop/LSTM_Forecast'

In [10]:
from notebooks.src.feature import TimeConfig
from notebooks.src.functions_torch import TransformLag


In [11]:
def data_transformation_pipeline(path: str , columns: list[str])->pd.DataFrame:
    """Load a tabular file and return the requested columns indexed by date.

    Supported inputs are ``.csv``, ``.xlsx``/``.xls`` and ``.json`` files
    (extension matched case-insensitively). The ``Date`` column is always
    selected, parsed with ``pd.to_datetime``, used to sort the rows and then
    set as the index. Text columns have ``$`` stripped and are cast to float,
    and a ``Close/Last`` column is renamed to ``Value``.

    Args:
        path: Location of the file to load.
        columns: Columns to keep. ``'Date'`` is added automatically when
            absent; the caller's list is not modified.

    Returns:
        The transformed frame. An empty ``DataFrame`` is returned when the
        file cannot be loaded (the reason is printed) or contains no rows.

    Raises:
        ValueError: If any requested column (or ``Date``) is missing from the
            loaded data.
    """
    try:
        if not os.path.exists(path):
            raise FileNotFoundError(f"File not found: {path}")
        _, file_extension = os.path.splitext(path)
        normalized_extension = file_extension.lower()

        if normalized_extension == '.csv':
            df = pd.read_csv(path, sep=',', encoding='utf-8')
        elif normalized_extension in ['.xlsx', '.xls']:
            df = pd.read_excel(path)
        elif normalized_extension == '.json':
            df = pd.read_json(path)
        else:
            raise ValueError(f"Unsupported File Extension{file_extension}")

    # Every handler returns an empty frame so `df` is never used unbound below.
    # Order matters: the pandas errors subclass ValueError and
    # FileNotFoundError subclasses IOError, so the specific ones come first.
    except FileNotFoundError:
        print(f"File not found:{path}")
        return pd.DataFrame()
    except pd.errors.EmptyDataError:
        print(f"No data: The file at the path {path} is empty")
        return pd.DataFrame()
    except pd.errors.ParserError:
        print(f"Parsing error: The data at {path} could not be parsed")
        return pd.DataFrame()
    except IOError as e:
        print(f"IO eror: {e}")
        return pd.DataFrame()
    except ValueError as e:
        print(f"Value error: {e}")
        return pd.DataFrame()

    if df.empty:
        return pd.DataFrame()

    # Work on a copy so the caller's list is not mutated across calls.
    selected_columns = list(columns)
    if 'Date' not in selected_columns:
        selected_columns.append('Date')

    missing_columns = [col for col in selected_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing columns in the dataframe: {missing_columns}")

    df = df[selected_columns].copy()
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values(by='Date').set_index('Date')

    for column in df.columns:
        series = df[column]
        # Covers both plain object columns and dedicated string dtypes.
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            df[column] = series.str.replace('$', '', regex=False).astype(float)

    if 'Close/Last' in df.columns:
        df = df.rename(columns={'Close/Last': 'Value'})

    return df

In [12]:
# Run the pipeline on the ingested apple.csv, requesting only 'Close/Last',
# then show the resulting frame as the cell output.
new_data = data_transformation_pipeline(
    path='artifacts/data_ingestion/apple.csv',
    columns=['Close/Last'],
)
new_data

Unnamed: 0_level_0,Value
Date,Unnamed: 1_level_1
2014-05-30,22.6072
2014-06-02,22.4518
2014-06-03,22.7693
2014-06-04,23.0293
2014-06-05,23.1197
...,...
2024-05-22,190.9000
2024-05-23,186.8800
2024-05-24,189.9800
2024-05-28,189.9900


In [13]:
def transformation_data(path:Path)->pd.DataFrame:
    """Run ``TimeConfig.process_chain`` on ``path`` for the 'Close/Last' column.

    Args:
        path: File handed to ``TimeConfig.process_chain``.

    Returns:
        The object returned by ``TimeConfig.process_chain`` (the captured
        output of this cell shows a frame with a ``Date`` index and a
        ``Value`` column).
    """
    # Forward the caller's path instead of a hard-coded file, and return the
    # result so the declared return type holds and callers can use the frame.
    return TimeConfig.process_chain(path=path, columns=['Close/Last'])


# Preview the first rows as the cell's rich output.
transformation_data('artifacts/data_ingestion/apple.csv').head()

              Value
Date               
2014-05-30  22.6072
2014-06-02  22.4518
2014-06-03  22.7693
2014-06-04  23.0293
2014-06-05  23.1197


In [None]:
def data_transformation_pipeline(path: str , columns: list[str])->pd.DataFrame:
    """Load a tabular file and return the requested columns indexed by date.

    Supported inputs are ``.csv``, ``.xlsx``/``.xls`` and ``.json`` files
    (extension matched case-insensitively). The ``Date`` column is always
    selected, parsed with ``pd.to_datetime``, used to sort the rows and then
    set as the index. Text columns have ``$`` stripped and are cast to float,
    and a ``Close/Last`` column is renamed to ``Value``.

    Args:
        path: Location of the file to load.
        columns: Columns to keep. ``'Date'`` is added automatically when
            absent; the caller's list is not modified.

    Returns:
        The transformed frame. An empty ``DataFrame`` is returned when the
        file cannot be loaded (the reason is printed) or contains no rows.

    Raises:
        ValueError: If any requested column (or ``Date``) is missing from the
            loaded data.
    """
    try:
        if not os.path.exists(path):
            raise FileNotFoundError(f"File not found: {path}")
        _, file_extension = os.path.splitext(path)
        normalized_extension = file_extension.lower()

        if normalized_extension == '.csv':
            df = pd.read_csv(path, sep=',', encoding='utf-8')
        elif normalized_extension in ['.xlsx', '.xls']:
            df = pd.read_excel(path)
        elif normalized_extension == '.json':
            df = pd.read_json(path)
        else:
            raise ValueError(f"Unsupported File Extension{file_extension}")
        # No return here: the loaded frame still has to go through the
        # column selection and cleaning steps below.

    # Every handler returns an empty frame so `df` is never used unbound below.
    # Order matters: the pandas errors subclass ValueError and
    # FileNotFoundError subclasses IOError, so the specific ones come first.
    except FileNotFoundError:
        print(f"File not found:{path}")
        return pd.DataFrame()
    except pd.errors.EmptyDataError:
        print(f"No data: The file at the path {path} is empty")
        return pd.DataFrame()
    except pd.errors.ParserError:
        print(f"Parsing error: The data at {path} could not be parsed")
        return pd.DataFrame()
    except IOError as e:
        print(f"IO eror: {e}")
        return pd.DataFrame()
    except ValueError as e:
        print(f"Value error: {e}")
        return pd.DataFrame()

    if df.empty:
        return pd.DataFrame()

    # Work on a copy so the caller's list is not mutated across calls.
    selected_columns = list(columns)
    if 'Date' not in selected_columns:
        selected_columns.append('Date')

    missing_columns = [col for col in selected_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing columns in the dataframe: {missing_columns}")

    df = df[selected_columns].copy()
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values(by='Date').set_index('Date')

    for column in df.columns:
        series = df[column]
        # Covers both plain object columns and dedicated string dtypes.
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            df[column] = series.str.replace('$', '', regex=False).astype(float)

    if 'Close/Last' in df.columns:
        df = df.rename(columns={'Close/Last': 'Value'})

    return df

In [13]:
# Build the manager with its default file paths, then fetch the
# data-transformation settings (this also creates the stage directory).
conf = ConfigurationManager()
config = conf.get_data_transformation_config()

[2024-07-19 00:33:00,712:INFO:common:yaml file: config/config.yaml loaded successfully]
[2024-07-19 00:33:00,716:INFO:common:yaml file: params.yaml loaded successfully]
[2024-07-19 00:33:00,717:INFO:common:yaml file: schema.yaml loaded successfully]
[2024-07-19 00:33:00,719:INFO:common:created directory at: artifacts]
[2024-07-19 00:33:00,720:INFO:common:created directory at: artifacts/data_transformation]
