In [1]:
import os 

In [3]:
# Move the working directory one level up (presumably to the project root, so
# that the `src....` imports and relative paths used below resolve - TODO confirm).
# NOTE: not idempotent - each re-run of this cell climbs one more directory,
# so run it only once per kernel session.
os.chdir("../")

In [4]:
pwd%

'c:\\Users\\amenm\\OneDrive\\Desktop\\Predecting_BTC_Price\\Bitcoin_predection_price'

In [5]:
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Read-only bundle of settings for the data-preprocessing stage.

    The field order below is part of the generated constructor signature and
    must not be rearranged.
    """

    # Directory the source CSV is read from (see DataPreprocessing.load_data).
    root_dir: Path
    # Directory that receives the unscaled train/val/test CSV files.
    data_dir: Path
    # Base name of the CSV file, without the ".csv" extension.
    dataset_name: str
    # Directory that receives the scaled train/val/test CSV files.
    data_scaled_dir: Path
    # Directory that receives the windowed .npy arrays.
    data_final_dir: Path
    # Column names kept from the raw frame.
    features: list[str]
    # Number of past time steps in every input window.
    look_back: int
    # Number of future time steps in every target window.
    forecast_horizon: int


In [6]:
from src.LSTM_BTC_Prediction.constants  import *
from src.LSTM_BTC_Prediction.utils.common import read_yaml,create_directories

In [7]:
class ConfigurationManager:
    """Reads the project YAML files and builds per-stage configuration objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Parse both YAML files and create the top-level artifacts directory.

        Args:
            config_filepath: path handed to ``read_yaml`` for the main config.
            params_filepath: path handed to ``read_yaml`` for the parameters.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Every stage writes below this directory, so make sure it exists first.
        create_directories([self.config.artifacts_root])

    def get_data_spliting_config(self) -> DataPreprocessingConfig:
        """Assemble the settings of the ``data_preprocessing`` stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataPreprocessingConfig`` whose paths, dataset name and feature
            list come from the ``data_preprocessing`` section of the config
            file, and whose window sizes come from ``LOOK_BACK`` and
            ``FORECAST_HORIZON`` in the params file.
        """
        stage_cfg = self.config.data_preprocessing

        create_directories([stage_cfg.root_dir])

        return DataPreprocessingConfig(
            root_dir=stage_cfg.root_dir,
            data_dir=stage_cfg.data_dir,
            dataset_name=stage_cfg.dataset_name,
            data_scaled_dir=stage_cfg.data_scaled_dir,
            data_final_dir=stage_cfg.data_final_dir,
            features=stage_cfg.features,
            look_back=self.params.LOOK_BACK,
            forecast_horizon=self.params.FORECAST_HORIZON,
        )


In [8]:
import pandas as pd 
from src.LSTM_BTC_Prediction import logger 
from sklearn.preprocessing import MinMaxScaler
import numpy as np 
from typing import Tuple

In [9]:
class DataPreprocessing:
    """Turns the raw CSV into scaled, windowed arrays for an LSTM model.

    The methods are meant to be called in this order, each one storing its
    result on ``self`` for the next:
    ``load_data`` -> ``select_features`` -> ``split_dataset`` ->
    ``transform_data`` -> ``transform_generator`` -> the ``save_*`` methods.
    Every step logs its outcome and re-raises any exception it hits.
    """

    def __init__(self, config: DataPreprocessingConfig):
        """Store the stage configuration; no I/O happens here."""
        self.config = config

    def load_data(self):
        """Read ``<root_dir>/<dataset_name>.csv`` into ``self.df``."""
        try:
            file_path = os.path.join(self.config.root_dir, f"{self.config.dataset_name}.csv")
            self.df = pd.read_csv(file_path)
            logger.info(f"Data loaded successfully for {self.config.dataset_name}")
        except Exception as e:
            logger.error(f"Error occurred during data loading for {self.config.dataset_name}: {e}")
            raise

    def select_features(self):
        """Restrict ``self.df`` to the columns listed in ``config.features``."""
        try:
            self.df = self.df[self.config.features]
            logger.info(f"Features selected successfully for {self.config.dataset_name}")
        except Exception as e:
            logger.error(f"Error occurred during feature selection for {self.config.dataset_name}: {e}")
            raise

    def split_dataset(self):
        """Split ``self.df`` by row order into train / validation / test.

        The first 60% of the rows become ``df_train``, the next 20% ``df_val``
        and the remaining rows ``df_test``. Rows are not shuffled.
        """
        try:
            n_rows = len(self.df)
            train_end = int(n_rows * 0.6)
            val_end = int(n_rows * 0.8)
            # Positional slicing, independent of whatever index the frame carries.
            self.df_train = self.df.iloc[:train_end]
            self.df_val = self.df.iloc[train_end:val_end]
            self.df_test = self.df.iloc[val_end:]
            logger.info(f"Dataset split successfully for {self.config.dataset_name}")
        except Exception as e:
            logger.error(f"Error occurred during dataset splitting for {self.config.dataset_name}: {e}")
            raise

    def transform_data(self):
        """Min-max scale each split to the range (0, 1).

        Results are stored in ``df_train_scaled``, ``df_val_scaled`` and
        ``df_test_scaled``. The fitted scalers are local and are not kept.
        """
        try:
            # NOTE(review): each split is fitted with its own scaler, so the same raw
            # value maps to different scaled values in train/val/test, and the val/test
            # min/max leak into their own scaling. The usual practice is to fit on the
            # training split only and reuse that scaler. Behaviour is left unchanged
            # here because it alters the generated datasets - confirm before changing.
            scaler_train = MinMaxScaler(feature_range=(0, 1))
            scaler_val = MinMaxScaler(feature_range=(0, 1))
            scaler_test = MinMaxScaler(feature_range=(0, 1))
            self.df_train_scaled = scaler_train.fit_transform(self.df_train)
            self.df_val_scaled = scaler_val.fit_transform(self.df_val)
            self.df_test_scaled = scaler_test.fit_transform(self.df_test)
            logger.info(f"Data transformed successfully for {self.config.dataset_name}")
        except Exception as e:
            logger.error(f"Error occurred during data transformation for {self.config.dataset_name}: {e}")
            raise

    def dataset_generator_lstm(self, dataset: np.ndarray, look_back: int, forecast_horizon: int) -> Tuple[np.ndarray, np.ndarray]:
        """Cut a 2-D series into overlapping (input, target) windows.

        Args:
            dataset: array of shape ``(n_steps, n_features)``.
            look_back: number of consecutive rows in each input window.
            forecast_horizon: number of rows following the input window that
                form the target.

        Returns:
            ``(X, Y)`` where ``X`` has shape ``(n_windows, look_back, n_features)``
            and ``Y`` has shape ``(n_windows, forecast_horizon)``. ``Y`` is taken
            from column 0 only. When the series is too short for a single
            window, both arrays are empty but keep those trailing dimensions
            (the previous implementation returned shape ``(0,)`` arrays).
        """
        dataset = np.asarray(dataset)
        n_windows = len(dataset) - look_back - forecast_horizon + 1

        if n_windows <= 0:
            # Keep the rank stable so downstream shape checks stay meaningful.
            empty_x = np.empty((0, look_back, dataset.shape[1]), dtype=dataset.dtype)
            empty_y = np.empty((0, forecast_horizon), dtype=dataset.dtype)
            return empty_x, empty_y

        dataX = [dataset[i:i + look_back, :] for i in range(n_windows)]
        dataY = [
            dataset[i + look_back:i + look_back + forecast_horizon, 0]
            for i in range(n_windows)
        ]
        return np.array(dataX), np.array(dataY)

    def transform_generator(self):
        """Build the windowed X/Y arrays for the train, validation and test splits."""
        try:
            look_back = self.config.look_back
            horizon = self.config.forecast_horizon
            self.trainX, self.trainY = self.dataset_generator_lstm(
                dataset=self.df_train_scaled, look_back=look_back, forecast_horizon=horizon
            )
            self.valX, self.valY = self.dataset_generator_lstm(
                dataset=self.df_val_scaled, look_back=look_back, forecast_horizon=horizon
            )
            self.testX, self.testY = self.dataset_generator_lstm(
                dataset=self.df_test_scaled, look_back=look_back, forecast_horizon=horizon
            )
            logger.info("Data transformation and generation completed successfully")
        except Exception as e:
            logger.error(f"Error occurred during data transformation and generation: {e}")
            raise

    def save_final_dataset(self):
        """Write the six windowed arrays as ``.npy`` files into ``config.data_final_dir``."""
        try:
            out_dir = self.config.data_final_dir
            # Create the directory that is actually written to; the former hard-coded
            # "artifacts/data_final" only worked when the config happened to match it.
            os.makedirs(out_dir, exist_ok=True)
            arrays = (
                ("trainX.npy", self.trainX),
                ("trainY.npy", self.trainY),
                ("valX.npy", self.valX),
                ("valY.npy", self.valY),
                ("testX.npy", self.testX),
                ("testY.npy", self.testY),
            )
            for file_name, array in arrays:
                np.save(os.path.join(out_dir, file_name), array)
            logger.info(f"Final datasets saved successfully at: {self.config.data_final_dir}")
        except Exception as e:
            logger.error(f"Error occurred during saving final datasets: {e}")
            raise

    def save_dataset_splited(self):
        """Write the unscaled splits as CSV files into ``config.data_dir``."""
        try:
            out_dir = self.config.data_dir
            # Same fix as in save_final_dataset: create the configured directory.
            os.makedirs(out_dir, exist_ok=True)
            frames = (
                ("train.csv", self.df_train),
                ("val.csv", self.df_val),
                ("test.csv", self.df_test),
            )
            for file_name, frame in frames:
                frame.to_csv(os.path.join(out_dir, file_name))
            logger.info(f"Splited dataset saved successfully for {self.config.dataset_name}")
        except Exception as e:
            logger.error(f"Error occurred during saving splited dataset for {self.config.dataset_name}: {e}")
            raise

    def save_dataset_splited_transformed(self):
        """Write the scaled splits as CSV files into ``config.data_scaled_dir``."""
        try:
            out_dir = self.config.data_scaled_dir
            # Same fix as in save_final_dataset: create the configured directory.
            os.makedirs(out_dir, exist_ok=True)
            scaled = (
                ("train_scaled.csv", self.df_train_scaled),
                ("val_scaled.csv", self.df_val_scaled),
                ("test_scaled.csv", self.df_test_scaled),
            )
            for file_name, values in scaled:
                # The scaled splits are wrapped in a DataFrame purely for CSV export.
                pd.DataFrame(values).to_csv(os.path.join(out_dir, file_name))
            logger.info(f"Splited and transformed dataset saved successfully for {self.config.dataset_name}")
        except Exception as e:
            logger.error(f"Error occurred during saving splited and transformed dataset for {self.config.dataset_name}: {e}")
            raise


In [10]:
# Run the preprocessing stage end to end. The order matters: every step reads
# attributes that the previous one stored on `data_preprocessing`.
# No try/except wrapper is needed: each method already logs its own failure and
# re-raises, and a handler that only re-raises would add nothing.
config_manager = ConfigurationManager()
config = config_manager.get_data_spliting_config()

data_preprocessing = DataPreprocessing(config)
data_preprocessing.load_data()
data_preprocessing.select_features()
data_preprocessing.split_dataset()
data_preprocessing.transform_data()
data_preprocessing.transform_generator()
data_preprocessing.save_dataset_splited()
data_preprocessing.save_dataset_splited_transformed()
data_preprocessing.save_final_dataset()



[2024-05-06 15:19:03,798: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-05-06 15:19:03,800: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-06 15:19:03,801: INFO: common: created directory at: artifacts]
[2024-05-06 15:19:03,801: INFO: common: created directory at: artifacts/data_ingestion]
[2024-05-06 15:19:03,825: INFO: 2674098266: Data loaded successfully for BTC]
[2024-05-06 15:19:03,827: INFO: 2674098266: Features selected successfully for BTC]
[2024-05-06 15:19:03,827: INFO: 2674098266: Dataset split successfully for BTC]
[2024-05-06 15:19:03,834: INFO: 2674098266: Data transformed successfully for BTC]
[2024-05-06 15:19:03,846: INFO: 2674098266: Data transformation and generation completed successfully]
[2024-05-06 15:19:03,878: INFO: 2674098266: Splited dataset saved successfully for BTC]
[2024-05-06 15:19:03,916: INFO: 2674098266: Splited and transformed dataset saved successfully for BTC]
[2024-05-06 15:19:03,936: INFO: 2674098266: 

(1982, 1)