In [13]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout
from sklearn.preprocessing import MinMaxScaler
import joblib # for saving the scaler
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pickle
import tensorflow.lite as tflite
import random



Seed = 42
random.seed(Seed)
np.random.seed(Seed)
tf.random.set_seed(Seed)
os.environ['TF_DETERMINISTIC_OPS'] ='1'
os.environ['TF_CUDNN_DETERMINISTIC'] ='1'


def preprocess_stock_data(csv_path,date_column ='Date',close_column = 'Close',test_size = 0.2,validation_size = 0.1,time_steps=60,scaler_output_file ='/workspaces/stock_price_prediction.model/Output/scaler.pkl'):
    """Prepare stock market price data for time series modelling, with training, validation, and test split.
                          
       Args:
            csv_path (str): Path to the CSV file containing the data.
            date_column(str): Name of the date column in the CSV
            close_column(str): Name of the closing price in the CSV
            test_size(float) : proportion of dividing the dataset
            scaler_output_file(str): Path to save the fitted MinMaxScaler instance

            Return:

            X_train(numpy.ndarray): training data
            y_train (numpy.ndarray): Training labels.
            X_val (numpy.ndarray): Validation data (features).
            y_val (numpy.ndarray): Validation labels.
            X_test (numpy.ndarray): Testing data (features).
            y_test (numpy.ndarray): Testing labels.
    """
    try:
         #Load the dataset

        if not os.path.exists(csv_path):
            raise FileNotFoundError(f'CSV file not found at path:{csv_path}')
        data = pd.read_csv(csv_path, parse_dates=[date_column])
        data.sort_values(by=date_column, inplace=True)
        print(f'CSV file successfully loaded. Column:{data.columns}')
    except Exception as e:
        raise  ValueError(f"error reading CSV file or parsing date column'{date_column}':{e}")
    # Validation columns    
    if date_column not in data.columns or close_column not in data.columns:

        raise ValueError(f"Columns '{date_column}' or '{close_column}' not found in dataset. Available columns: {data.columns}")

    #Extract and scale
    close_prices = data[close_column].values.reshape(-1, 1)
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_close = scaler.fit_transform(close_prices)
    # Save the scaler
    try:
    

        # Save the scaler to a file
        with open(scaler_output_file, 'wb') as f:
            pickle.dump(scaler, f)
        print(f"Scaler saved to {scaler_output_file}")
    except Exception as e:
        raise IOError(f'Error during scaling or saving scaler:{e}')
    # Create sequences of time_steps
    X, y = [], []
    try:
        for i in range(time_steps, len(scaled_close)):
            X.append(scaled_close[i - time_steps:i])
            y.append(scaled_close[i])
    except Exception as e:
        raise ValueError(f"Error creating sequences of time_steps: {e}")
    
    X, y = np.array(X), np.array(y)
    
    # Split data into train, validation, and test sets
    try:
        total_size = len(X)
        test_size_count = int(test_size * total_size)
        validation_size_count = int(validation_size * total_size)

        train_end_idx = total_size - test_size_count - validation_size_count
        val_end_idx = total_size - test_size_count

        X_train, X_val, X_test = X[:train_end_idx], X[train_end_idx:val_end_idx], X[val_end_idx:]
        y_train, y_val, y_test = y[:train_end_idx], y[train_end_idx:val_end_idx], y[val_end_idx:]

        # Adjust splits to match desired sample sizes
        train_samples = 554
        val_samples = 55
        test_samples = 84

        X_train, y_train = X[:train_samples], y[:train_samples]
        X_val, y_val = X[train_samples:train_samples + val_samples], y[train_samples:train_samples + val_samples]
        X_test, y_test = X[train_samples + val_samples:train_samples + val_samples + test_samples], y[train_samples + val_samples:train_samples + val_samples + test_samples]
    except Exception as e:
        raise ValueError(f"Error splitting data into train, validation, and test sets: {e}")
    print(f"Data split: {len(X_train)} training samples, {len(X_val)} validation samples, {len(X_test)} testing samples")
    
    return X_train, y_train, X_val, y_val, X_test, y_test


path = "/workspaces/stock_price_prediction.model/Inputs/hdfc_bank_data.csv"

X_train, y_train, X_val, y_val, X_test, y_test = preprocess_stock_data(path)










CSV file successfully loaded. Column:Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')
Scaler saved to /workspaces/stock_price_prediction.model/Output/scaler.pkl
Data split: 554 training samples, 55 validation samples, 72 testing samples
