In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
def load_data(data_dir='../data/raw'):
    """
    Load the raw datasets used by this notebook.

    Parameters
    ----------
    data_dir : str or pathlib.Path, optional
        Directory holding the raw files. Defaults to '../data/raw', which is
        resolved relative to the current working directory.

    Returns
    -------
    tuple
        ``(continuous_data, forecast_data, train_data, test_data)``.
        The first two are DataFrames read from CSV. The last two are read
        with ``sheet_name=None``, so each is a dict mapping sheet name to
        the DataFrame of that sheet.

    Raises
    ------
    FileNotFoundError
        If ``data_dir`` is not an existing directory, or one of the expected
        files is missing from it.
    """
    data_dir = Path(data_dir)

    # Fail early with the resolved path: the default is relative, so the usual
    # cause of a failure here is running the notebook from another directory.
    if not data_dir.is_dir():
        raise FileNotFoundError(f"Data directory not found: {data_dir.resolve()}")

    # load continuous dataset
    continuous_data = pd.read_csv(data_dir / 'continuous-dataset.csv')

    # load weekly pre-dispatch forecast
    forecast_data = pd.read_csv(data_dir / 'weekly-pre-dispatch-forecast.csv')

    # load train and test dataframes (sheet_name=None reads every sheet)
    train_data = pd.read_excel(data_dir / 'train_dataframes.xlsx', sheet_name=None)
    test_data = pd.read_excel(data_dir / 'test_dataframes.xlsx', sheet_name=None)

    return continuous_data, forecast_data, train_data, test_data

In [None]:
def _print_frame_summary(label, frame):
    """Print the first rows and the ``info()`` report of ``frame`` under ``label``."""
    print(f"----\n{label} samples:\n")
    print(frame.head())
    print(f"----\n{label} info:\n")
    # DataFrame.info() writes its report itself and returns None; wrapping it
    # in print() would add a stray "None" line after every report.
    frame.info()


def summarize_data(continuous_data, forecast_data, train_data):
    """
    Print a short summary of the continuous, forecast and train data.

    For each frame the first rows (``head()``) and the ``info()`` report are
    written to standard output. Only the first sheet of ``train_data`` is
    summarized.

    Parameters
    ----------
    continuous_data : pandas.DataFrame
        Frame summarized under the "Continuous data" heading.
    forecast_data : pandas.DataFrame
        Frame summarized under the "Forecast data" heading.
    train_data : dict
        Mapping of sheet name to DataFrame; the first value in iteration
        order is summarized. An empty mapping is reported and skipped.

    Returns
    -------
    None
    """
    _print_frame_summary("Continuous data", continuous_data)
    _print_frame_summary("Forecast data", forecast_data)

    if not train_data:
        # Nothing to index into; report it instead of failing after the
        # first two summaries were already printed.
        print("----\nTrain data: no sheets to summarize\n")
        return

    first_sheet = next(iter(train_data.values()))
    _print_frame_summary("Train data (sheet 0)", first_sheet)


In [None]:
# Read all raw inputs once. train_data and test_data are read with
# sheet_name=None, so each is a dict of sheet name -> DataFrame.
continuous_data, forecast_data, train_data, test_data = load_data()

In [None]:
# Print head() and info() for the continuous data, the forecast data and the
# first train sheet (test_data is not summarized here).
summarize_data(continuous_data, forecast_data, train_data)

In [None]:
def preprocess_sheet(data, sheet_name):
    """
    Report missing values for a single train/test sheet and return the sheet.

    Parameters
    ----------
    data : pandas.DataFrame
        The sheet to inspect.
    sheet_name : str
        Label used in the printed header.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; nothing is modified yet.
    """
    header = f"Missing values in {sheet_name}:"
    print(header)

    null_counts = data.isnull().sum()
    print(null_counts)

    # Placeholder for real preprocessing: missing-value handling (for example
    # a forward fill) and feature engineering such as lag features or rolling
    # statistics would go here. Until then the sheet passes through untouched.
    return data


In [None]:
def preprocess_train_test_data(train_data, test_data):
    """
    Run ``preprocess_sheet`` over every train sheet, then every test sheet.

    Parameters
    ----------
    train_data : dict
        Mapping of sheet name to the train frame for that sheet.
    test_data : dict
        Mapping of sheet name to the test frame for that sheet.

    Returns
    -------
    tuple of dict
        ``(processed_train, processed_test)``, each keyed by the original
        sheet names.
    """
    def _process_split(sheets, split):
        # Announce each sheet, then preprocess it under a "<split>_<sheet>" label.
        processed = {}
        for name, frame in sheets.items():
            print(f"Processing {split} sheet {name} with {len(frame)} entries")
            processed[name] = preprocess_sheet(frame, f"{split}_{name}")
        return processed

    # Tuple elements are evaluated left to right: all train sheets first.
    return _process_split(train_data, "train"), _process_split(test_data, "test")


In [None]:
# Keep the processed sheets under descriptive names so later cells can use
# them. Binding them to `_` discarded the results and also rebinds the name
# IPython uses for the most recent cell output.
processed_train, processed_test = preprocess_train_test_data(train_data, test_data)