In [13]:
import pandas as pd
import numpy as np
import os
import joblib as joblib
import pickle

from tqdm.auto import tqdm
from datetime import datetime
from scipy.stats import zscore

# from ReMASTER.system import get_data_dir

tqdm.pandas()

In [11]:
# Absolute, machine-specific location of the raw NQ price CSV.
# NOTE(review): this path only resolves on the original author's workstation;
# point it at your own copy of the file before running.
file_path = 'C:\\Users\\amirani\\OneDrive - purdue.edu\\CSDSDATA\\Desktop\\ReMASTER\\data\\NQ_5Years_8_11_2024.csv'

# Read the whole CSV into a single DataFrame.
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the leading rows to sanity-check the columns that were parsed.
data.head()

Unnamed: 0,Time,Open,High,Low,Close,Volume
0,8/11/2019 23:05,9073.25,9098.5,9073.0,9092.5,1758
1,8/11/2019 23:10,9093.25,9095.5,9089.75,9092.75,438
2,8/11/2019 23:15,9093.0,9096.25,9088.0,9089.75,590
3,8/11/2019 23:20,9090.25,9090.25,9086.0,9087.0,278
4,8/11/2019 23:25,9086.75,9088.25,9079.75,9083.75,711


In [12]:
# Step 1: Parse the textual 'Time' values (e.g. '8/11/2019 23:05') into
# datetimes. The explicit format pins month/day/year with a 24-hour clock.
data['Time'] = pd.to_datetime(data['Time'], format='%m/%d/%Y %H:%M')

# Step 2: Discard every *column* (axis=1) that contains at least one NA value.
data = data.dropna(axis=1)

# Step 3: Per-day z-score normalisation of each feature column.
# The calendar date is the grouping key; scipy.stats.zscore is applied to each
# feature column independently within each day.
data['Date'] = data['Time'].dt.date
feature_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
per_day_features = data.groupby('Date')[feature_columns]
data[feature_columns] = per_day_features.transform(zscore)

# Step 4: Trim the 5% most extreme normalised 'Close' values, keeping only the
# rows that lie inside the inclusive [2.5%, 97.5%] quantile band.
q_low, q_high = data['Close'].quantile([0.025, 0.975])
close_in_band = data['Close'].between(q_low, q_high)

# Step 5: Apply the filter, drop any rows that still hold NA values, and
# renumber the index from zero.
data = data[close_in_band].dropna().reset_index(drop=True)

# Step 6: Persist the cleaned frame as CSV, creating the target folder first
# when it does not exist yet.
save_path = 'new_data/reshaped_data.csv'
output_dir = os.path.dirname(save_path)
os.makedirs(output_dir, exist_ok=True)
data.to_csv(save_path, index=False)

# Show the leading rows of the cleaned data as a visual check.
data.head()

Unnamed: 0,Time,Open,High,Low,Close,Volume,Date
0,2019-08-11 23:05:00,-2.256861,1.8026,-2.130639,1.565561,2.904046,2019-08-11
1,2019-08-11 23:10:00,1.334527,1.162968,1.555467,1.634445,-0.020545,2019-08-11
2,2019-08-11 23:15:00,1.289635,1.322876,1.170351,0.807829,0.316226,2019-08-11
3,2019-08-11 23:20:00,0.795819,0.043611,0.730219,0.050098,-0.37504,2019-08-11
4,2019-08-11 23:25:00,0.167326,-0.38281,-0.645194,-0.845403,0.584314,2019-08-11


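Split the data chronologically into train, validation, and test sets: after sorting by time, the earliest 80% of rows form the training set, the next 10% the validation set, and the most recent 10% the test set.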

In [21]:
import os
import pickle
from datetime import datetime
import pandas as pd

# Make sure 'Time' is a datetime column so the sort below is chronological
# (a no-op when the column has already been converted).
data['Time'] = pd.to_datetime(data['Time'])

# Order the rows by timestamp so the positional split below is a time split.
data = data.sort_values(by='Time')

# Split ratios: train 80%, validation 10%; the test set takes whatever rows
# remain, so rounding never loses a row.
train_size = int(len(data) * 0.8)
valid_size = int(len(data) * 0.1)

# Positional (.iloc) slices over the time-sorted frame: oldest rows go to
# train, the next slice to validation, the most recent rows to test.
train_data = data.iloc[:train_size]
valid_data = data.iloc[train_size:train_size + valid_size]
test_data = data.iloc[train_size + valid_size:]

# Every row must land in exactly one split.
assert len(train_data) + len(valid_data) + len(test_data) == len(data)

# Destination .pkl files for each split.
# NOTE(review): these are absolute, machine-specific paths.
train_path = 'C:\\Users\\amirani\\OneDrive - purdue.edu\\CSDSDATA\\Desktop\\ReMASTER\\data\\NQ100\\NQ100_dl_train.pkl'
valid_path = 'C:\\Users\\amirani\\OneDrive - purdue.edu\\CSDSDATA\\Desktop\\ReMASTER\\data\\NQ100\\NQ100_dl_valid.pkl'
test_path = 'C:\\Users\\amirani\\OneDrive - purdue.edu\\CSDSDATA\\Desktop\\ReMASTER\\data\\NQ100\\NQ100_dl_test.pkl'

# Pickle each split. The parent directory is created first so the write does
# not fail with FileNotFoundError when the output folder is missing.
for split_frame, split_path in (
    (train_data, train_path),
    (valid_data, valid_path),
    (test_data, test_path),
):
    split_dir = os.path.dirname(split_path)
    if split_dir:  # dirname is '' when the path has no directory component
        os.makedirs(split_dir, exist_ok=True)
    with open(split_path, 'wb') as f:
        pickle.dump(split_frame, f)

# Display where the three files were written.
train_path, valid_path, test_path

('C:\\Users\\amirani\\OneDrive - purdue.edu\\CSDSDATA\\Desktop\\ReMASTER\\data\\NQ100\\NQ100_dl_train.pkl',
 'C:\\Users\\amirani\\OneDrive - purdue.edu\\CSDSDATA\\Desktop\\ReMASTER\\data\\NQ100\\NQ100_dl_valid.pkl',
 'C:\\Users\\amirani\\OneDrive - purdue.edu\\CSDSDATA\\Desktop\\ReMASTER\\data\\NQ100\\NQ100_dl_test.pkl')

Verify that each saved .pkl file can be loaded back and contains a non-empty DataFrame.

In [22]:
# Function to load and test if data exists in each .pkl file
def test_data_in_pkl(file_path):
    try:
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
        # Check if the DataFrame is not empty
        if isinstance(data, pd.DataFrame) and not data.empty:
            print(f"Data exists in {file_path} and has {len(data)} rows and {len(data.columns)} columns.")
        else:
            print(f"File {file_path} is either empty or not a DataFrame.")
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

# Run the check on the train, validation, and test files, in that order
for pkl_path in (train_path, valid_path, test_path):
    test_data_in_pkl(pkl_path)

Data exists in C:\Users\amirani\OneDrive - purdue.edu\CSDSDATA\Desktop\ReMASTER\data\NQ100\NQ100_dl_train.pkl and has 250369 rows and 7 columns.
Data exists in C:\Users\amirani\OneDrive - purdue.edu\CSDSDATA\Desktop\ReMASTER\data\NQ100\NQ100_dl_valid.pkl and has 31296 rows and 7 columns.
Data exists in C:\Users\amirani\OneDrive - purdue.edu\CSDSDATA\Desktop\ReMASTER\data\NQ100\NQ100_dl_test.pkl and has 31297 rows and 7 columns.
