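# This notebook prepares the Bitcoin 1-minute price data for modeling (download, cleanup, filtering, train/val/test split, technical indicators, sliding windows, normalization)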

In [2]:
!pip install ta

from google.colab import drive
drive.mount('/content/drive')

import os
import shutil
import kagglehub
import pandas as pd
import numpy as np
from ta.trend import MACD
from ta.volatility import BollingerBands, AverageTrueRange
from ta.momentum import RSIIndicator

# 1) Create the project directory tree on Google Drive

PROJECT_DIR = "/content/drive/MyDrive/bitcoin_project"
os.makedirs(PROJECT_DIR, exist_ok=True)
print("Project folder created:", PROJECT_DIR)

# Subfolders that hold the intermediate artifacts written by later cells.
folders = [
    "raw", "cleaned", "filtered",
    "splits", "features", "windows",
    "normalized", "models", "results",
]

# exist_ok=True makes this cell safe to re-run.
for subfolder in folders:
    os.makedirs(f"{PROJECT_DIR}/{subfolder}", exist_ok=True)

print("All project folders created!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Project folder created: /content/drive/MyDrive/bitcoin_project
All project folders created!


In [2]:
# Download a pinned version (406) of the Kaggle dataset via kagglehub
path = kagglehub.dataset_download("mczielinski/bitcoin-historical-data/versions/406")
print("Downloaded to:", path)

# Collect every CSV file sitting directly in the download directory
csv_files = [name for name in os.listdir(path) if name.endswith(".csv")]
print("CSV files found:", csv_files)

# Copy each CSV straight into the project's raw/ folder (no local staging folder)
for f in csv_files:
    dst = f"{PROJECT_DIR}/raw/{f}"
    src = os.path.join(path, f)
    shutil.copy(src, dst)
    print(f"Saved {f} → {dst}")



Downloading from https://www.kaggle.com/api/v1/datasets/download/mczielinski/bitcoin-historical-data?dataset_version_number=406...


100%|██████████| 97.7M/97.7M [00:02<00:00, 34.4MB/s]

Extracting files...





Downloaded to: /root/.cache/kagglehub/datasets/mczielinski/bitcoin-historical-data/versions/406
CSV files found: ['btcusd_1-min_data.csv']
Saved btcusd_1-min_data.csv → /content/drive/MyDrive/bitcoin_project/raw/btcusd_1-min_data.csv


### Cleanup

In [3]:
# Read the raw 1-minute CSV that was copied into the project's raw/ folder
raw_file_path = f"{PROJECT_DIR}/raw/btcusd_1-min_data.csv"
print("Loading:", raw_file_path)
df = pd.read_csv(raw_file_path)

# Keep only the rows whose Volume differs from zero
has_volume = df["Volume"].ne(0)
df_clean = df.loc[has_volume].copy()

# Persist the result to the cleaned/ folder
clean_path = f"{PROJECT_DIR}/cleaned/btcusd_1-min_data_no_zero_volume.csv"
df_clean.to_csv(clean_path, index=False)

print("Cleaned file saved:", clean_path)

Loading: /content/drive/MyDrive/bitcoin_project/raw/btcusd_1-min_data.csv
Cleaned file saved: /content/drive/MyDrive/bitcoin_project/cleaned/btcusd_1-min_data_no_zero_volume.csv


In [4]:
# Load the cleaned (no zero-volume) file
df = pd.read_csv(clean_path)

# Number of leading rows used to estimate the early-trading volume baseline.
# 30 * 24 * 60 one-minute rows; because zero-volume minutes were already
# removed from this file, the slice covers 30 days *or more* of wall-clock time.
# Deliberately NOT named WINDOW: the sliding-window cell defines WINDOW = 60 and
# re-running this cell must not overwrite that value.
INITIAL_ROWS = 30 * 24 * 60  # 43200
initial = df.head(INITIAL_ROWS)

# Compute early trading threshold: mean + 2 standard deviations of the
# baseline volume
threshold = initial["Volume"].mean() + 2 * initial["Volume"].std()

# Find the first row where volume exceeds this threshold.
# Fail with an explicit message instead of an opaque IndexError when no row
# qualifies (e.g. empty input or a NaN threshold).
above_threshold = df["Volume"] > threshold
if not above_threshold.any():
    raise ValueError(
        f"No row has Volume above the early-trading threshold ({threshold}); "
        "cannot determine where meaningful trading starts."
    )
start_idx = above_threshold.idxmax()  # label of the first True entry

# Keep only meaningful trading data (from the first above-threshold row onward)
df_filtered = df.loc[start_idx:].copy()

# Save the result
filtered_path = f"{PROJECT_DIR}/filtered/btcusd_1-min_data_filtered.csv"
df_filtered.to_csv(filtered_path, index=False)

print("Volume threshold:", threshold)
print("First meaningful trading row:", start_idx)
print("Filtered file saved:", filtered_path)

Volume threshold: 130.1700641941387
First meaningful trading row: 1329
Filtered file saved: /content/drive/MyDrive/bitcoin_project/filtered/btcusd_1-min_data_filtered.csv


### Split the data

In [5]:
# Load the volume-filtered data written by the filtering step
df = pd.read_csv(filtered_path)

# Parse the Timestamp column (epoch seconds) into datetimes
df["Timestamp"] = pd.to_datetime(df["Timestamp"], unit="s")

# Last instants belonging to the training and validation periods
train_end = pd.Timestamp("2023-12-31 23:59:59")
val_end = pd.Timestamp("2024-06-30 23:59:59")

# Chronological split: train <= train_end < val <= val_end < test
ts = df["Timestamp"]
df_train = df.loc[ts <= train_end].copy()
df_val = df.loc[(ts > train_end) & (ts <= val_end)].copy()
df_test = df.loc[ts > val_end].copy()

# Write each split to the splits/ folder
for split_name, split_df in (("train", df_train), ("val", df_val), ("test", df_test)):
    split_df.to_csv(f"{PROJECT_DIR}/splits/{split_name}_raw.csv", index=False)

# Report the size of each split
print("Rows:")
for label, split_df in (("Train", df_train), ("Val", df_val), ("Test", df_test)):
    print(f"{label}:", len(split_df))

Rows:
Train: 5022490
Val: 258266
Test: 684903


In [6]:
def add_indicators(df):
    """Attach technical-indicator columns to a price DataFrame.

    Reads the ``Close``, ``High`` and ``Low`` columns and adds, in this order:
    ``MACD``, ``MACD_signal``, ``BB_width``, ``RSI`` and ``ATR``.

    The columns are written onto ``df`` itself (the input is modified in
    place) and the same object is returned. Early rows may not have indicator
    values yet; dropping them is left to the caller.
    """
    close_px = df["Close"]

    # MACD line and its signal line, using the 12 / 26 / 9 parameterisation
    macd_ind = MACD(close=close_px, window_fast=12, window_slow=26, window_sign=9)
    df["MACD"] = macd_ind.macd()
    df["MACD_signal"] = macd_ind.macd_signal()

    # Bollinger band width: absolute distance between the upper and lower
    # bands (20-period window, 2 standard deviations)
    bands = BollingerBands(close=close_px, window=20, window_dev=2)
    df["BB_width"] = bands.bollinger_hband() - bands.bollinger_lband()

    # 14-period RSI
    df["RSI"] = RSIIndicator(close=close_px, window=14).rsi()

    # 14-period average true range
    df["ATR"] = AverageTrueRange(
        high=df["High"], low=df["Low"], close=close_px, window=14
    ).average_true_range()

    return df

In [14]:
# Locations of the raw chronological splits
train_path = f"{PROJECT_DIR}/splits/train_raw.csv"
val_path   = f"{PROJECT_DIR}/splits/val_raw.csv"
test_path  = f"{PROJECT_DIR}/splits/test_raw.csv"

# For each split: read it, attach the indicator columns, then discard every
# row that contains a NaN (indicator warm-up) and renumber the index
train = add_indicators(pd.read_csv(train_path)).dropna().reset_index(drop=True)
val   = add_indicators(pd.read_csv(val_path)).dropna().reset_index(drop=True)
test  = add_indicators(pd.read_csv(test_path)).dropna().reset_index(drop=True)

# Destination for the feature-augmented CSVs on Drive
features_dir = f"{PROJECT_DIR}/features"
os.makedirs(features_dir, exist_ok=True)

train_features_path = f"{features_dir}/train_features.csv"
val_features_path   = f"{features_dir}/val_features.csv"
test_features_path  = f"{features_dir}/test_features.csv"

for split_df, out_path in (
    (train, train_features_path),
    (val, val_features_path),
    (test, test_features_path),
):
    split_df.to_csv(out_path, index=False)

print("Indicators added and saved.")

NameError: name 'add_indicators' is not defined

In [8]:
# Declare the feature directory and per-split CSV paths so the windowing
# cells can locate the feature files. Nothing is written to disk here apart
# from ensuring the directory exists.
features_dir = f"{PROJECT_DIR}/features"
os.makedirs(features_dir, exist_ok=True)

train_features_path, val_features_path, test_features_path = (
    f"{features_dir}/{split}_features.csv" for split in ("train", "val", "test")
)


### Sliding window

In [13]:
WINDOW = 60  # length of sliding window

def prepare_dataset(path, window=WINDOW):
    """Build sliding-window samples and next-step log-return targets from a feature CSV.

    Parameters
    ----------
    path : str
        CSV file containing at least the columns ``Close``, ``High``, ``Low``,
        ``Volume``, ``MACD``, ``MACD_signal``, ``BB_width``, ``RSI`` and ``ATR``.
    window : int, optional
        Number of consecutive rows per sample. Defaults to the value ``WINDOW``
        had when this function was defined, so reassigning the global ``WINDOW``
        in another cell afterwards cannot silently change the window length.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, window, 9)
        ``X[i]`` holds rows ``i .. i + window - 1`` of the nine feature columns.
    y : numpy.ndarray, shape (n_samples,)
        ``y[i]`` is ``log(Close[i + window + 1] / Close[i + window])``.

    ``n_samples`` is ``max(n_rows - window, 0)`` where ``n_rows`` is the row
    count after rows containing NaN (including the final row, which has no
    forward return) are dropped. Too-short inputs yield correctly shaped empty
    arrays.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    KeyError
        If a required column is missing from the CSV.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")

    df = pd.read_csv(path)

    # ----- Compute log return -----
    # Forward-looking: the value stored at row t is log(Close[t+1] / Close[t]).
    df["log_return"] = np.log(df["Close"].shift(-1) / df["Close"])
    df = df.dropna().reset_index(drop=True)   # drop last row without target

    # ----- Select feature columns -----
    feature_cols = [
        "Close", "High", "Low", "Volume",
        "MACD", "MACD_signal",
        "BB_width", "RSI", "ATR"
    ]

    data = df[feature_cols].to_numpy()
    targets = df["log_return"].to_numpy()

    n_samples = len(df) - window
    if n_samples <= 0:
        # Not enough rows for a single (window, target) pair.
        empty_X = np.empty((0, window, len(feature_cols)), dtype=data.dtype)
        empty_y = np.empty((0,), dtype=targets.dtype)
        return empty_X, empty_y

    # sliding_window_view yields shape (n_rows - window + 1, n_features, window)
    # without copying; the last window is dropped because it has no target, and
    # the axes are swapped to (sample, time, feature).
    views = np.lib.stride_tricks.sliding_window_view(data, window, axis=0)
    # A single C-ordered copy gives callers an independent, writable array and
    # replaces the per-window Python loop and its intermediate list.
    X = np.array(views[:n_samples].transpose(0, 2, 1), order="C")

    # target = value AFTER the window: the forward return stored at row i + window.
    # NOTE(review): the window's last row is i + window - 1, so the return from
    # that row to row i + window is skipped; confirm this one-step gap is intended.
    y = targets[window:].copy()

    return X, y

In [None]:
# Build the (X, y) sliding-window arrays for every split from the feature
# CSVs stored on Drive
train_X, train_y = prepare_dataset(train_features_path)
val_X, val_y     = prepare_dataset(val_features_path)
test_X, test_y   = prepare_dataset(test_features_path)

windows_dir = f"{PROJECT_DIR}/windows"
os.makedirs(windows_dir, exist_ok=True)

# Write each array to <windows_dir>/<name>.npy
window_arrays = {
    "train_X": train_X, "train_y": train_y,
    "val_X": val_X, "val_y": val_y,
    "test_X": test_X, "test_y": test_y,
}
for array_name, array in window_arrays.items():
    np.save(f"{windows_dir}/{array_name}.npy", array)

print("Saved sliding windows!")
for label, split_X in (("Train", train_X), ("Val", val_X), ("Test", test_X)):
    print(f"{label} X shape:", split_X.shape)

### Normalization

In [12]:
norm_dir = f"{PROJECT_DIR}/normalized"
os.makedirs(norm_dir, exist_ok=True)

# Declared here as well (same value as in the windowing cell) so this stage
# can be resumed from the Drive checkpoint in a fresh kernel without first
# re-running the windowing cell.
windows_dir = f"{PROJECT_DIR}/windows"

# Load windows from Drive
train_X = np.load(f"{windows_dir}/train_X.npy")
val_X   = np.load(f"{windows_dir}/val_X.npy")
test_X  = np.load(f"{windows_dir}/test_X.npy")

train_y = np.load(f"{windows_dir}/train_y.npy")
val_y   = np.load(f"{windows_dir}/val_y.npy")
test_y  = np.load(f"{windows_dir}/test_y.npy")

# Compute mean/std over (samples, time): reducing axes 0 and 1 leaves one
# statistic per feature. Only the training windows are used.
train_mean = train_X.mean(axis=(0, 1))
train_std  = train_X.std(axis=(0, 1))
train_std[train_std == 0] = 1e-8  # avoid division by zero

def normalize(X, mean, std):
    """Return ``X`` shifted by ``mean`` and scaled by ``std``.

    Computes ``(X - mean) / std`` element-wise; ``mean`` and ``std`` are
    combined with ``X`` using the usual broadcasting rules. No guard against
    zero entries in ``std`` is applied here.
    """
    centered = X - mean
    return centered / std

# Standardize every split with the training-set statistics
train_X_norm, val_X_norm, test_X_norm = (
    normalize(split_X, train_mean, train_std)
    for split_X in (train_X, val_X, test_X)
)

# Save normalized datasets + stats to Drive, one <name>.npy file per array.
# Targets are stored unchanged next to the normalized inputs.
normalized_arrays = {
    "train_X": train_X_norm,
    "val_X": val_X_norm,
    "test_X": test_X_norm,
    "train_y": train_y,
    "val_y": val_y,
    "test_y": test_y,
    "train_mean": train_mean,
    "train_std": train_std,
}
for array_name, array in normalized_arrays.items():
    np.save(f"{norm_dir}/{array_name}.npy", array)

print("Normalization complete!")
print("Feature count:", train_mean.shape)
print("Normalized data saved to:", norm_dir)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/bitcoin_project/windows/train_X.npy'