In [1]:
import pandas as pd
from decimal import Decimal, getcontext

# Ensure sufficient precision for Decimal calculations
getcontext().prec = 20

In [2]:
# https://www.kaggle.com/datasets/mczielinski/bitcoin-historical-data
# https://github.com/ff137/bitstamp-btcusd-minute-data/tree/main

df = pd.read_csv('btcusd_1-min_data.csv')

# format timestamp
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
df.set_index('timestamp', inplace=True)

print(df.shape)
df

(6846600, 5)


Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-01-01 10:01:00,4.58,4.58,4.58,4.58,0.000000
2012-01-01 10:02:00,4.58,4.58,4.58,4.58,0.000000
2012-01-01 10:03:00,4.58,4.58,4.58,4.58,0.000000
2012-01-01 10:04:00,4.58,4.58,4.58,4.58,0.000000
2012-01-01 10:05:00,4.58,4.58,4.58,4.58,0.000000
...,...,...,...,...,...
2025-01-06 23:56:00,102231.00,102231.00,102227.00,102227.00,0.068900
2025-01-06 23:57:00,102230.00,102232.00,102230.00,102232.00,0.199451
2025-01-06 23:58:00,102262.00,102280.00,102260.00,102280.00,0.104410
2025-01-06 23:59:00,102280.00,102280.00,102280.00,102280.00,0.007554


In [3]:
df.dtypes

open      float64
high      float64
low       float64
close     float64
volume    float64
dtype: object

### 5-min price data

In [4]:
df_5min = df.copy()

df_5min = df_5min.resample('5min').agg({
  'open': 'first',
  'high': 'max',
  'low': 'min',
  'close': 'last',
  'volume': 'sum'
})

# use Decial to avoid float precision issues
df_5min['close_Decimal'] = df_5min['close'].apply(Decimal)

# calculate return (forwarded by one row)
df_5min['return_forward'] = df_5min['close_Decimal'].pct_change().shift(-1)

# drop the tempory column close_Decimal
df_5min.drop(columns=['close_Decimal'], inplace=True)

# last row does not have return_forward
df_5min = df_5min.iloc[:-1]

# save as parquet
df_5min.to_parquet('btcusd_5min_price.parquet')

print(df_5min.shape)
df_5min

(1369320, 6)


Unnamed: 0_level_0,open,high,low,close,volume,return_forward
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-01-01 10:00:00,4.58,4.58,4.58,4.58,0.000000,0
2012-01-01 10:05:00,4.58,4.58,4.58,4.58,0.000000,0
2012-01-01 10:10:00,4.58,4.58,4.58,4.58,0.000000,0
2012-01-01 10:15:00,4.58,4.58,4.58,4.58,0.000000,0
2012-01-01 10:20:00,4.58,4.58,4.58,4.58,0.000000,0
...,...,...,...,...,...,...
2025-01-06 23:35:00,102083.00,102161.00,102083.00,102150.00,0.889588,0.0007342143906020558
2025-01-06 23:40:00,102149.00,102259.00,102119.00,102225.00,0.360514,0.0000978234287111763
2025-01-06 23:45:00,102223.00,102262.00,102201.00,102235.00,0.474493,-0.00007825108817919499
2025-01-06 23:50:00,102237.00,102246.00,102213.00,102227.00,0.796058,0.0005184540287790897


In [5]:
df_5min.dtypes

open              float64
high              float64
low               float64
close             float64
volume            float64
return_forward     object
dtype: object

In [6]:
# print rows per year
print(df_5min.resample('YE').size())

timestamp
2012-12-31    105288
2013-12-31    105120
2014-12-31    105120
2015-12-31    105120
2016-12-31    105408
2017-12-31    105120
2018-12-31    105120
2019-12-31    105120
2020-12-31    105408
2021-12-31    105120
2022-12-31    105120
2023-12-31    105120
2024-12-31    105408
2025-12-31      1728
Freq: YE-DEC, dtype: int64


### 15-min price data

In [7]:
df_15min = df.copy()

df_15min = df_15min.resample('15min').agg({
  'open': 'first',
  'high': 'max',
  'low': 'min',
  'close': 'last',
  'volume': 'sum'
})

# use Decial to avoid float precision issues
df_15min['close_Decimal'] = df_15min['close'].apply(Decimal)

# calculate return (forwarded by one row)
df_15min['return_forward'] = df_15min['close_Decimal'].pct_change().shift(-1)

# drop the tempory column close_Decimal
df_15min.drop(columns=['close_Decimal'], inplace=True)

# last row does not have return_forward
df_15min = df_15min.iloc[:-1]

# save as parquet
df_15min.to_parquet('btcusd_15min_price.parquet')

print(df_5min.shape)
df_15min

(1369320, 6)


Unnamed: 0_level_0,open,high,low,close,volume,return_forward
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-01-01 10:00:00,4.58,4.58,4.58,4.58,0.000000,0
2012-01-01 10:15:00,4.58,4.58,4.58,4.58,0.000000,0
2012-01-01 10:30:00,4.58,4.58,4.58,4.58,0.000000,0
2012-01-01 10:45:00,4.58,4.58,4.58,4.58,0.000000,0
2012-01-01 11:00:00,4.58,4.58,4.58,4.58,0.000000,0
...,...,...,...,...,...,...
2025-01-06 22:45:00,102012.00,102118.00,102012.00,102038.00,2.078417,0.0001470040573119818
2025-01-06 23:00:00,102041.00,102208.00,101932.00,102053.00,3.599929,0.0007937052315953475
2025-01-06 23:15:00,102052.00,102161.00,102052.00,102134.00,1.913189,0.0008909863512640257
2025-01-06 23:30:00,102138.00,102259.00,102021.00,102225.00,3.673151,0.0005380288579114698


### hourly price data

In [8]:
df_hour = df.copy()

df_hour = df_hour.resample('h').agg({
  'open': 'first',
  'high': 'max',
  'low': 'min',
  'close': 'last',
  'volume': 'sum'
})

# use Decial to avoid float precision issues
df_hour['close_Decimal'] = df_hour['close'].apply(Decimal)

# calculate return (forwarded by one row)
df_hour['return_forward'] = df_hour['close_Decimal'].pct_change().shift(-1)

# drop the tempory column close_Decimal
df_hour.drop(columns=['close_Decimal'], inplace=True)

# last row does not have return_forward
df_hour = df_hour.iloc[:-1]

# df_hour = df_hour[df_hour.index.year < 2025]

# save as parquet
df_hour.to_parquet('btcusd_hourly_price.parquet')

print(df_hour.shape)
df_hour.head()

(114110, 6)


Unnamed: 0_level_0,open,high,low,close,volume,return_forward
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-01-01 10:00:00,4.58,4.58,4.58,4.58,0.0,0
2012-01-01 11:00:00,4.58,4.58,4.58,4.58,0.0,0
2012-01-01 12:00:00,4.58,4.58,4.58,4.58,0.0,0
2012-01-01 13:00:00,4.58,4.58,4.58,4.58,0.0,0
2012-01-01 14:00:00,4.58,4.58,4.58,4.58,0.0,0


### daily price data

In [9]:
df_day = df.copy()

df_day = df_day.resample('1D').agg({
  'open': 'first',
  'high': 'max',
  'low': 'min',
  'close': 'last',
  'volume': 'sum'
})

# use Decial to avoid float precision issues
df_day['close_Decimal'] = df_day['close'].apply(Decimal)

# calculate return (forwarded by one row)
df_day['return_forward'] = df_day['close_Decimal'].pct_change().shift(-1)

# drop the tempory column close_Decimal
df_day.drop(columns=['close_Decimal'], inplace=True)

# last row does not have return_forward
df_day = df_day.iloc[:-1]

df_day = df_day[df_day.index.year < 2025]

# save as parquet
df_day.to_parquet('btcusd_daily_price.parquet')

print(df_day.shape)
df_day

(4749, 6)


Unnamed: 0_level_0,open,high,low,close,volume,return_forward
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-01-01,4.58,4.84,4.58,4.84,10.000000,0.0330578512396694518
2012-01-02,4.84,5.00,4.84,5.00,10.100000,0.0580000000000000071
2012-01-03,5.00,5.32,5.00,5.29,107.085281,0.0529300567107750939
2012-01-04,5.29,5.57,4.93,5.57,107.233260,0.1526032315978455299
2012-01-05,5.57,6.46,5.57,6.42,70.328742,-0.00311526479750772179
...,...,...,...,...,...,...
2024-12-27,95666.00,97337.00,93282.00,94168.00,2416.242322,0.0103113584232435647
2024-12-28,94159.00,95529.00,94026.00,95139.00,839.505181,-0.01662830174796876150
2024-12-29,95139.00,95139.00,92868.00,93557.00,851.805549,-0.00992977543102066120
2024-12-30,93564.00,94891.00,91315.00,92628.00,1841.043084,0.0081292913589843244
