# This snippet reads the crypto_data folder and combines the checkpoint files plus the final 2h dataset into a single dataset

In [1]:
import os
import pandas as pd


In [2]:
# Combine every checkpoint CSV plus the final dataset into a single DataFrame
# and persist the result as one CSV in the working directory.
folder_path = 'crypto_data'


def _checkpoint_order(filename):
    """Return the sort position of a CSV file from the data folder.

    Files named ``..._checkpoint_<N>...`` sort by the integer ``<N>``.
    Every other file (no ``_checkpoint_`` marker, or a marker that is not
    followed by a number) sorts last via ``float('inf')``.

    Parameters
    ----------
    filename : str
        Bare file name (no directory part).

    Returns
    -------
    int or float
        The checkpoint number, or ``float('inf')`` for non-checkpoint files.
    """
    _, marker, remainder = filename.partition('_checkpoint_')
    if not marker:
        return float('inf')
    # The number may be followed by '_' (more name parts) or directly by the
    # extension, e.g. 'x_checkpoint_3_part.csv' or 'x_checkpoint_3.csv'.
    number = remainder.split('_')[0].split('.')[0]
    return int(number) if number.isdecimal() else float('inf')


if not os.path.exists(folder_path):
    raise FileNotFoundError(f"The folder '{folder_path}' does not exist.")
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
if not csv_files:
    raise ValueError("No CSV files found in the specified folder.")

# os.listdir returns names in arbitrary order, so the file name is used as a
# tie-breaker to keep the order deterministic when several files share a key.
csv_files_sorted = sorted(
    csv_files,
    key=lambda name: (_checkpoint_order(name), name)
)

# Collect the frames first and concatenate once: growing a DataFrame with
# pd.concat inside the loop copies all previous rows on every iteration.
frames = []
for file in csv_files_sorted:
    file_path = os.path.join(folder_path, file)
    try:
        frames.append(pd.read_csv(file_path))
    except Exception as e:
        # Best effort: report the unreadable file and keep going with the rest.
        print(f"Error reading file {file}: {e}")
if not frames:
    # Avoid silently writing an empty output file when nothing could be read.
    raise ValueError("None of the CSV files could be read.")
combined_df = pd.concat(frames, ignore_index=True)

print("Combined DataFrame:")
print(combined_df.head(10))
combined_df.to_csv('HFT_100ms_unresampled_data_combined_data.csv', index=False)

Combined DataFrame:
                 timestamp  bid_price  ask_price  trade_price  volume  \
0  2025-04-17 14:12:12.777    0.05309    0.07600          NaN   0.000   
1  2025-04-17 14:12:12.877    0.07436    0.07600          NaN   0.000   
2  2025-04-17 14:12:12.977    0.07436    0.07626          NaN   0.000   
3  2025-04-17 14:12:13.077    0.07218    0.07630          NaN   0.000   
4  2025-04-17 14:12:13.177    0.07218    0.07630          NaN   0.000   
5  2025-04-17 14:12:13.277    0.07218    0.07657        0.076   6.916   
6  2025-04-17 14:12:13.280    0.07218    0.07657        0.076   6.916   
7  2025-04-17 14:12:13.377    0.07218    0.07600        0.076   0.000   
8  2025-04-17 14:12:13.477    0.07107    0.07600        0.076   0.000   
9  2025-04-17 14:12:13.577    0.07379    0.07630        0.076   0.000   

   mid_price  
0   0.064545  
1   0.064545  
2   0.064545  
3   0.074240  
4   0.074240  
5   0.074240  
6   0.074240  
7   0.074240  
8   0.074240  
9   0.075045  


In [3]:
# Peek at the last ten rows of the combined frame.
combined_df.iloc[-10:]

Unnamed: 0,timestamp,bid_price,ask_price,trade_price,volume,mid_price
93922,2025-04-17 14:11:45.377,0.07557,0.0758,0.0758,0.0,0.075685
93923,2025-04-17 14:11:45.477,0.07055,0.0758,0.0758,0.0,0.075685
93924,2025-04-17 14:11:45.577,0.07488,0.07584,0.0758,0.0,0.07536
93925,2025-04-17 14:11:45.677,0.07523,0.07587,0.0758,0.0,0.07555
93926,2025-04-17 14:11:45.777,0.07523,0.07587,0.0758,0.0,0.07555
93927,2025-04-17 14:11:45.778,0.0756,0.0759,0.0758,0.0,0.07575
93928,2025-04-17 14:11:45.877,0.07524,0.0759,0.0758,0.0,0.07575
93929,2025-04-17 14:11:45.977,0.07524,0.0759,0.0758,0.0,0.07575
93930,2025-04-17 14:11:46.077,0.07524,0.07583,0.0758,0.0,0.07575
93931,2025-04-17 14:11:46.177,0.07229,0.07587,0.0758,0.0,0.07408


# Resampling to 1s for a rapid check of the trend over a particular duration, i.e. 2 hours


In [4]:
# Downsample the combined ticks to one row per second.
#
# This cell rebinds `combined_df` to a timestamp-indexed frame, so after the
# first run 'timestamp' is no longer a column. Guarding the conversion keeps
# the cell re-runnable; without it a second run raises KeyError: 'timestamp'.
if 'timestamp' in combined_df.columns:
    combined_df['timestamp'] = pd.to_datetime(combined_df['timestamp'])
    combined_df = combined_df.set_index('timestamp')

resampled_df = combined_df.resample('1s').agg({
    'bid_price': 'last',      # Last bid price in the second
    'ask_price': 'last',      # Last ask price in the second
    'trade_price': 'last',    # Last trade price in the second
    'volume': 'sum',          # Sum of volume within the second
    'mid_price': 'last'       # Last mid price in the second
})

# Turn the timestamp index back into a regular column for display and export.
resampled_df = resampled_df.reset_index()

# Show the result
print(resampled_df.head())

# Check the reduction in rows
print(f"Original rows: {len(combined_df)}")
print(f"Resampled rows: {len(resampled_df)}")

            timestamp  bid_price  ask_price  trade_price    volume  mid_price
0 2025-04-17 14:11:25    0.07537    0.07566      0.07547  11.32050   0.075565
1 2025-04-17 14:11:26    0.07541    0.07562      0.07566  41.91564   0.075515
2 2025-04-17 14:11:27    0.07312    0.07587      0.07562  24.04716   0.074495
3 2025-04-17 14:11:28    0.07312    0.07566      0.07537  60.59236   0.075430
4 2025-04-17 14:11:29    0.07539    0.07565      0.07553  88.94352   0.075520
Original rows: 93932
Resampled rows: 7248


In [5]:
# Write the 1-second resampled frame to CSV, leaving out the integer row index.
# NOTE(review): the file name says '1_hr' while the notebook heading mentions
# 2 hours -- confirm the intended name before renaming anything.
resampled_df.to_csv(
    path_or_buf='HFT_1_hr_combined_crypto_data_1s.csv',
    index=False,
)