In [None]:
# Imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure the plots are rendered in the notebook
%matplotlib inline

# Load the processed data
processed_folder = '../data/processed'

# List the folder once and sort the names: os.listdir returns entries in
# arbitrary order, which would make the row order of the concatenated
# frames differ between runs/machines.
processed_names = sorted(os.listdir(processed_folder))
train_files = [os.path.join(processed_folder, f) for f in processed_names if f.startswith('train_')]
test_files = [os.path.join(processed_folder, f) for f in processed_names if f.startswith('test_')]

# Fail early with a readable message; pd.concat on an empty list would only
# report "No objects to concatenate".
for split_prefix, split_files in (('train_', train_files), ('test_', test_files)):
    if not split_files:
        raise FileNotFoundError(
            f"No files starting with '{split_prefix}' found in '{processed_folder}'"
        )

# Concatenate all train files into one DataFrame.
# ignore_index=True gives the result a fresh 0..n-1 row index instead of
# repeating each file's own row labels.
train_data = pd.concat([pd.read_csv(file) for file in train_files], ignore_index=True)

# Concatenate all test files into one DataFrame (same treatment as train)
test_data = pd.concat([pd.read_csv(file) for file in test_files], ignore_index=True)

# Example plots

# 1. Distribution of Prices
# pd.read_csv (default header inference) always produces string column
# labels, so the integer key train_data[1] raises KeyError. Select the
# column by position instead.
price_values = train_data.iloc[:, 1]  # position 1 is expected to hold 'price'
plt.figure(figsize=(10, 6))
sns.histplot(price_values, bins=30, kde=True)
plt.title('Distribution of Prices (Rounded)')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

# 2. Distribution of Quantity (qty)
# pd.read_csv (default header inference) always produces string column
# labels, so the integer key train_data[2] raises KeyError. Select the
# column by position instead.
qty_values = train_data.iloc[:, 2]  # position 2 is expected to hold 'qty'
plt.figure(figsize=(10, 6))
sns.histplot(qty_values, bins=30, kde=True)
plt.title('Distribution of Quantity')
plt.xlabel('Quantity')
plt.ylabel('Frequency')
plt.show()

# 3. Count of Trades in Each Quote Quantity Bin
# Bar chart with one bar per distinct value of the 'quote_qty_bins' column,
# drawn on an explicitly created Axes.
fig_bins, ax_bins = plt.subplots(figsize=(10, 6))
sns.countplot(data=train_data, x='quote_qty_bins', ax=ax_bins)
ax_bins.set_title('Count of Trades in Each Quote Quantity Bin')
ax_bins.set_xlabel('Quote Quantity Bin')
ax_bins.set_ylabel('Count')
plt.show()

# 4. isBuyerMaker vs. Price (as a boxplot)
# pd.read_csv (default header inference) always produces string column
# labels, so the integer keys train_data[5] / train_data[1] raise KeyError.
# Select both columns by position instead.
buyer_maker_flags = train_data.iloc[:, 5]  # position 5 is expected to hold 'isBuyerMaker'
trade_prices = train_data.iloc[:, 1]       # position 1 is expected to hold 'price'
plt.figure(figsize=(10, 6))
sns.boxplot(x=buyer_maker_flags, y=trade_prices)
plt.title('Boxplot of Price by isBuyerMaker')
plt.xlabel('isBuyerMaker')
plt.ylabel('Price')
plt.show()

# 5. Correlation Heatmap (Optional if you want to check correlations)
# Correlation is only defined for numeric data. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# on them (e.g. a text/categorical column such as 'quote_qty_bins'), so the
# frame is restricted to numeric and boolean columns first. select_dtypes is
# used rather than corr(numeric_only=True) so this also runs on older pandas.
numeric_train_data = train_data.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(10, 6))
corr = numeric_train_data.corr()  # Pairwise Pearson correlation matrix
sns.heatmap(corr, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()
