<a href="https://colab.research.google.com/github/baileysmoko/Fabric/blob/main/Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os

# === Locate the generated feature files ===
feature_folder = "/content/drive/MyDrive/token_features_generated"

files = sorted(os.listdir(feature_folder))
print("Found feature files:", files)

# Maps file name -> Series holding the NaN fraction of each column
diagnostics = {}

# Only the CSV entries of the listing are inspected
for file in (entry for entry in files if entry.endswith(".csv")):
    path = os.path.join(feature_folder, file)
    df = pd.read_csv(path)

    # Promote the timestamp column (when present) to a datetime index
    if 'timestamp' in df.columns:
        df = df.assign(timestamp=pd.to_datetime(df['timestamp'])).set_index('timestamp')

    print(f"\n\n===== Checking {file} =====")
    print(df.head())

    # Fraction of missing values per column, worst columns first
    nan_perc = df.isna().mean().sort_values(ascending=False)
    diagnostics[file] = nan_perc

    print("\nWorst 10 columns by NaN:")
    print(nan_perc.head(10))

    # Show a sample for the first BTC/ETH-style column that exists
    # (only the first match in this preference order is printed)
    present = next(
        (candidate for candidate in ('bitcoin', 'ethereum', 'btc', 'eth')
         if candidate in df.columns),
        None,
    )
    if present is not None:
        print(f"\nSample values for {present}:")
        print(df[present].dropna().head())

print("\n\n=== SUMMARY: % NaN per dataset ===")
for file, nan_series in diagnostics.items():
    print(f"\n{file}:")

    # Previously only the file name was printed and the collected statistics
    # were never shown. Report the average and the worst column per file.
    valid = nan_series.dropna()  # a frame with zero rows yields NaN fractions
    if valid.empty:
        print("  (no data)")
        continue
    print(f"  mean NaN fraction across columns: {valid.mean():.1%}")
    print(f"  worst column: {valid.idxmax()} ({valid.max():.1%})")



Found feature files: ['divergence_vol_price_div_30d.csv', 'divergence_vol_price_div_7d.csv', 'divergence_vol_price_div_90d.csv', 'ma_ma_200.csv', 'ma_ma_21.csv', 'ma_ma_50.csv', 'ma_ma_90.csv', 'ma_spread_ma_spread_21_200.csv', 'ma_spread_ma_spread_21_50.csv', 'ma_spread_ma_spread_21_90.csv', 'ma_spread_ma_spread_50_200.csv', 'ma_spread_ma_spread_50_90.csv', 'ma_spread_ma_spread_90_200.csv', 'momentum_mom_180d.csv', 'momentum_mom_30d.csv', 'momentum_mom_7d.csv', 'momentum_mom_90d.csv', 'realized_vol_rv_21d.csv', 'realized_vol_rv_30d.csv', 'realized_vol_rv_90d.csv']


===== Checking divergence_vol_price_div_30d.csv =====
            bitcoin  ethereum  binancecoin  ripple  solana  dogecoin  tron  \
timestamp                                                                    
2013-04-28      NaN       NaN          NaN     NaN     NaN       NaN   NaN   
2013-04-29      NaN       NaN          NaN     NaN     NaN       NaN   NaN   
2013-04-30      NaN       NaN          NaN     NaN     NaN  

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import os

# ===============================
# Read the daily price table
# ===============================
folder = '/content/drive/MyDrive/top1000_tokens_20251008_154804'

# Parse the timestamp column to datetime and use it as the index
combined_prices = (
    pd.read_csv(os.path.join(folder, 'combined_prices_daily.csv'))
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .set_index('timestamp')
)

# Work on an independent copy so the loaded frame stays untouched
prices_df = combined_prices.copy()

# ===============================
# Helper functions
# ===============================
# Small constant added to the average loss so the ratio never divides by zero
EPS = 1e-8

def rsi(df, window=14):
    """Relative Strength Index based on simple rolling means.

    Args:
        df: pandas Series or DataFrame of prices.
        window: number of rows in each rolling window.

    Returns:
        Object of the same shape as ``df`` holding
        ``100 - 100 / (1 + mean_gain / (mean_loss + EPS))``. Rows without a
        full window of valid differences are NaN.
    """
    change = df.diff()
    mean_up = change.clip(lower=0).rolling(window).mean()
    mean_down = (-change.clip(upper=0)).rolling(window).mean()
    strength = mean_up / (mean_down + EPS)
    return 100 - (100 / (1 + strength))

def macd(df, fast=12, slow=26, signal=9):
    """MACD line and its signal line from recursive (adjust=False) EMAs.

    Args:
        df: pandas Series or DataFrame of prices.
        fast: span of the fast EMA.
        slow: span of the slow EMA.
        signal: span of the EMA applied to the MACD line.

    Returns:
        Tuple ``(macd_line, signal_line)``, each shaped like ``df``.
    """
    def _ema(values, span):
        # Exponentially weighted mean with the non-adjusted recursion
        return values.ewm(span=span, adjust=False).mean()

    macd_line = _ema(df, fast) - _ema(df, slow)
    return macd_line, _ema(macd_line, signal)

def donchian_channel(df, window=20):
    """Rolling maximum and minimum of ``df`` over ``window`` rows.

    Args:
        df: pandas Series or DataFrame of prices.
        window: number of rows in each rolling window.

    Returns:
        Tuple ``(upper, lower)`` holding the rolling max and rolling min.
    """
    windowed = df.rolling(window)
    return windowed.max(), windowed.min()

# ===============================
# RSI over several look-back windows
# ===============================
rsi_windows = [14, 21, 30]

# One RSI frame per window, keyed as "rsi_<window>d"
rsi_metrics = {f"rsi_{span}d": rsi(prices_df, span) for span in rsi_windows}

# ===============================
# Calculate MACD for every token in one vectorized call
# ===============================
# macd() is built on DataFrame.ewm, which operates column by column, so the
# whole price frame can be passed at once (as is done for rsi() and
# donchian_channel()). This replaces the per-column Python loop and the
# concat that followed it, and yields the same columns in the same order.
macd_df, macd_signal_df = macd(prices_df)

# ===============================
# Donchian channel bands (default window)
# ===============================
donchian_upper, donchian_lower = donchian_channel(prices_df)

# Keyed by the base name later used for each output file
donchian_features = dict(
    donchian_upper=donchian_upper,
    donchian_lower=donchian_lower,
)

# ===============================
# Write every metric to its own CSV
# ===============================
output_folder = "/content/drive/MyDrive/token_features_generated"
os.makedirs(output_folder, exist_ok=True)

# Single ordered mapping of base file name -> frame.
# Write order is unchanged: RSI windows, then MACD, then Donchian bands.
frames_to_save = {
    **rsi_metrics,
    "macd": macd_df,
    "macd_signal": macd_signal_df,
    **donchian_features,
}

for name, df in frames_to_save.items():
    df.to_csv(os.path.join(output_folder, f"{name}.csv"))

print("✅ RSI, MACD, and Donchian Channels saved as single CSV per metric!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ RSI, MACD, and Donchian Channels saved as single CSV per metric!


In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import os
import numpy as np

# Folder where generated metrics are saved
generated_folder = "/content/drive/MyDrive/token_features_generated"

# Parameters for checking
MIN_NONNA_RATIO = 0.5  # At least 50% non-NaN values per column
MIN_ROWS = 365          # At least 365 rows (one year if the data is daily)

# The report produced by this cell is written into the same folder that is
# scanned, so it must be skipped; otherwise a re-run would evaluate the
# previous report as if it were a metric file.
SUMMARY_FILENAME = "metric_quality_check.csv"

# List all metric CSV files in the folder (sorted so the report order is
# reproducible; os.listdir returns entries in arbitrary order)
files = sorted(
    f for f in os.listdir(generated_folder)
    if f.endswith('.csv') and f != SUMMARY_FILENAME
)

results = []

for f in files:
    path = os.path.join(generated_folder, f)
    try:
        df = pd.read_csv(path, index_col=0, parse_dates=True)

        # Check 1: Index is datetime
        is_datetime_index = pd.api.types.is_datetime64_any_dtype(df.index)

        # Check 2: All columns numeric
        all_numeric = all(np.issubdtype(dtype, np.number) for dtype in df.dtypes)

        # Check 3: Enough rows
        enough_rows = df.shape[0] >= MIN_ROWS

        # Check 4: Enough non-NaN values.
        # The ratio is the minimum over columns, so a single all-NaN column
        # drives it to 0 and fails the whole file.
        nonna_ratio = df.notna().mean().min()
        enough_data = nonna_ratio >= MIN_NONNA_RATIO

        results.append({
            "file": f,
            "datetime_index": is_datetime_index,
            "all_numeric": all_numeric,
            "enough_rows": enough_rows,
            "enough_data": enough_data,
            "non_na_ratio": round(nonna_ratio, 2)
        })

    except Exception as e:
        # Keep scanning the remaining files; the failure is recorded in the report
        results.append({
            "file": f,
            "error": str(e)
        })

results_df = pd.DataFrame(results)
print(results_df)

# Optional: Save summary
results_df.to_csv(os.path.join(generated_folder, SUMMARY_FILENAME), index=False)
print("✅ Metric quality check complete. Summary saved as metric_quality_check.csv")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                 file  datetime_index  all_numeric  enough_rows  enough_data  \
0         rsi_14d.csv            True         True         True        False   
1         rsi_21d.csv            True         True         True        False   
2         rsi_30d.csv            True         True         True        False   
3            macd.csv            True         True         True        False   
4     macd_signal.csv            True         True         True        False   
5  donchian_upper.csv            True         True         True        False   
6  donchian_lower.csv            True         True         True        False   

   non_na_ratio  
0           0.0  
1           0.0  
2           0.0  
3           0.0  
4           0.0  
5           0.0  
6           0.0  
✅ Metric quality check complete. Summary saved as metric_quality_check.csv


In [None]:
from google.colab import drive
drive.mount('/content/drive')
import os

folder = '/content/drive/MyDrive/top1000_tokens_20251008_154804'

# Directory entries, in the order the filesystem returns them
files = os.listdir(folder)

# Emit one entry name per line (nothing at all for an empty folder)
if files:
    print("\n".join(files))


Mounted at /content/drive
selected_tokens.csv
combined_prices_daily.csv
combined_market_caps_daily.csv
combined_total_volumes_daily.csv
individual_total_volumes
individual_market_caps
individual_prices
divergence_vol_price_div_7d.csv
divergence_vol_price_div_30d.csv
divergence_vol_price_div_90d.csv
rsi_14d.csv
rsi_21d.csv
rsi_30d.csv
macd.csv
macd_signal.csv
donchian_upper.csv
donchian_lower.csv
realized_vol_30d.csv
realized_vol_21d.csv
realized_vol_90d.csv
momentum_180d.csv
momentum_90d.csv
momentum_30d.csv
momentum_7d.csv
ma_spread_90_200.csv
ma_spread_50_200.csv
ma_spread_50_90.csv
ma_spread_21_200.csv
ma_spread_21_90.csv
ma_spread_21_50.csv
ma_200.csv
ma_90.csv
ma_50.csv
ma_21.csv
stablecoin_supply_daily.csv
funding_rates_bitmex.csv
