In [1]:
%pip install --quiet gdown pandas matplotlib seaborn scikit-learn xgboost statsmodels joblib



Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.26.4 which is incompatible.

[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import gdown

# File IDs
HYPERLIQUID_ID = '1IAfLZwu6rJzyWKgBToqwSmmVYU6VbjVs'
FEARGREED_ID = '1PgQC0tO8XN-wqkNyghWc_-mnrYv_nhSf'

print("Downloading Hyperliquid trades data...")
gdown.download(id=HYPERLIQUID_ID, output='hyperliquid_trades.csv', quiet=False)

print("Downloading Fear & Greed index data...")
gdown.download(id=FEARGREED_ID, output='fear_greed.csv', quiet=False)

import os

print("\nFiles in working directory:")
for f in ['hyperliquid_trades.csv', 'fear_greed.csv']:
    if os.path.exists(f):
        print(f, "-", os.path.getsize(f)//1024, "KB")
    else:
        print(f, "NOT FOUND")


Downloading Hyperliquid trades data...


Downloading...
From: https://drive.google.com/uc?id=1IAfLZwu6rJzyWKgBToqwSmmVYU6VbjVs
To: c:\Users\Rohith\Downloads\ds_rohith_gonugunta\hyperliquid_trades.csv
100%|██████████| 47.5M/47.5M [00:12<00:00, 3.66MB/s]


Downloading Fear & Greed index data...


Downloading...
From: https://drive.google.com/uc?id=1PgQC0tO8XN-wqkNyghWc_-mnrYv_nhSf
To: c:\Users\Rohith\Downloads\ds_rohith_gonugunta\fear_greed.csv
100%|██████████| 90.8k/90.8k [00:00<00:00, 741kB/s]


Files in working directory:
hyperliquid_trades.csv - 46403 KB
fear_greed.csv - 88 KB





In [2]:
import pandas as pd

# Load Hyperliquid trades
trades = pd.read_csv('hyperliquid_trades.csv', low_memory=False)

# Load Fear & Greed data
fg = pd.read_csv('fear_greed.csv', low_memory=False)

# Check shapes
print('Trades - rows, columns:', trades.shape)
print('Fear & Greed - rows, columns:', fg.shape)


Trades - rows, columns: (211224, 16)
Fear & Greed - rows, columns: (2644, 4)


In [3]:
# Look at the top rows
display(trades.head())
display(fg.head())

Unnamed: 0,Account,Coin,Execution Price,Size Tokens,Size USD,Side,Timestamp IST,Start Position,Direction,Closed PnL,Transaction Hash,Order ID,Crossed,Fee,Trade ID,Timestamp
0,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.9769,986.87,7872.16,BUY,02-12-2024 22:50,0.0,Buy,0.0,0xec09451986a1874e3a980418412fcd0201f500c95bac...,52017706630,True,0.345404,895000000000000.0,1730000000000.0
1,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.98,16.0,127.68,BUY,02-12-2024 22:50,986.524596,Buy,0.0,0xec09451986a1874e3a980418412fcd0201f500c95bac...,52017706630,True,0.0056,443000000000000.0,1730000000000.0
2,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.9855,144.09,1150.63,BUY,02-12-2024 22:50,1002.518996,Buy,0.0,0xec09451986a1874e3a980418412fcd0201f500c95bac...,52017706630,True,0.050431,660000000000000.0,1730000000000.0
3,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.9874,142.98,1142.04,BUY,02-12-2024 22:50,1146.558564,Buy,0.0,0xec09451986a1874e3a980418412fcd0201f500c95bac...,52017706630,True,0.050043,1080000000000000.0,1730000000000.0
4,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.9894,8.73,69.75,BUY,02-12-2024 22:50,1289.488521,Buy,0.0,0xec09451986a1874e3a980418412fcd0201f500c95bac...,52017706630,True,0.003055,1050000000000000.0,1730000000000.0


Unnamed: 0,timestamp,value,classification,date
0,1517463000,30,Fear,2018-02-01
1,1517549400,15,Extreme Fear,2018-02-02
2,1517635800,40,Fear,2018-02-03
3,1517722200,24,Extreme Fear,2018-02-04
4,1517808600,11,Extreme Fear,2018-02-05


In [4]:
print("Trades column types:")
display(trades.dtypes)

print("\nFear & Greed column types:")
display(fg.dtypes)


Trades column types:


Account              object
Coin                 object
Execution Price     float64
Size Tokens         float64
Size USD            float64
Side                 object
Timestamp IST        object
Start Position      float64
Direction            object
Closed PnL          float64
Transaction Hash     object
Order ID              int64
Crossed                bool
Fee                 float64
Trade ID            float64
Timestamp           float64
dtype: object


Fear & Greed column types:


timestamp          int64
value              int64
classification    object
date              object
dtype: object

In [12]:
# 1. Standardize column names
trades.columns = [c.strip().lower().replace(' ', '_') for c in trades.columns]
fg.columns = [c.strip().lower().replace(' ', '_') for c in fg.columns]

# 2. Parse the trades timestamp
# We'll use 'timestamp_ist' column because it's already human-readable
trades['time'] = pd.to_datetime(trades['timestamp_ist'], errors='coerce')

# 3. Parse Fear & Greed date
fg['date'] = pd.to_datetime(fg['date'], errors='coerce')

# 4. Create date-only columns for merging
trades['date'] = trades['time'].dt.date
fg['date_only'] = fg['date'].dt.date

# 5. Map sentiment to binary
fg['sentiment_bin'] = fg['classification'].str.lower().map(
    lambda x: 1 if 'greed' in x else (0 if 'fear' in x else pd.NA)
)

# Quick check
print(trades[['time', 'date']].head())
print(fg[['date', 'date_only', 'classification', 'sentiment_bin']].head())


                 time        date
0 2024-02-12 22:50:00  2024-02-12
1 2024-02-12 22:50:00  2024-02-12
2 2024-02-12 22:50:00  2024-02-12
3 2024-02-12 22:50:00  2024-02-12
4 2024-02-12 22:50:00  2024-02-12
        date   date_only classification sentiment_bin
0 2018-02-01  2018-02-01           Fear             0
1 2018-02-02  2018-02-02   Extreme Fear             0
2 2018-02-03  2018-02-03           Fear             0
3 2018-02-04  2018-02-04   Extreme Fear             0
4 2018-02-05  2018-02-05   Extreme Fear             0


In [None]:
# Ensure numeric columns in trades are correctly typed
numeric_cols = [
    'execution_price', 'size_tokens', 'size_usd',
    'start_position', 'closed_pnl', 'fee', 'trade_id', 'timestamp'
]

for col in numeric_cols:
    if col in trades.columns:
        trades[col] = pd.to_numeric(trades[col], errors='coerce')

# Rename closed_pnl to a canonical form
trades = trades.rename(columns={'closed_pnl': 'closedpnl'})

# Quick summary to check for issues
print(trades[['execution_price', 'size_tokens', 'size_usd', 'closedpnl']].describe())


       execution_price   size_tokens      size_usd      closedpnl
count    211224.000000  2.112240e+05  2.112240e+05  211224.000000
mean      11414.723350  4.623365e+03  5.639451e+03      48.749001
std       29447.654868  1.042729e+05  3.657514e+04     919.164828
min           0.000005  8.740000e-07  0.000000e+00 -117990.104100
25%           4.854700  2.940000e+00  1.937900e+02       0.000000
50%          18.280000  3.200000e+01  5.970450e+02       0.000000
75%         101.580000  1.879025e+02  2.058960e+03       5.792797
max      109004.000000  1.582244e+07  3.921431e+06  135329.090100


In [None]:
# Merge-friendly map from sentiment dataset
fg_map = fg.set_index('date_only')['sentiment_bin']

# Daily aggregation
daily = trades.groupby('date').agg(
    total_trades=('account', 'count'),
    sum_pnl=('closedpnl', 'sum'),
    mean_pnl=('closedpnl', 'mean'),
    median_pnl=('closedpnl', 'median'),
    pct_profitable=('closedpnl', lambda x: (x > 0).mean()),
    avg_leverage=('start_position', 'mean'),   # Assuming start_position is leverage
    median_size=('size_tokens', 'median')
).reset_index()

# Merge sentiment info
daily['sentiment'] = daily['date'].map(fg_map)


In [None]:
acct = trades.groupby('account').agg(
    trades_count=('date', 'count'),
    total_pnl=('closedpnl', 'sum'),
    avg_pnl=('closedpnl', 'mean'),
    win_rate=('closedpnl', lambda x: (x > 0).mean()),
    avg_leverage=('start_position', 'mean')   # Assuming start_position is leverage
).reset_index()


In [17]:
# Save to csv_files/
daily.to_csv('csv_files/daily_agg.csv', index=False)
acct.to_csv('csv_files/account_agg.csv', index=False)
trades.to_csv('csv_files/trades_canonical.csv', index=False)

print("Saved daily_agg.csv, account_agg.csv, and trades_canonical.csv in csv_files/")


Saved daily_agg.csv, account_agg.csv, and trades_canonical.csv in csv_files/


In [18]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style='whitegrid')


In [21]:
fig, ax = plt.subplots(figsize=(12,4))

# 7-day rolling PnL line
ax.plot(pd.to_datetime(daily['date']),
        daily['sum_pnl'].rolling(7, min_periods=1).mean(),
        label='7-day rolling sum_pnl')

# Highlight Greed days
for _, row in daily.dropna(subset=['sentiment']).iterrows():
    if row['sentiment'] == 1:
        ax.axvspan(pd.to_datetime(row['date']),
                   pd.to_datetime(row['date']) + pd.Timedelta(days=1),
                   alpha=0.08, color='green')

ax.set_title('Daily sum PnL (7-day rolling) — shaded = Greed days')
ax.legend()
plt.tight_layout()
plt.savefig('outputs/daily_pnl_sentiment_timeseries.png', dpi=150)
plt.close()


In [22]:
trades_sent = trades.merge(
    fg[['date_only','classification']],
    left_on='date', right_on='date_only',
    how='left'
)

plt.figure(figsize=(8,5))
sns.boxplot(data=trades_sent, x='classification', y='closedpnl')
plt.yscale('symlog')  # better view for large PnL spikes
plt.title('Trade closedPnL distribution by Sentiment')
plt.savefig('outputs/boxplot_trade_pnl_by_sentiment.png', dpi=150)
plt.close()


In [23]:
plt.figure(figsize=(8,6))
sns.scatterplot(data=acct,
                x='avg_leverage', y='total_pnl',
                size='trades_count', alpha=0.7)
plt.title('Account total PnL vs avg leverage')
plt.savefig('outputs/account_pnl_vs_leverage.png', dpi=150)
plt.close()


In [24]:
from scipy.stats import mannwhitneyu

# Get the sets of dates for each sentiment
fear_dates = fg[fg['sentiment_bin'] == 0]['date_only']
greed_dates = fg[fg['sentiment_bin'] == 1]['date_only']

# Filter trades based on these dates
pnl_fear = trades[trades['date'].isin(fear_dates)]['closedpnl'].dropna()
pnl_greed = trades[trades['date'].isin(greed_dates)]['closedpnl'].dropna()

print("Fear trades:", len(pnl_fear), "Greed trades:", len(pnl_greed))


Fear trades: 52726 Greed trades: 82885


In [25]:
if len(pnl_fear) > 10 and len(pnl_greed) > 10:
    stat, p = mannwhitneyu(pnl_fear, pnl_greed, alternative='two-sided')
    print(f"Mann–Whitney U statistic = {stat:.2f}, p-value = {p:.5f}")
    if p < 0.05:
        print("Result: Statistically significant difference between Fear and Greed day PnLs.")
    else:
        print("Result: No statistically significant difference found.")
else:
    print("Not enough data in one of the groups to run a valid test.")


Mann–Whitney U statistic = 2104229251.50, p-value = 0.00000
Result: Statistically significant difference between Fear and Greed day PnLs.


In [26]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

# Ensure daily is sorted by date
daily_sorted = daily.copy()
daily_sorted['date_dt'] = pd.to_datetime(daily_sorted['date'])
daily_sorted = daily_sorted.sort_values('date_dt')

# Create lag features
daily_sorted['sentiment_lag1'] = daily_sorted['sentiment'].shift(1)
daily_sorted['sum_pnl_lag1'] = daily_sorted['sum_pnl'].shift(1)
daily_sorted['sum_pnl_rolling7'] = daily_sorted['sum_pnl'].rolling(7, min_periods=1).mean().shift(1)

# Drop rows with NaNs (from shifts)
model_df = daily_sorted.dropna(subset=['sum_pnl','sentiment_lag1','sum_pnl_lag1'])

# Features and target
X = model_df[['sentiment_lag1','sum_pnl_lag1','sum_pnl_rolling7','total_trades','avg_leverage']].fillna(0)
y = model_df['sum_pnl']


In [27]:
split_idx = int(0.7 * len(model_df))
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

pred = rf.predict(X_test)

print("MAE:", mean_absolute_error(y_test, pred))
print("R²:", r2_score(y_test, pred))

MAE: 62919.37047361377
R²: 0.037890426548481404


In [28]:
import joblib

# Save the trained Random Forest model
joblib.dump(rf, 'csv_files/rf_daily_pnl_model.joblib')

print("Model saved to csv_files/rf_daily_pnl_model.joblib")

Model saved to csv_files/rf_daily_pnl_model.joblib


In [29]:
# Load the model back
rf_loaded = joblib.load('csv_files/rf_daily_pnl_model.joblib')

# Test that it gives same predictions
pred_loaded = rf_loaded.predict(X_test)
print("Reloaded model MAE:", mean_absolute_error(y_test, pred_loaded))

Reloaded model MAE: 62919.37047361377
