
# Trader Behaviour × Market Sentiment — Starter Notebook

**Assignment**: Explore how trading behaviour (profitability, risk, volume, leverage) aligns with or diverges from market sentiment (Fear vs Greed).  
**Output**: Save processed CSVs to `csv_files/` and plots to `outputs/`. Export your report as **`ds_report.pdf`**.

> Tip: Keep this notebook lean and reproducible. Avoid hard-coding file paths.


In [None]:
import os
from pathlib import Path

# Project layout: everything lives under a fixed Colab folder.
ROOT = Path("/content/ds_arkin_kansra")
CSV_DIR = ROOT / "csv_files"
OUT_DIR = ROOT / "outputs"

# Create both leaf folders (and any missing parents); safe to re-run.
for folder in (CSV_DIR, OUT_DIR):
    folder.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the notebook output records where files go.
print("Folders ready:")
for label, folder in (("ROOT", ROOT), ("CSV_DIR", CSV_DIR), ("OUT_DIR", OUT_DIR)):
    print(f"{label}:", folder.resolve())


## 1) Download Datasets from Google Drive (Public Links)

In [None]:
# Authenticate the Colab session before building the Drive client.
# NOTE(review): presumably this prompts for Google credentials — confirm in Colab.
from google.colab import auth
auth.authenticate_user()

from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import io, os
from pathlib import Path

# === Anchor everything under /content/ds_arkin_kansra ===
# Re-declared here so this cell can run without the setup cell above.
ROOT = Path("/content/ds_arkin_kansra")
CSV_DIR = ROOT / "csv_files"
CSV_DIR.mkdir(parents=True, exist_ok=True)

# Drive API v3 client used by the download loop below.
drive_service = build('drive', 'v3')

# Files to download: {file_id: local_path}
# Keys are Drive file IDs; values are destination paths (as strings) inside CSV_DIR.
files = {
    "1IAfLZwu6rJzyWKgBToqwSmmVYU6VbjVs": str(CSV_DIR / "hyperliquid_trades.csv"),  # Trader data
    "1PgQC0tO8XN-wqkNyghWc_-mnrYv_nhSf": str(CSV_DIR / "fear_greed.csv")           # Fear & Greed
}

# Stream every configured Drive file to disk, chunk by chunk.
# A failed download is reported but does not stop the loop (best-effort),
# so the remaining files are still attempted.
for file_id, out_path in files.items():
    # Write to a sibling ".part" file and rename only on success, so a failed
    # download never leaves a truncated CSV (or clobbers a manually uploaded
    # one) at the final path.
    tmp_path = Path(out_path + ".part")
    try:
        request = drive_service.files().get_media(fileId=file_id)
        # The context manager guarantees the handle is closed even when
        # next_chunk() raises part-way through.
        with io.FileIO(str(tmp_path), "wb") as fh:
            downloader = MediaIoBaseDownload(fh, request)

            done = False
            while not done:
                status, done = downloader.next_chunk()
                if status:
                    print(f"Downloading {out_path}: {int(status.progress() * 100)}%")
        tmp_path.replace(out_path)
        print(f" Finished downloading {out_path}")
    except Exception as e:
        # Drop the partial file; the final path is left untouched.
        tmp_path.unlink(missing_ok=True)
        print(f"Could not download {out_path} from Drive. Please upload manually. Error: {e}")


In [None]:
# Parse trader timestamps into a naive `time` column and a calendar `date` column.
if 'timestamp' in trades.columns:
    # Epoch heuristic: the first non-null value decides the unit.
    # Values above 1e11 are read as milliseconds, anything else as seconds.
    ts_sample = trades['timestamp'].dropna().iloc[0]
    epoch_unit = 'ms' if ts_sample > 1e11 else 's'
    parsed_utc = pd.to_datetime(trades['timestamp'], unit=epoch_unit, errors='coerce', utc=True)
    # Drop the timezone after converting, leaving naive UTC timestamps.
    trades['time'] = parsed_utc.dt.tz_convert(None)
elif 'timestamp_ist' in trades.columns:
    # String timestamps: day-first parsing, unparseable values become NaT.
    trades['time'] = pd.to_datetime(trades['timestamp_ist'], errors='coerce', dayfirst=True)
else:
    raise ValueError("No usable timestamp column found in trader dataset.")

# Both successful branches derive the calendar date the same way.
trades['date'] = trades['time'].dt.date

trader_dates = trades['date']
print("Parsed trader dates:", trader_dates.min(), "→", trader_dates.max(),
      "| unique days:", trader_dates.nunique())

# Sentiment dataset: reduce to plain dates so it can be joined on `date`.
senti['date'] = pd.to_datetime(senti['date'], errors='coerce').dt.date
senti_dates = senti['date']
print("Parsed sentiment dates:", senti_dates.min(), "→", senti_dates.max(),
      "| n=", len(senti))


In [None]:
# Column aliases: logical metric -> candidate column names, in priority order.
col_map = {
    "size": ["size_usd", "size_tokens", "size", "qty", "quantity", "amount"],
    "execution_price": ["execution_price", "price", "avg_price"],
    "leverage": ["leverage", "lev"],
    "closedpnl": ["closed_pnl", "pnl", "profit", "realized_pnl"]
}

# For each metric keep the first candidate that exists in `trades`;
# metrics with no matching column are simply absent from `selected`.
selected = {}
for target, candidates in col_map.items():
    first_match = next((name for name in candidates if name in trades.columns), None)
    if first_match is not None:
        selected[target] = first_match

print("Detected columns:", selected)

# Normalize side to lower-case "long"/"short".
# buy/sell are translated; any other label keeps its trimmed, lower-cased form.
# Falling back to the raw value would leave "Long"/"SHORT" in their original
# case, and they would then never match the "long"/"short" columns used for
# the long/short imbalance.
if "side" in trades.columns:
    raw_side = trades["side"]
    side_lower = raw_side.astype(str).str.strip().str.lower()
    side_norm = side_lower.map({"buy": "long", "sell": "short"}).fillna(side_lower)
    # astype(str) turns missing values into the text "nan"; restore real NaN.
    trades["side"] = side_norm.where(raw_side.notna(), raw_side)

# Profitability proxy: 1 when the trade closed with positive PnL, else 0.
# A missing PnL compares False against 0, so such rows count as 0.
if selected.get("closedpnl"):
    trades["is_win"] = (trades[selected["closedpnl"]] > 0).astype(int)
else:
    trades["is_win"] = np.nan

# Aggregations: one row per calendar date.
# Every spec is a *list* of functions, even when there is only one. pandas
# then always returns two-level (column, function) labels, so the flatten
# step below yields "<column>_<function>" whether or not a PnL column was
# detected. With scalar specs and no PnL column the labels stay flat, and
# joining them iterates over the characters of each name.
agg_funcs = {}
if selected.get("size"):
    agg_funcs[selected["size"]] = ["sum"]
if selected.get("execution_price"):
    agg_funcs[selected["execution_price"]] = ["mean"]
if selected.get("leverage"):
    agg_funcs[selected["leverage"]] = ["mean"]
agg_funcs["is_win"] = ["mean"]
if selected.get("closedpnl"):
    agg_funcs[selected["closedpnl"]] = ["sum", "mean", "median", "std"]

daily = trades.groupby("date").agg(agg_funcs)

# Flatten + rename
daily.columns = [
    "_".join(str(part) for part in col if part).strip("_")
    for col in daily.columns.to_flat_index()
]
rename_map = {}
if selected.get("size"):
    rename_map[f"{selected['size']}_sum"] = "volume"
if selected.get("execution_price"):
    rename_map[f"{selected['execution_price']}_mean"] = "avg_price"
if selected.get("leverage"):
    rename_map[f"{selected['leverage']}_mean"] = "avg_leverage"
rename_map["is_win_mean"] = "win_rate"
if selected.get("closedpnl"):
    rename_map.update({
        f"{selected['closedpnl']}_sum": "pnl_sum",
        f"{selected['closedpnl']}_mean": "pnl_mean",
        f"{selected['closedpnl']}_median": "pnl_median",
        f"{selected['closedpnl']}_std": "pnl_std"
    })
# reset_index turns the group key back into a regular `date` column for merging.
daily = daily.rename(columns=rename_map).reset_index()

# Long/Short imbalance: (long - short) / (long + short) of daily traded size.
if "side" in trades.columns and selected.get("size"):
    side_daily = trades.pivot_table(
        index="date",
        columns="side",
        values=selected["size"],
        aggfunc="sum",
        fill_value=0,
    ).reset_index()

    # Guarantee both legs exist even when one side never traded.
    for leg in ("long", "short"):
        if leg not in side_daily.columns:
            side_daily[leg] = 0.0

    net_size = side_daily["long"] - side_daily["short"]
    # A zero total becomes NaN so the ratio is NaN instead of a division by zero.
    gross_size = (side_daily["long"] + side_daily["short"]).replace(0, np.nan)
    side_daily["long_short_imbalance"] = net_size / gross_size

    imbalance_cols = ["date", "long", "short", "long_short_imbalance"]
    daily = daily.merge(side_daily[imbalance_cols], on="date", how="left")

print("Daily features shape:", daily.shape)
print(daily.head())


In [None]:
# Collapse the sentiment labels into a binary flag: 0 = fear, 1 = greed.
# Labels outside this table map to NaN.
senti['classification'] = senti['classification'].str.strip().str.lower()
flag_by_label = {
    'fear': 0, 'extreme fear': 0,
    'greed': 1, 'extreme greed': 1
}
senti['sentiment_flag'] = senti['classification'].map(flag_by_label)

# Outer join keeps every date present in either dataset.
sentiment_cols = senti[['date','classification','value','sentiment_flag']]
df = daily.merge(sentiment_cols, on='date', how='outer').sort_values('date')

# Rows with both a volume and a sentiment flag are the usable overlap.
overlap_shape = df.dropna(subset=['volume','sentiment_flag']).shape
print("Final merged dataset shape:", df.shape)
print("Overlap rows (non-null both sides):", overlap_shape)
print(df.head(15))


In [None]:
def safe_plot(df, col, title, ylabel, fname):
    """Plot ``df[col]`` against ``df['date']`` and save the figure under OUT_DIR.

    The chart is skipped, with a printed notice, when ``col`` is not a column
    of ``df`` or holds no non-null values.

    Args:
        df: Frame with a ``date`` column and, optionally, ``col``.
        col: Name of the column to plot on the y-axis.
        title: Figure title.
        ylabel: Y-axis label.
        fname: File name, relative to OUT_DIR, for the saved image.
    """
    plottable = col in df.columns and df[col].notna().any()
    if not plottable:
        print(f"Skipping {col} (not found or empty)")
        return

    plt.figure(figsize=(10,4))
    plt.plot(pd.to_datetime(df['date']), df[col])
    plt.title(title)
    plt.xlabel("Date")
    plt.ylabel(ylabel)
    # Save before show() so the image is written from the live figure.
    plt.savefig(OUT_DIR/fname, dpi=150, bbox_inches="tight")
    plt.show()

# Trader-only metrics: (column, title, y-label, output file), plotted in order.
trader_charts = [
    ("volume", "Daily Trader Volume", "Volume", "daily_volume.png"),
    ("avg_price", "Average Execution Price", "Avg Price", "avg_price.png"),
    ("win_rate", "Win Rate", "Win Rate", "win_rate.png"),
    ("pnl_sum", "PnL Sum", "PnL Sum", "pnl_sum.png"),
]
for chart_col, chart_title, chart_ylabel, chart_file in trader_charts:
    safe_plot(daily, chart_col, chart_title, chart_ylabel, chart_file)

# Sentiment trend
safe_plot(senti, "value", "Fear & Greed Index", "Index Value", "fear_greed_index.png")

# Overlap subset: days that have both a traded volume and a sentiment flag.
df_overlap = df.dropna(subset=['volume','sentiment_flag'])
if df_overlap.empty:
    print(" No overlapping days between sentiment & trader data.")
else:
    plt.figure(figsize=(10,4))
    plt.scatter(df_overlap['sentiment_flag'], df_overlap['volume'])
    plt.title("Overlap: Sentiment vs Trader Volume")
    plt.xlabel("Sentiment (0=Fear,1=Greed)")
    plt.ylabel("Volume")
    plt.savefig(OUT_DIR/"overlap_sentiment_volume.png", dpi=150, bbox_inches="tight")
    plt.show()


In [None]:
# Same-day correlation of each daily metric with the binary sentiment flag,
# computed only on days present in both datasets.
df_overlap = df.dropna(subset=['volume','sentiment_flag'])
corrs = {}

if df_overlap.empty:
    print("No overlap rows to compute correlations.")
else:
    candidate_metrics = ['volume','avg_price','win_rate','pnl_sum','pnl_mean','long_short_imbalance']
    flag = df_overlap['sentiment_flag']
    # Metrics missing from the merged frame are skipped silently.
    corrs = {
        metric: df_overlap[metric].corr(flag)
        for metric in candidate_metrics
        if metric in df_overlap.columns
    }

    corr_df = pd.DataFrame.from_dict(corrs, orient='index', columns=['same_day_corr'])
    corr_df = corr_df.sort_values('same_day_corr', ascending=False)
    corr_df.to_csv(CSV_DIR/'overlap_correlations.csv')
    print("Overlap correlations:")
    print(corr_df.round(3))


In [None]:
# Welch t-test (unequal variances) of each metric between fear days (flag 0)
# and greed days (flag 1).
results = []
if not df_overlap.empty:
    fear_rows = df_overlap['sentiment_flag'] == 0
    greed_rows = df_overlap['sentiment_flag'] == 1
    for k in ['volume','avg_price','win_rate','pnl_sum','pnl_mean','long_short_imbalance']:
        if k not in df_overlap.columns:
            continue
        x0 = df_overlap.loc[fear_rows, k].dropna()
        x1 = df_overlap.loc[greed_rows, k].dropna()
        # Need at least two observations per regime; threshold kept low since data is tiny.
        if len(x0) <= 1 or len(x1) <= 1:
            continue
        t, p = stats.ttest_ind(x0, x1, equal_var=False)
        results.append({
            'metric': k,
            'fear_mean': x0.mean(),
            'greed_mean': x1.mean(),
            't_stat': t,
            'p_value': p,
        })

if not results:
    print(" No regime differences computed (tiny overlap).")
else:
    # Every tested metric is saved and printed, ordered by p-value;
    # no significance cut-off is applied here.
    regime_df = pd.DataFrame(results).sort_values('p_value')
    regime_df.to_csv(CSV_DIR/'regime_differences.csv', index=False)
    print("Significant regime differences (small-sample):")
    print(regime_df.round(3))


In [None]:
# Headline coverage numbers for both datasets and their overlap.
trader_dates = daily['date']
sentiment_dates = senti['date']

summary = {
    'n_trader_days': int(trader_dates.nunique()),
    'trader_date_range': [str(trader_dates.min()), str(trader_dates.max())],
    'n_sentiment_days': int(sentiment_dates.nunique()),
    'sentiment_date_range': [str(sentiment_dates.min()), str(sentiment_dates.max())],
    'n_overlap_days': int(df_overlap['date'].nunique()),
}

# Persist as pretty-printed JSON next to the processed CSVs.
(CSV_DIR/'summary.json').write_text(json.dumps(summary, indent=2))

print("Saved summary.json")
# Trailing expression so the notebook displays the dict.
summary


In [None]:
# === Cell 11: Rebuild outputs & generate ds_report.pdf ===
import json, textwrap, glob
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from fpdf import FPDF
from scipy import stats

# Re-declare the project folders so this cell can run on its own.
ROOT = Path("/content/ds_arkin_kansra")
CSV_DIR = ROOT / "csv_files"
OUT_DIR = ROOT / "outputs"
for folder in (CSV_DIR, OUT_DIR):
    folder.mkdir(parents=True, exist_ok=True)

# -----------------------------
# 1. Rebuild summary.json if missing
# -----------------------------
summary_path = CSV_DIR/"summary.json"
if not summary_path.exists():
    overlap_dates = df.dropna(subset=['volume','sentiment_flag'])['date']
    summary = {
        'n_trader_days': int(daily['date'].nunique()),
        'trader_date_range': [str(daily['date'].min()), str(daily['date'].max())],
        'n_sentiment_days': int(senti['date'].nunique()),
        'sentiment_date_range': [str(senti['date'].min()), str(senti['date'].max())],
        'n_overlap_days': int(overlap_dates.nunique()),
    }
    summary_path.write_text(json.dumps(summary, indent=2))
    print("✅ Rebuilt summary.json")

# Always reload from disk so the report reflects what was actually saved.
summary = json.loads(summary_path.read_text())

# -----------------------------
# 2. Rebuild correlations if missing
# -----------------------------
corr_path = CSV_DIR/"overlap_correlations.csv"
if not corr_path.exists():
    df_overlap = df.dropna(subset=['volume','sentiment_flag'])
    corrs = {}
    # With no overlapping days nothing is written and corr_df stays None below.
    if not df_overlap.empty:
        metric_names = ['volume','avg_price','win_rate','pnl_sum','pnl_mean','long_short_imbalance']
        corrs = {
            name: df_overlap[name].corr(df_overlap['sentiment_flag'])
            for name in metric_names
            if name in df_overlap.columns
        }
        rebuilt = pd.DataFrame.from_dict(corrs, orient='index', columns=['same_day_corr'])
        rebuilt.to_csv(corr_path)
        print("✅ Rebuilt overlap_correlations.csv")

# Load from disk when available; None signals "no correlation section".
corr_df = None
if corr_path.exists():
    corr_df = pd.read_csv(corr_path)

# -----------------------------
# 3. Rebuild regime differences if missing
# -----------------------------
regime_path = CSV_DIR/"regime_differences.csv"
if not regime_path.exists():
    # Derive the overlap here rather than relying on a `df_overlap` left over
    # from another step: within this cell it is only assigned when the
    # correlations CSV had to be rebuilt, so it could be undefined or stale.
    df_overlap = df.dropna(subset=['volume','sentiment_flag'])
    results = []
    if not df_overlap.empty:
        for k in ['volume','avg_price','win_rate','pnl_sum','pnl_mean','long_short_imbalance']:
            if k in df_overlap.columns:
                # Fear days (flag 0) vs greed days (flag 1), missing values dropped.
                x0 = df_overlap.loc[df_overlap['sentiment_flag']==0, k].dropna()
                x1 = df_overlap.loc[df_overlap['sentiment_flag']==1, k].dropna()
                # Welch t-test needs at least two observations per group.
                if len(x0) > 1 and len(x1) > 1:
                    t, p = stats.ttest_ind(x0, x1, equal_var=False)
                    results.append({'metric': k, 'fear_mean': x0.mean(), 'greed_mean': x1.mean(), 't_stat': t, 'p_value': p})
    if results:
        pd.DataFrame(results).to_csv(regime_path, index=False)
        print("✅ Rebuilt regime_differences.csv")

# None signals "no regime section" to the report code below.
regime_df = pd.read_csv(regime_path) if regime_path.exists() else None

# -----------------------------
# 4. PDF Report
# -----------------------------
def _write_wrapped(pdf, line, wrap_width=90, line_height=8):
    """Write ``line`` to ``pdf`` as wrapped body text starting at the left margin.

    Args:
        pdf: The FPDF document being built.
        line: Text to write; pre-wrapped to ``wrap_width`` characters.
        wrap_width: Maximum characters per chunk handed to ``multi_cell``.
        line_height: Height of each rendered line, in document units.
    """
    chunks = textwrap.wrap(line, width=wrap_width, break_long_words=True, break_on_hyphens=True)
    if not chunks:  # edge case: empty line
        chunks = [line]
    for chunk in chunks:
        # multi_cell leaves the cursor to the RIGHT of the cell by default, so
        # a second call would start past the right margin. Returning to the
        # left margin each time lets width 0 ("up to the right margin") work
        # and keeps every line inside the page.
        pdf.multi_cell(0, line_height, chunk, new_x="LMARGIN", new_y="NEXT")


pdf = FPDF()
pdf.set_left_margin(15)
pdf.set_right_margin(15)
pdf.add_page()

# Title
pdf.set_font("helvetica", 'B', 16)
pdf.cell(0, 10, "Data Science Assignment Report", new_x="LMARGIN", new_y="NEXT", align="C")

pdf.set_font("helvetica", '', 12)
pdf.ln(10)
pdf.multi_cell(0, 8, "Candidate: Arkin Kansra", new_x="LMARGIN", new_y="NEXT")
pdf.ln(5)

# Section 1: Summary
pdf.set_font("helvetica", 'B', 14)
pdf.cell(0, 10, "1. Dataset Summary", new_x="LMARGIN", new_y="NEXT")
pdf.set_font("helvetica", '', 12)

for k, v in summary.items():
    # convert list → comma-separated, everything to str
    val = ", ".join(map(str, v)) if isinstance(v, list) else str(v)

    # clean up underscores and brackets
    k_pretty = k.replace("_", " ").title()
    safe_val = val.replace("[", "").replace("]", "").replace("'", "").replace("_"," ")

    _write_wrapped(pdf, f"- {k_pretty}: {safe_val}")
pdf.ln(5)


# Section 2: Correlations
if corr_df is not None and not corr_df.empty:
    pdf.set_font("helvetica", 'B', 14)
    pdf.cell(0, 10, "2. Correlation Results", new_x="LMARGIN", new_y="NEXT")
    pdf.set_font("helvetica", '', 12)

    for _, row in corr_df.iterrows():
        # The CSV was written with its index, so the metric name comes back in
        # an unnamed first column; otherwise fall back to the row label.
        metric = row.iloc[0] if 'Unnamed: 0' in row.index else row.name
        safe_metric = str(metric).replace("_", " ")
        _write_wrapped(pdf, f"{safe_metric}: {row['same_day_corr']:.3f}")
    pdf.ln(5)


# Section 3: Regime Differences
if regime_df is not None and not regime_df.empty:
    pdf.set_font("helvetica", 'B', 14)
    pdf.cell(0, 10, "3. Regime Differences (Fear vs Greed)", new_x="LMARGIN", new_y="NEXT")
    pdf.set_font("helvetica", '', 12)

    for _, row in regime_df.iterrows():
        safe_metric = str(row['metric']).replace("_", " ")
        line = (f"{safe_metric} | Fear mean={row['fear_mean']:.3f}, "
                f"Greed mean={row['greed_mean']:.3f}, "
                f"p={row['p_value']:.3f}")
        _write_wrapped(pdf, line)
    pdf.ln(5)

# Section 4: Visualizations
pdf.set_font("helvetica", 'B', 14)
pdf.cell(0, 10, "4. Key Visualizations", new_x="LMARGIN", new_y="NEXT")
pdf.set_font("helvetica", '', 12)
pdf.multi_cell(0, 8, "Below are the main charts generated during analysis:")
pdf.ln(5)

# Collect every PNG/JPG in the outputs folder; each gets its own page,
# in sorted path order.
figure_files = []
for pattern in ("*.png", "*.jpg"):
    figure_files.extend(glob.glob(str(OUT_DIR / pattern)))

for img_path in sorted(figure_files):
    pdf.add_page()
    pdf.set_font("helvetica", 'I', 12)
    caption = f"Figure: {Path(img_path).name}"
    pdf.cell(0, 10, caption, new_x="LMARGIN", new_y="NEXT")
    pdf.image(img_path, x=15, w=180)

# Save
out_path = ROOT/"ds_report.pdf"
pdf.output(str(out_path))
print(f"✅ Report generated at {out_path.resolve()}")
