In [None]:
import pandas as pd
from src.config import PRICE_FILE  # path to prices.parquet

# ⬇️  tell pandas not to truncate anything
pd.set_option("display.max_rows",    None)   # show all rows
pd.set_option("display.max_columns", None)   # show all columns
pd.set_option("display.width",       None)   # don't wrap long lines
pd.set_option("display.max_colwidth", None)  # don't truncate column names

df = pd.read_parquet(PRICE_FILE)

df    # or display(df) – will now render the entire table

In [14]:
import pandas as pd
gt  = pd.read_csv("data/PROXY_MATCHES_TRAINING_DATA(in).csv")[["target", "proxy"]]
res = pd.read_csv("data/match_top10.csv")
k = 10  # number of top proxies to consider
merged = gt.merge(res, on="target", how="left")
top1  = (merged["proxy"] == merged["proxy1"]).mean()
topk  = merged.apply(
          lambda r: r["proxy"] in [r[f"proxy{i}"] for i in range(1,11)], axis=1
        ).mean()
print("Top-1  :", round(top1*100, 1), "%")
print("Top-10  :", round(topk*100, 1), "%")

Top-1  : 14.9 %
Top-10  : 54.9 %


In [36]:
import pandas as pd

# ── 1. load & clean ground-truth table ─────────────────────────────
gt = (
    pd.read_csv("data/PROXY_MATCHES_TRAINING_DATA(in).csv")[["target", "proxy"]]
      .dropna()                                   # drop blank rows
      .drop_duplicates(subset="target")           # keep one row per target
      .assign(proxy=lambda d: d["proxy"].str.strip().str.upper())
)

# ── 2. load & clean predictions (proxy1 … proxy10) ─────────────────
res = pd.read_csv("data/match_top10.csv")
for i in range(1, 11):
    res[f"proxy{i}"] = res[f"proxy{i}"].str.strip().str.upper()

# ── 3. align by target & evaluate ─────────────────────────────────
merged = gt.merge(res, on="target", how="left")

top1_hits = merged["proxy"] == merged["proxy1"]
top1_num  = top1_hits.sum()
top1_den  = len(gt)

top10_hits = merged.apply(
    lambda r: r["proxy"] in [r.get(f"proxy{i}") for i in range(1, 11)], axis=1
)
top10_num = top10_hits.sum()
top10_den = len(gt)

# ── 4. print results ──────────────────────────────────────────────
print(f"Ground-truth rows   : {top1_den}")
print(f"Predictions rows    : {len(res)}\n")

print(f"Top-1 accuracy      : {top1_num}/{top1_den}  ({top1_num/top1_den:.1%})")
print(f"Top-10 accuracy     : {top10_num}/{top10_den}  ({top10_num/top10_den:.1%})")



Ground-truth rows   : 270
Predictions rows    : 354

Top-1 accuracy      : 51/270  (18.9%)
Top-10 accuracy     : 123/270  (45.6%)


In [53]:
import pandas as pd

# ── 1. load & clean ground-truth table ─────────────────────────────
gt = (
    pd.read_csv("data/PROXY_MATCHES_TRAINING_DATA(in).csv")[["target", "proxy"]]
      .dropna()                                   # drop blank rows
      .drop_duplicates(subset="target")           # keep one row per target
      .assign(proxy=lambda d: d["proxy"].str.strip().str.upper())
)

# ── 2. load & clean predictions (proxy1 … proxy10) ─────────────────
res = pd.read_csv("data/match_top500.csv")
for i in range(1, 501):
    res[f"proxy{i}"] = res[f"proxy{i}"].str.strip().str.upper()


# ── 3. align by target & evaluate ─────────────────────────────────
merged = gt.merge(res, on="target", how="left")

top1_hits = merged["proxy"] == merged["proxy1"]
top1_num  = top1_hits.sum()
top1_den  = len(gt)

top100_hits = merged.apply(
    lambda r: r["proxy"] in [r.get(f"proxy{i}") for i in range(1, 11)], axis=1
)
top100_num = top100_hits.sum()
top100_den = len(gt)

# ── 4. print results ──────────────────────────────────────────────
print(f"Ground-truth rows   : {top1_den}")
print(f"Predictions rows    : {len(res)}\n")



print(f"Top-1 accuracy      : {top1_num}/{top1_den}  ({top1_num/top1_den:.1%})")
print(f"Top-500 accuracy     : {top100_num}/{top100_den}  ({top100_num/top100_den:.1%})")

Ground-truth rows   : 270
Predictions rows    : 354

Top-1 accuracy      : 50/270  (18.5%)
Top-500 accuracy     : 120/270  (44.4%)


In [51]:
import pandas as pd, json, duckdb, numpy as np

truth  = pd.read_csv("data/PROXY_MATCHES_TRAINING_DATA(in).csv")
truth  = truth.dropna(subset=["proxy"]).reset_index(drop=True)

matches = pd.read_csv("data/match_top500.csv")          # your latest run
proxy_profiles = json.load(open("data/profiles/proxy.json"))

# Get price matrix columns once
cols = duckdb.sql("SELECT * FROM 'data/prices.parquet'").fetchdf().columns

def bucket(row):
    target = row["target"]
    real   = row["proxy"]          # ground-truth proxy
    # ➊ was it in candidate list at all?
    cand_cols = [c for c in matches.columns if c.startswith("proxy")]
    cand_set  = set(matches.loc[matches.target == target, cand_cols].values.ravel())
    if real in cand_set:
        if real in cols:
            return "OK, should count hit"      # sanity
        else:
            return "B  (price-NaN)"
    else:
        return "A  (filtered)"

truth["bucket"] = truth.apply(bucket, axis=1)
print(truth["bucket"].value_counts())


bucket
OK, should count hit    259
A  (filtered)            11
Name: count, dtype: int64
