# Matching markets using embeddings

Semantically match markets by event title and market description across platforms using embeddings.

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
pd.options.display.max_columns = 200

# Models to use: dunzhang/stella_en_1.5B_v5, dunzhang/stella_en_400M_v5
# NOTE: `.cuda()` moves the model to the GPU, so this cell needs a CUDA device.
model = SentenceTransformer('dunzhang/stella_en_1.5B_v5', trust_remote_code=True).cuda()

# Load market data
kalshi = pd.read_json('kalshi_markets_2025-04-23.json')
polymarket = pd.read_json('polymarket_markets_2025-04-23.json')

# Create a column for the retrieval: one free-text description per market.
# `close_time` is converted element-wise with `.astype(str)`. Wrapping the
# column in `str(...)` stringifies the *whole* Series, which appends the same
# (truncated) Series repr to every row instead of each row's own end date.
kalshi["bet_description"] = (
    kalshi["title"] + " " + kalshi["subtitle"]
    + "\n" + kalshi["rules_primary"]
    + "\nEnd date: " + kalshi["close_time"].astype(str)
)
polymarket["bet_description"] = (
    polymarket["question"] + " " + polymarket["description"]
    + "\n" + "\nEnd date: " + polymarket["end_date_iso"]
)

# Concatenating with a missing text field yields a missing description, so
# rows without a usable description are dropped here.
polymarket_subset = polymarket.dropna(subset=["bet_description"])
# Keep a single row per Kalshi event. Chaining and assigning the result avoids
# calling an in-place method on a frame derived from `kalshi`.
kalshi_subset = (
    kalshi
    .dropna(subset=["bet_description"])
    .drop_duplicates(subset=["event_ticker"])
)
print("Polymarkets", len(polymarket_subset))
print("Kalshi", len(kalshi_subset))


In [None]:
# Spot-check a single Polymarket record by position. 9307 is a hard-coded row
# position; `.iloc` raises IndexError if the subset has fewer rows than that.
polymarket_subset.iloc[9307]

## Start inference

In [None]:
!mkdir /polymarket/embeddings/

In [None]:
# Encode every Polymarket description; these vectors are the documents that
# the Kalshi queries are compared against.
polymarket_texts = polymarket_subset['bet_description'].tolist()
doc_embeddings = model.encode(polymarket_texts)

# Persist the document embeddings to disk.
np.save('polymarket_embeddings.npy', doc_embeddings)

In [None]:
# Instruction prefix prepended to every Kalshi description before encoding.
query_prompt = "Instruct: Given a prediction market event, retrieve the exact matching prediction based on date and rules.\nQuery: "

# Build one prompted query string per Kalshi market, then embed them all.
kalshi_queries = (query_prompt + kalshi_subset['bet_description']).tolist()
query_embeddings = model.encode(kalshi_queries)

# Persist the query embeddings to disk.
np.save('kalshi_embeddings.npy', query_embeddings)

In [None]:
# Pairwise similarity scores between every Kalshi query embedding and every
# Polymarket document embedding.
similarities = model.similarity(query_embeddings, doc_embeddings)

# Keep only the best five matches per query rather than fully sorting every
# row: a full sort makes `top_5` as wide as the whole Polymarket corpus, which
# bloats every later step that walks over it. `topk` returns (values, indices)
# ordered from highest to lowest score. The clamp covers a corpus with fewer
# than five documents.
num_matches = min(5, similarities.shape[1])
top_5_prob, top_5 = similarities.topk(num_matches, dim=1)

In [None]:
# Show the five best similarity scores for the first 15 Kalshi queries.
top_5_prob[:15, :5]

In [None]:
# Convert the index tensor to a NumPy array explicitly before building the
# frame. This guarantees plain integer cells (needed for the later lookup by
# row position) and also works if the tensor lives on the GPU, instead of
# relying on how pandas happens to coerce a torch tensor.
search = pd.DataFrame(top_5.cpu().numpy())

In [None]:
# Lookup table: row position within `polymarket_subset` -> that row's question.
result_map = dict(enumerate(polymarket_subset["question"]))

In [None]:
# Attach the Kalshi market title that produced each row of matches.
kalshi_titles = kalshi_subset["title"].to_list()
search["question"] = kalshi_titles

In [None]:
# Swap every cell that equals a key of `result_map` (a Polymarket row
# position) for the question text it maps to.
search = search.replace(to_replace=result_map)

In [None]:
# Preview the first 20 rows of the match table.
search.head(n=20)