In [4]:
import pandas as pd
import plotly.express as px
import ipywidgets as w
from ipywidgets.embed import embed_minimal_html 
from ipywidgets import SelectMultiple, VBox, Output
from IPython.display import display
import plotly.express as px
import warnings 
warnings.simplefilter("ignore")

# Sire-level table (gini, price and foal-count columns). The listed numeric
# columns are read with compact dtypes rather than the 64-bit defaults.
sire_data = pd.read_csv(
    "sire_data.csv",
    dtype=dict(
        gini_coef="float32",
        median_price="float32",
        avg_price="float32",
        foals_per_year="float32",
        years_active="int16",
    ),
)

# Sale records from only_sold.csv; only the columns named in `usecols` are loaded.
only_sold = pd.read_csv(
    "only_sold.csv",
    usecols=["Sire", "Description", "Price", "sale_year", "Purchaser"],
    dtype=dict(Price="float32", sale_year="int16"),
)

___

## Overview
I included 3 interactive visuals below based on Keeneland Sept Yearling sales data from 2018-2024. The first visual allows you to look at Sire performances by year - the other two have more detailed explanations. I'll do more work on individual purchasers and sellers at auctions next while also cleaning and integrating OBS breeze data. 

I'd like to build a model that can estimate the price of a 2yo in the OBS sale based on their breeze time along with yearling sales prices, family performance, etc. The OBS results could then be fed back into another model predicting yearling sales, which creates a constantly improving feedback loop that adjusts in real-time. 

[Racing Squared](https://racingsquared.com/) is an example of what's possible: they use pose estimation and computer vision to assess a horse's conformation automatically in addition to all data points I mentioned above. This is an area I've worked in before and fortunately Keeneland/OBS/etc all post walking videos of their horses, so building a similar conformation-based AI tool is do-able given time. But for now...a few visuals.

___

### Sire Sales Performance by Year
- Search for a sire and select
- Clear selections with CTRL+"click" or CMD+"click" on each sire's name or by deleting the typed entry

This is a little cumbersome to use, but was easiest to make quickly. You can exclude sales prices in the 95th percentile or higher using the "Excluding..." option in the Data filter. I would select "Excluding..." when looking at all data unless you specifically want to see some of the high priced outliers, otherwise the visual is compressed and uninterpretable.

The "Full" option will be more useful when looking at individual sires. 

**Notes:**
- When you highlight each box within each year you'll see a bunch of values.
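    - upper_fence/lower_fence: These mark where true outliers begin: the whiskers extend to the most extreme sale prices within 1.5×IQR (IQR = q3 − q1) of the box, and anything beyond them is drawn as an outlier point. They fluctuate based on the data, but as a rough guide they sit somewhere around the 10th and 90th percentiles.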
    - q3: 75th percentile
    - median: 50th percentile
    - q1: 25th percentile

In [10]:
# ── build the two DataFrames up front ──────────────────────────
# `full_df` holds every sale; `subset_df` keeps only rows priced at or below
# the 95th-percentile price, so a few very expensive lots do not compress the plot.
full_df     = only_sold.copy()
subset_df   = full_df[full_df.Price <= full_df.Price.quantile(.95)]

# a master list of sires (covers both data sets)
# Missing names are dropped first: a NaN mixed in with strings would make
# sorted() raise TypeError, and it could not be matched by the text search either.
all_sires   = sorted(full_df["Sire"].dropna().unique())

# ── widgets ────────────────────────────────────────────────────
# Chooses between the trimmed (<= 95th percentile) and the full data set.
data_toggle = w.ToggleButtons(
    options=["Excluding >95th Percentile", "Full"],
    description="Data:",
)

# Free-text box used to narrow the sire list below.
sire_search = w.Text(
    placeholder="Search sire…",
    layout=w.Layout(width="50%"),
)

# Multi-select list of sires; starts out offering every sire.
sire_multiselect = w.SelectMultiple(
    options=all_sires,
    rows=8,
    layout=w.Layout(width="50%"),
)

# Output area the box plot is rendered into.
fig_out = w.Output()

# --------------------------------------------------------------
def _filter_options(text_widget, multi_widget, full_list):
    """Live-filter the options in a SelectMultiple, keep existing selections."""
    pat  = text_widget.value.strip().lower()
    keep = multi_widget.value
    opts = [o for o in full_list if pat in o.lower()]
    multi_widget.options = opts
    # restore any choices that are still present after filtering
    multi_widget.value   = tuple(o for o in keep if o in opts)

def _on_sire_search_change(change):
    """Re-filter the sire list whenever the search text changes."""
    _filter_options(sire_search, sire_multiselect, all_sires)


sire_search.observe(_on_sire_search_change, names="value")

# --------------------------------------------------------------
def redraw(_=None):
    """Re-render the price box plot from the current widget state.

    The single ignored argument lets this function be used directly as a
    widget observer callback as well as being called with no arguments.
    """
    # 1 pick the data set requested by the toggle
    if data_toggle.value.startswith("Excluding"):
        plot_df = subset_df
    else:
        plot_df = full_df

    # 2 narrow to the chosen sires; an empty selection leaves all sires in
    chosen = sire_multiselect.value
    if chosen:
        plot_df = plot_df[plot_df["Sire"].isin(chosen)]

    # 3 replace the previous figure inside the output widget
    with fig_out:
        fig_out.clear_output(wait=True)
        figure = px.box(
            plot_df,
            x="sale_year",
            y="Price",
            hover_data=["Sire", "Description", "Purchaser"],
            title="Keeneland Sept Yearling Sales by Sire",
        )
        figure.show()

# redraw whenever the data toggle or the sire selection changes
data_toggle.observe(redraw, names="value")
sire_multiselect.observe(redraw, names="value")

# draw once so the output area is populated before it is shown
redraw()

# ── layout ────────────────────────────────────────────────────
# controls stacked above the figure
display(w.VBox(children=[
    data_toggle,
    w.HTML("<b>Sire filter</b>"),
    sire_search,
    sire_multiselect,
    fig_out,
]))

VBox(children=(ToggleButtons(description='Data:', options=('Excluding >95th Percentile', 'Full'), value='Exclu…

___

### Sire Sales Performance vs Gini

"Gini" is traditionally used in research for things like income inequality because it captures how similar or different members of a population are from one another. A gini of 0.0 indicates the highest degree of similarity, meaning all outcomes (or in this case yearling sales prices) are identical - i.e. zero variation in a sire's yearling prices. Conversely, if a sire's progeny sold in the range of 5k to 5M, the sire would receive a gini much closer to 1.0 (extreme volatility).

If a sire has a relatively high gini and a relatively low median price, that could indicate that the sire is more risky since the potential sales price is more volatile and is lower in general (see: Improbable). The opposite is true for Into Mischief whose low gini is matched by the highest median price. There's obvious bias here because high performing sires have their choice of optimal mares, but directionally I think the trend holds. If you want to see how younger sires are performing, then change the "Years active" slider.

**Notes:**
- I did not include "Out" and "RNA" values here which means this is skewed data. Directionally it's probably correct, but it may be the case that Blame (0.497 gini) RNAs 2x as much as Volatile (0.562 gini), potentially making Blame a less desirable option all things considered. 
- Gini is less reliable for Sires with low foal counts hence the large spread for sires under 50k - I would use a Foal count of 10 or more.
- Many of these sires have been active for more than 7 years, but I only have data back to 2017 so this is all I can work with.

In [14]:
def plot_dynamic(circle_size="foals_per_year"):
    """
    Display an interactive scatter of sire_data with:
      • x-axis  : gini_coef
      • y-axis  : median_price
      • size / colour : <circle_size> column
      • filters :
            foals_per_year ≥ F
            years_active between [Ymin, Ymax]

    Parameters
    ----------
    circle_size : str
        Name of the sire_data column used for both marker size and colour.

    Returns
    -------
    None
        The controls and figure are shown with display().
    """
    # ── widgets ────────────────────────────────────────────────
    foal_min = w.IntText(value=10, description="Foals per year ≥", step=1)

    # slider bounds come from the observed range of years_active
    yr_min, yr_max = int(sire_data.years_active.min()), int(sire_data.years_active.max())
    year_range = w.IntRangeSlider(
        value=[yr_min, yr_max], min=yr_min, max=yr_max, step=1,
        description="Years active",
        continuous_update=False, layout=w.Layout(width="75%")
    )

    fig_out = w.Output()

    # ── redraw helper ─────────────────────────────────────────
    def redraw(*_):
        """Filter sire_data by the current control values and re-plot."""
        lo, hi = year_range.value
        df = sire_data.loc[
            (sire_data.foals_per_year >= foal_min.value) &
            (sire_data.years_active.between(lo, hi))
        ]
        with fig_out:
            fig_out.clear_output(wait=True)
            # Show a message instead of a chart when the thresholds exclude
            # every sire.
            if df.empty:
                print("No data after filters.")
                return
            px.scatter(
                df.reset_index(),
                x="gini_coef", y="median_price",
                size=circle_size, color=circle_size,
                hover_name="Sire",
                color_continuous_scale="plasma",
                title="Sire scatter (interactive thresholds)"
            ).show()

    # update on any control change
    foal_min.observe(redraw, names="value")
    year_range.observe(redraw, names="value")

    redraw()  # initial draw

    # ── lay out the controls and figure ───────────────────────
    display(w.VBox([
        w.HBox([foal_min, year_range]),
        fig_out
    ]))

# launch the UI; the argument names the sire_data column used for marker size and colour
plot_dynamic(circle_size="foals_per_year")        # or another column name

VBox(children=(HBox(children=(IntText(value=10, description='Foals per year ≥'), IntRangeSlider(value=(1, 7), …

___

### Gini vs Median Price Correlation

This visual is meant to show how the relationship between gini (sales price volatility) and median price evolves over time (by a sire's years active). A positive correlation tells us that when gini goes up (i.e. volatility increases) then median price also goes up, so the two move in the same direction. Normally I would expect this relationship to be negative because more volatility would equate to a less desirable sire, and a less desirable sire would lead to lower sales prices.

What's interesting is that the correlation is positive for sires in their second year. So a 2nd year crop tends to see **higher volatility present at the same time as higher prices** - i.e. buyers are taking big-time shots on horses and presumably gambling on a sire's first crop performing well as 2yo's. I would assume a sire's second crop is a highly active (and potentially highly profitable) time for good pinhookers hoping to flip at the spring breeze sales, but I would need data to back that up. 

**Notes:**
1. Filter by foals >= 10 at minimum, otherwise the small sample sires distort the picture.
2. Changing years active won't change the underlying correlations - I probably could have deleted it. 
3. Correlation statistics are inherently volatile themselves, so like the two outputs above I wouldn't interpret these too literally (yet).

In [23]:
# Working frame for this cell: a copy of sire_data with its index moved into a column.
df0 = sire_data.reset_index().copy()

# ── master lists / limits ─────────────────────────────────────────────────
# slider bounds come from the observed range of years_active
yr_min = int(df0.years_active.min())
yr_max = int(df0.years_active.max())

# ── widgets ───────────────────────────────────────────────────────────────
# minimum foals_per_year a sire needs in order to be included
foal_min = w.IntText(
    value=10,
    description="Foal ≥",
    step=1,
    layout=w.Layout(width="150px"),
)

# years_active window applied as a filter before the correlations are computed
year_range = w.IntRangeSlider(
    value=[yr_min, yr_max],
    min=yr_min,
    max=yr_max,
    step=1,
    description="Years active",
    continuous_update=False,
    layout=w.Layout(width="70%"),
)

# output area for the correlation line plot
plot_out = w.Output()

# ── sire search helper ────────────────────────────────────────────────────
# The sire search box is wired to `_filter_options` once, in the cell that
# creates `sire_search` and `sire_multiselect`. It is deliberately not defined
# or registered again here: a second `sire_search.observe(...)` would run the
# filter twice on every keystroke.

# ── recompute + redraw ────────────────────────────────────────────────────
def redraw(_=None):
    """Recompute the gini/median-price correlation per years_active and plot it.

    Reads the current values of ``foal_min`` and ``year_range``, filters
    ``df0`` accordingly, computes the Pearson correlation between
    ``gini_coef`` and ``median_price`` within each ``years_active`` group,
    and draws the result as a line plot inside ``plot_out``. Groups whose
    correlation is undefined (NaN) are left out. If nothing remains, a short
    message is printed instead of a chart.

    The single ignored argument lets this function be used directly as a
    widget observer callback.
    """
    lo, hi = year_range.value

    # 1 apply filters
    d = df0.loc[
        (df0.foals_per_year >= foal_min.value) &
        (df0.years_active.between(lo, hi))
    ]

    # 2 group by years_active and compute Pearson r
    # Built with an explicit loop rather than groupby.apply so that a
    # selection with no rows yields an empty table (and reaches the message
    # below) instead of raising while the result is being reshaped.
    records = [
        (years, grp["gini_coef"].corr(grp["median_price"]))
        for years, grp in d.groupby("years_active")
    ]
    corr_by_year = (
        pd.DataFrame(records, columns=["years_active", "corr"])
          .dropna(subset=["corr"])
          .sort_values("years_active")
    )

    # 3 draw the line plot
    with plot_out:
        plot_out.clear_output(wait=True)
        if corr_by_year.empty:
            print("No data after filters.")
            return
        fig = px.line(
            corr_by_year, x="years_active", y="corr",
            markers=True,
            title="Correlation (gini coef ↔ median price) by years active"
        )
        # pin the y-axis to the full correlation range so charts are comparable
        fig.update_layout(
            yaxis_title="Correlation [-1,1]", xaxis_title="Years active",
            yaxis=dict(range=[-1, 1])
        )
        fig.show()

# watch the controls that redraw() actually reads; the sire selector is not
# used by the correlation plot, so it is not observed here
for widg in (foal_min, year_range):
    widg.observe(redraw, names="value")

# initial draw, so the chart is present before any control is touched
redraw()

# controls in a row above the plot
ui = w.VBox([
        w.HBox([foal_min, year_range]),
        plot_out
     ])

display(ui)                 # still shows in the notebook

VBox(children=(HBox(children=(IntText(value=10, description='Foal ≥', layout=Layout(width='150px')), IntRangeS…

___