In [7]:
import numpy as np
import pandas as pd
from pydantic import BaseModel

In [8]:
def highlight_greaterthan(value, threshold, color='yellow'):
    """Return a CSS background rule for numeric values at or above ``threshold``.

    Args:
        value: Cell value to test; only ``int``/``float`` instances can qualify.
        threshold: Inclusive lower bound a value must reach to be highlighted.
        color: CSS colour used for the background.

    Returns:
        ``'background-color: <color>'`` when the value qualifies, otherwise ``''``.
    """
    if not isinstance(value, (int, float)):
        return ''
    return f'background-color: {color}' if value >= threshold else ''

def highlight_series(series, idx, color="green"):
    """Turn ``series`` into a one-row styled table with large entries highlighted.

    Every cell whose value is at or above ``series[idx]`` gets a ``color``
    background, as decided by ``highlight_greaterthan``.

    Args:
        series: The pandas Series to display.
        idx: Label of the entry whose value serves as the threshold.
        color: CSS colour passed through to ``highlight_greaterthan``.

    Returns:
        The Styler obtained from the transposed single-column frame.
    """
    def cell_style(cell):
        # series[idx] is read on every call, exactly when the cell is styled.
        return highlight_greaterthan(cell, series[idx], color)

    row_frame = series.to_frame().T
    return row_frame.style.map(cell_style)

# --------------------------------------------------------------------------------------
class Span(BaseModel):
    """Pair of integer bounds, ``low`` and ``high``."""
    # Lower bound; get_split counts positions as high - low + 1, i.e. both ends inclusive.
    low: int
    # Upper bound.
    high: int
    
def get_split(span: Span):
    """Return the index that splits ``span`` at its midpoint.

    The span is counted inclusively (``high - low + 1`` positions). For an odd
    count the result is the middle index; for an even count it is the first
    index of the upper half.
    """
    width = span.high - span.low + 1
    half, _ = divmod(width, 2)
    return span.low + half
    
# --------------------------------------------------------------------------------------
def check_split(a, b, a_split, b_split, **kwargs):
    """Check whether a pair of split points partitions ``a`` and ``b`` consistently.

    ``a[:a_split]`` and ``b[:b_split]`` form the lower partition and the
    remainders form the upper one. Only the elements adjacent to each split are
    compared. An empty side is represented by -inf (lower) or +inf (upper), so
    splits at ``0`` or ``len(seq)`` are handled without indexing out of range.

    Args:
        a: First sequence, indexable by position (e.g. a list, or a pandas
            Series with a default RangeIndex). Assumed sorted ascending.
        b: Second sequence, same requirements as ``a``.
        a_split: Split position in ``a``, in ``0..len(a)``.
        b_split: Split position in ``b``, in ``0..len(b)``.
        **kwargs: Ignored; allows callers to unpack a whole estimate dict.

    Returns:
        dict with boolean entries:
            ``lower_split``: last lower element of ``a`` exceeds first upper element of ``b``.
            ``raise_split``: last lower element of ``b`` exceeds first upper element of ``a``.
            ``is_valid``: neither of the two flags is set.

    Raises:
        IndexError: if a split lies outside ``0..len(seq)``.
    """
    for label, seq, split in (("a", a, a_split), ("b", b, b_split)):
        if not 0 <= split <= len(seq):
            raise IndexError(f"{label}_split={split} is outside 0..{len(seq)}")

    neg_inf, pos_inf = float("-inf"), float("inf")
    # Sentinels stand in for the missing neighbour when a partition side is empty.
    a_left = a[a_split - 1] if a_split > 0 else neg_inf
    a_right = a[a_split] if a_split < len(a) else pos_inf
    b_left = b[b_split - 1] if b_split > 0 else neg_inf
    b_right = b[b_split] if b_split < len(b) else pos_inf

    # looking at split in 'a' ...
    lower_split = bool(a_left > b_right)
    raise_split = bool(b_left > a_right)

    is_valid = not (lower_split or raise_split)

    return dict(is_valid=is_valid, lower_split=lower_split, raise_split=raise_split)

def estimate_splits(a, b, a_split, b_split, a_span, status):
    """Propose the next pair of splits after the current pair was rejected.

    Performs one binary-search step on the split position in ``a``. The rejected
    ``a_split`` is excluded from the narrowed span so every step makes progress;
    keeping it in the span lets ``get_split`` return the same split again, which
    makes the caller loop forever. ``b_split`` is shifted by the opposite amount
    so that ``a_split + b_split`` stays constant.

    Args:
        a: Unused; accepted so callers can pass the same arguments as ``check_split``.
        b: Unused; see ``a``.
        a_split: Current split position in ``a``.
        b_split: Current split position in ``b``.
        a_span: ``Span`` of candidate split positions still under consideration.
        status: dict with boolean ``lower_split`` and ``raise_split`` entries, as
            returned by ``check_split``.

    Returns:
        dict with ``a_split``, ``b_split`` and ``a_span`` for the next check. When
        neither flag is set there is nothing to refine and the inputs are
        returned unchanged.
    """
    if status["raise_split"]:
        # a_split must grow: only positions above the rejected one remain.
        # If a_split was already a_span.high, the span becomes empty and
        # get_split yields a_span.high + 1.
        new_span = Span(low=a_split + 1, high=a_span.high)
    elif status["lower_split"]:
        # a_split must shrink: only positions below the rejected one remain.
        new_span = Span(low=a_span.low, high=a_split - 1)
    else:
        return dict(
            a_split = a_split,
            b_split = b_split,
            a_span = a_span,
        )
    new_a_split = get_split(new_span)
    # Move b's split the other way so the lower partition keeps its total size.
    new_b_split = b_split - (new_a_split - a_split)
    return dict(
        a_split = new_a_split,
        b_split = new_b_split,
        a_span = new_span,
    )

# --------------------------------------------------------------------------------------
def calculate_median(a, b, est):
    """Compute the median of two sequences from a valid pair of split points.

    ``a[:a_split]`` and ``b[:b_split]`` are taken as the lower partition. For an
    even total length the median is the mean of the largest lower element and
    the smallest upper element; for an odd total it is the smallest upper
    element. An empty partition side contributes -inf / +inf, so splits at ``0``
    or ``len(seq)`` do not index out of range.

    Args:
        a: First sequence, indexable by position. Assumed sorted ascending.
        b: Second sequence, same requirements as ``a``.
        est: Mapping with ``a_split`` and ``b_split`` entries (other keys are
            ignored). The splits are assumed to have passed ``check_split``.

    Returns:
        The median: a float for an even total length, otherwise the selected element.

    Raises:
        ValueError: if both sequences are empty.
    """
    n = len(a) + len(b)
    if n == 0:
        raise ValueError("cannot compute the median of two empty sequences")
    a_split = est["a_split"]
    b_split = est["b_split"]
    neg_inf, pos_inf = float("-inf"), float("inf")

    # Smallest element of the upper partition; needed for both parities.
    min_right = min(
        a[a_split] if a_split < len(a) else pos_inf,
        b[b_split] if b_split < len(b) else pos_inf,
    )
    if n % 2 != 0:
        return min_right

    # Largest element of the lower partition.
    max_left = max(
        a[a_split - 1] if a_split > 0 else neg_inf,
        b[b_split - 1] if b_split > 0 else neg_inf,
    )
    return (max_left + min_right) / 2

---

In [3]:
# Draw random lengths for the two arrays (na from 50..59, nb from 60..69).
# No seed is set, so the values differ from run to run.
na = np.random.randint(50, high=60)
nb = np.random.randint(60, high=70)
n = na + nb

# Inclusive index ranges of a, b, and of the combined array.
a_range = Span(low=0, high=na - 1)
b_range = Span(low=0, high=nb - 1)
combined_range = Span(low=0, high=n - 1)

# Number of elements that belong to the lower partition of the combined array.
split_index = get_split(combined_range)

# Initial guess: split a at its midpoint and give b the rest of the lower partition.
a_split = get_split(a_range)
b_split = split_index - a_split

print(f"{na}, {nb}, {n}")
a_split, b_split, split_index

54, 61, 115


(27, 30, 57)

In [11]:
# Two sorted Series of random integers in 0..99; reset_index makes labels equal positions,
# which the split helpers rely on when they index with a[i] / b[i].
a = (
    pd.Series(np.random.randint(0, high=100, size=na))
    .sort_values()
    .reset_index(drop=True)
)
b = (
    pd.Series(np.random.randint(0, high=100, size=nb))
    .sort_values()
    .reset_index(drop=True)
)
# Initial estimate in the dict shape that check_split / estimate_splits accept via **est.
est = {"a_split": a_split, "b_split": b_split, "a_span": a_range}

Algorithm outline:
- start with `a`, `b`, and the initial splits
- is the split valid? (`check_split`)
  - yes: done
  - no: refine the splits with `estimate_splits`, then check again

In [12]:
# Show each array with every value at or above the value at its midpoint split highlighted.
display(a.pipe(highlight_series, get_split(a_range), "lightgreen"))
display(b.pipe(highlight_series, get_split(b_range), "lightblue"))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53
0,1,2,3,7,8,10,10,13,14,14,16,17,23,31,37,37,38,40,40,42,44,50,52,53,54,55,56,58,60,63,64,64,67,67,67,68,69,70,71,73,73,74,76,78,79,79,83,85,89,92,95,96,98,98


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60
0,0,1,5,5,7,7,9,10,14,17,18,21,23,29,31,32,34,36,39,40,41,42,44,44,45,45,47,51,52,55,55,58,59,60,61,62,65,65,65,69,69,69,72,72,72,73,74,74,76,78,84,85,85,85,87,87,90,93,94,94,96


In [13]:
# Reference answer: merge both arrays, sort, and take the median of the combined Series.
c = (
    pd.concat([a, b], ignore_index=True)
    .sort_values()
    .reset_index(drop=True)
)
display(c.pipe(highlight_series, split_index, "aqua"))
# Kept for the comparison against the split-based result later in the notebook.
med_from_sorted = c.median()
print(f"median = {med_from_sorted}")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114
0,0,1,1,2,3,5,5,7,7,7,8,9,10,10,10,13,14,14,14,16,17,17,18,21,23,23,29,31,31,32,34,36,37,37,38,39,40,40,40,41,42,42,44,44,44,45,45,47,50,51,52,52,53,54,55,55,55,56,58,58,59,60,60,61,62,63,64,64,65,65,65,67,67,67,68,69,69,69,69,70,71,72,72,72,73,73,73,74,74,74,76,76,78,78,79,79,83,84,85,85,85,85,87,87,89,90,92,93,94,94,95,96,96,98,98


median = 56.0


In [None]:
# Check the initial estimate; the extra "a_span" key in est is absorbed by **kwargs.
split_status = check_split(a, b, **est)
split_status

In [None]:
print(est)
print(split_status)
print("-" * 100)

# A binary search over the split positions of `a` needs far fewer than len(a) steps.
# The cap turns a search that stops making progress into an error instead of a
# kernel that hangs in an endless loop.
max_steps = len(a) + 2
steps_left = max_steps
while not split_status["is_valid"]:
    if steps_left == 0:
        raise RuntimeError(f"split search did not converge within {max_steps} steps")
    steps_left -= 1
    est = estimate_splits(a, b, **est, status=split_status)
    print(est)
    split_status = check_split(a, b, **est)
    print(split_status)
    print("-" * 100)

# Show both arrays again, now highlighted from the accepted split positions.
display(a.pipe(highlight_series, est["a_split"], "lightgreen"))
display(b.pipe(highlight_series, est["b_split"], "lightblue"))

In [None]:
# Median from the accepted splits, compared with the one taken from the fully sorted array.
med_from_algo = calculate_median(a, b, est)
print(f"median: {med_from_algo} == {med_from_sorted} -> {med_from_algo == med_from_sorted}")

First check:
1) $\max(A_{lower}) \le \min(B_{upper})$
2) $\max(B_{lower}) \le \min(A_{upper})$

if (1) and (2) and n is even then median = 
$\text{med}(\text{sorted}(\max(A_{lower}), \max(B_{lower}), \min(A_{upper}), \min(B_{upper})))$

if (1) and (2) and n is odd then median = $\min(\min(A_{upper}), \min(B_{upper}))$

if (1) and not (2):

$A_{lower}     
  \begin{cases}
    \leq A_{upper} \\
    \leq B_{upper}
  \end{cases}
$  
and  
$B_{upper}     
  \begin{cases}
    \geq A_{lower} \\
    \geq B_{lower}
  \end{cases}
$  
so we can eliminate $A_{lower}$ and $B_{upper}$ from consideration.

if (2) and not (1):

$B_{lower}     
  \begin{cases}
    \leq A_{upper} \\
    \leq B_{upper}
  \end{cases}
$  
and  
$A_{upper}     
  \begin{cases}
    \geq A_{lower} \\
    \geq B_{lower}
  \end{cases}
$  
so we can eliminate $A_{upper}$ and $B_{lower}$ from consideration.

