diff --git a/public/_headers b/public/_headers index 7baf6d9..65be62b 100644 --- a/public/_headers +++ b/public/_headers @@ -5,7 +5,7 @@ Referrer-Policy: strict-origin-when-cross-origin Permissions-Policy: camera=(), microphone=(), geolocation=(), payment=(), usb=(), vr=(), magnetometer=(), gyroscope=(), fullscreen=(self), accelerometer=() Strict-Transport-Security: max-age=31536000; includeSubDomains; preload - Content-Security-Policy: default-src 'self'; script-src 'self' https://cdn.plot.ly https://static.cloudflareinsights.com; style-src 'self' 'unsafe-inline' https://fonts.googleapis.com; img-src 'self' data: https: blob:; font-src 'self' https://fonts.gstatic.com data:; connect-src 'self' https://api.github.com https://api.rss2json.com https://cloudflareinsights.com; media-src 'self'; object-src 'none'; base-uri 'self'; form-action 'self'; frame-ancestors 'none'; upgrade-insecure-requests; block-all-mixed-content + Content-Security-Policy: default-src 'self'; script-src 'self' 'unsafe-inline' https://cdn.plot.ly https://static.cloudflareinsights.com; style-src 'self' 'unsafe-inline' https://fonts.googleapis.com; img-src 'self' data: https: blob:; font-src 'self' https://fonts.gstatic.com data:; connect-src 'self' https://api.github.com https://api.rss2json.com https://cloudflareinsights.com; media-src 'self'; object-src 'none'; base-uri 'self'; form-action 'self'; frame-ancestors 'none'; upgrade-insecure-requests; block-all-mixed-content /_astro/* Cache-Control: public, max-age=31536000, immutable diff --git a/src/components/merbench/CombinedFilters.astro b/src/components/merbench/CombinedFilters.astro index c2ccd18..ccb8a6f 100644 --- a/src/components/merbench/CombinedFilters.astro +++ b/src/components/merbench/CombinedFilters.astro @@ -677,11 +677,11 @@ const { difficulties, providers } = Astro.props; (cb as HTMLInputElement).checked = true; }); - // Update filter count and trigger change event + // Update filter count and trigger change event on all checkboxes updateFilterCount(); - if (checkboxes.length > 0) { - checkboxes[0].dispatchEvent(new Event('change', { bubbles: true })); - } + checkboxes.forEach((cb) => { + cb.dispatchEvent(new Event('change', { bubbles: true })); + }); }); } @@ -692,11 +692,11 @@ const { difficulties, providers } = Astro.props; (cb as HTMLInputElement).checked = false; }); - // Update filter count and trigger change event + // Update filter count and trigger change event on all checkboxes updateFilterCount(); - if (checkboxes.length > 0) { - checkboxes[0].dispatchEvent(new Event('change', { bubbles: true })); - } + checkboxes.forEach((cb) => { + cb.dispatchEvent(new Event('change', { bubbles: true })); + }); }); } diff --git a/src/components/merbench/LeaderboardTable.astro b/src/components/merbench/LeaderboardTable.astro index 313ae0f..155498f 100644 --- a/src/components/merbench/LeaderboardTable.astro +++ b/src/components/merbench/LeaderboardTable.astro @@ -7,22 +7,47 @@ export interface Props { } const { leaderboard } = Astro.props; + +// Calculate cost range for progress bar normalization +const costs = leaderboard.map((entry) => entry.Avg_Cost || calculateCost(entry.Avg_Tokens)); +const minCost = Math.min(...costs); +const maxCost = Math.max(...costs); +const costRange = maxCost - minCost; ---

Model Leaderboard

- +
- - - - - - - + + + + + + + @@ -46,7 +71,17 @@ const { leaderboard } = Astro.props; {entry.Success_Rate.toFixed(1)}% - + @@ -163,6 +198,42 @@ const { leaderboard } = Astro.props; letter-spacing: 0.5px; } + /* Sortable header styles */ + .sortable { + cursor: pointer; + user-select: none; + position: relative; + transition: background-color 0.2s ease; + } + + .sortable:hover { + background-color: var(--bg-tertiary); + color: var(--text-primary); + } + + .sortable.active { + /* No special styling - only the arrow indicator shows active state */ + } + + .sort-indicator { + font-size: 0.8rem; + opacity: 0.5; + margin-left: 0.25rem; + } + + .sortable.active .sort-indicator { + opacity: 1; + color: var(--accent-primary); + } + + .sortable:not(.active) .sort-indicator { + opacity: 0; + } + + .sortable:hover:not(.active) .sort-indicator { + opacity: 0.3; + } + tbody tr:hover { background-color: var(--bg-primary); } @@ -265,6 +336,11 @@ const { leaderboard } = Astro.props; background-color: var(--progress-low); } + /* Cost progress bar - single muted color */ + .progress-fill--cost { + background-color: #9ca3af; /* Muted gray for all cost bars */ + } + @media (max-width: 768px) { .leaderboard-section { padding: 0.5rem; diff --git a/src/lib/merbench.ts b/src/lib/merbench.ts index 8355cb4..983ccd7 100644 --- a/src/lib/merbench.ts +++ b/src/lib/merbench.ts @@ -5,6 +5,7 @@ import type { FailureAnalysisData, ParetoData, ModelStats, + LeaderboardEntry, } from './merbench-types'; // Calculate cost per run (simplified pricing model) - DEPRECATED @@ -269,6 +270,60 @@ const calculateParetoFrontier = (data: Array<{ cost: number; Success_Rate: numbe return paretoPoints; }; +// Sorting utilities +let currentSortKey = 'Success_Rate'; +let currentSortDirection: 'asc' | 'desc' = 'desc'; + +export const sortLeaderboard = ( + data: LeaderboardEntry[], + sortKey: string, + direction: 'asc' | 'desc' +): LeaderboardEntry[] => { + const sorted = [...data].sort((a, b) => { + let aVal: any; + let bVal: any; + + // Handle special cases for cost calculation + if (sortKey === 'Avg_Cost') { + aVal = a.Avg_Cost || calculateCost(a.Avg_Tokens); + bVal = b.Avg_Cost || calculateCost(b.Avg_Tokens); + } else { + aVal = a[sortKey as keyof LeaderboardEntry]; + bVal = b[sortKey as keyof LeaderboardEntry]; + } + + // Handle null/undefined values + if (aVal == null && bVal == null) return 0; + if (aVal == null) return direction === 'asc' ? -1 : 1; + if (bVal == null) return direction === 'asc' ? 1 : -1; + + // Numeric comparison + if (typeof aVal === 'number' && typeof bVal === 'number') { + return direction === 'asc' ? aVal - bVal : bVal - aVal; + } + + // String comparison + const aStr = String(aVal).toLowerCase(); + const bStr = String(bVal).toLowerCase(); + + if (aStr < bStr) return direction === 'asc' ? -1 : 1; + if (aStr > bStr) return direction === 'asc' ? 1 : -1; + return 0; + }); + + return sorted; +}; + +export const setSortState = (sortKey: string, direction: 'asc' | 'desc'): void => { + currentSortKey = sortKey; + currentSortDirection = direction; +}; + +export const getSortState = () => ({ + key: currentSortKey, + direction: currentSortDirection, +}); + // DOM manipulation utilities export const updateSummaryStats = (filteredData: FilteredData): void => { const totalRuns = filteredData.rawData.length; @@ -303,9 +358,20 @@ export const updateLeaderboard = (filteredData: FilteredData): void => { const tbody = document.querySelector('.leaderboard-table tbody'); if (!tbody) return; + // Calculate cost range for progress bar normalization + const costs = filteredData.leaderboard.map( + (entry) => entry.Avg_Cost || calculateCost(entry.Avg_Tokens) + ); + const minCost = Math.min(...costs); + const maxCost = Math.max(...costs); + const costRange = maxCost - minCost; + tbody.innerHTML = filteredData.leaderboard - .map( - (entry, index) => ` + .map((entry, index) => { + const currentCost = entry.Avg_Cost || calculateCost(entry.Avg_Tokens); + const costWidth = costRange > 0 ? (currentCost / maxCost) * 100 : 0; + + return ` @@ -317,14 +383,19 @@ export const updateLeaderboard = (filteredData: FilteredData): void => { ${entry.Success_Rate.toFixed(1)}% - + - ` - ) + `; + }) .join(''); }; diff --git a/src/scripts/merbench-filters.ts b/src/scripts/merbench-filters.ts index 9d9bdaf..e20cd98 100644 --- a/src/scripts/merbench-filters.ts +++ b/src/scripts/merbench-filters.ts @@ -1,9 +1,5 @@ -import { - getFilteredData, - updateSummaryStats, - updateLeaderboard, - showEmptyState, -} from '../lib/merbench'; +import { getFilteredData, updateSummaryStats, showEmptyState } from '../lib/merbench'; +import { updateLeaderboardData } from './merbench-sorting'; import type { RawData, TestGroupData, MerbenchData } from '../lib/merbench-types'; import { MerbenchCharts } from './merbench-charts'; @@ -317,7 +313,8 @@ export class MerbenchFilters { private updateUI(filteredData: any): void { updateSummaryStats(filteredData); - updateLeaderboard(filteredData); + // Use sorting-aware leaderboard update instead of basic update + updateLeaderboardData(filteredData.leaderboard); } private showNoDataMessage(): void { diff --git a/src/scripts/merbench-init-csp.ts b/src/scripts/merbench-init-csp.ts index 551b0d6..66abcaa 100644 --- a/src/scripts/merbench-init-csp.ts +++ b/src/scripts/merbench-init-csp.ts @@ -1,5 +1,6 @@ import { MerbenchCharts } from './merbench-charts'; import { MerbenchFilters } from './merbench-filters'; +import { initializeLeaderboardSorting } from './merbench-sorting'; import type { MerbenchData, RawData } from '../lib/merbench-types'; declare global { @@ -82,6 +83,9 @@ async function initializeMerbench() { const filters = new MerbenchFilters(data, charts); filters.initialize(); + // Initialize leaderboard sorting + initializeLeaderboardSorting(originalData.leaderboard); + // Initialize charts with all data try { await charts.waitForPlotly(); diff --git a/src/scripts/merbench-sorting.ts b/src/scripts/merbench-sorting.ts new file mode 100644 index 0000000..a1297c1 --- /dev/null +++ b/src/scripts/merbench-sorting.ts @@ -0,0 +1,127 @@ +import { sortLeaderboard, setSortState, getSortState, calculateCost } from '../lib/merbench'; +import type { LeaderboardEntry } from '../lib/merbench-types'; + +// Global leaderboard data storage +let currentLeaderboardData: LeaderboardEntry[] = []; + +// Initialize sorting functionality +export const initializeLeaderboardSorting = (leaderboardData: LeaderboardEntry[]): void => { + currentLeaderboardData = [...leaderboardData]; + setupSortHandlers(); +}; + +// Update leaderboard data (called when filters change) +export const updateLeaderboardData = (newData: LeaderboardEntry[]): void => { + currentLeaderboardData = [...newData]; + + // Apply current sort to the new data + const sortState = getSortState(); + const sortedData = sortLeaderboard(currentLeaderboardData, sortState.key, sortState.direction); + renderLeaderboard(sortedData); +}; + +// Setup click handlers for sortable headers +const setupSortHandlers = (): void => { + const sortableHeaders = document.querySelectorAll('.sortable'); + + sortableHeaders.forEach((header) => { + header.addEventListener('click', (event) => { + const target = event.currentTarget as HTMLElement; + const sortKey = target.dataset.sortKey; + const sortType = target.dataset.sortType; + + if (!sortKey) return; + + // Determine new sort direction + const currentDirection = target.dataset.sortDirection; + let newDirection: 'asc' | 'desc'; + + if (target.classList.contains('active')) { + // Toggle direction if clicking the same column + newDirection = currentDirection === 'desc' ? 'asc' : 'desc'; + } else { + // Default direction for new column + newDirection = sortType === 'string' ? 'asc' : 'desc'; + } + + // Update sort state + setSortState(sortKey, newDirection); + + // Update UI + updateSortIndicators(sortKey, newDirection); + + // Sort and render data + const sortedData = sortLeaderboard(currentLeaderboardData, sortKey, newDirection); + renderLeaderboard(sortedData); + }); + }); +}; + +// Update visual sort indicators +const updateSortIndicators = (activeSortKey: string, direction: 'asc' | 'desc'): void => { + const sortableHeaders = document.querySelectorAll('.sortable'); + + sortableHeaders.forEach((header) => { + const element = header as HTMLElement; + const sortKey = element.dataset.sortKey; + const indicator = element.querySelector('.sort-indicator'); + + if (!indicator) return; + + if (sortKey === activeSortKey) { + // Active column + element.classList.add('active'); + element.dataset.sortDirection = direction; + indicator.textContent = direction === 'desc' ? '↓' : '↑'; + } else { + // Inactive columns + element.classList.remove('active'); + element.removeAttribute('data-sort-direction'); + indicator.textContent = ''; + } + }); +}; + +// Render the leaderboard table +const renderLeaderboard = (data: LeaderboardEntry[]): void => { + const tbody = document.querySelector('#leaderboard-table tbody'); + if (!tbody) return; + + // Calculate cost range for progress bar normalization + const costs = data.map((entry) => entry.Avg_Cost || calculateCost(entry.Avg_Tokens)); + const minCost = Math.min(...costs); + const maxCost = Math.max(...costs); + const costRange = maxCost - minCost; + + tbody.innerHTML = data + .map((entry, index) => { + const currentCost = entry.Avg_Cost || calculateCost(entry.Avg_Tokens); + const costWidth = costRange > 0 ? (currentCost / maxCost) * 100 : 0; + + return ` + + + + + + + + + + + `; + }) + .join(''); +};
RankModelSuccess RateAvg Cost/RunAvg DurationAvg TokensRunsProvider + Model + + Success Rate + + Avg Cost/Run + + Avg Duration + + Avg Tokens + + Runs + + Provider +
${calculateCost(entry.Avg_Tokens).toFixed(4)} +
+
0 ? ((entry.Avg_Cost || calculateCost(entry.Avg_Tokens)) / maxCost) * 100 : 0}%`} + /> + + ${(entry.Avg_Cost || calculateCost(entry.Avg_Tokens)).toFixed(4)} + +
+
{entry.Avg_Duration.toFixed(2)}s {entry.Avg_Tokens.toLocaleString()} {entry.Runs}
${index + 1} ${entry.Model} $${(entry.Avg_Cost || calculateCost(entry.Avg_Tokens)).toFixed(4)} +
+
+ $${currentCost.toFixed(4)} +
+
${entry.Avg_Duration.toFixed(2)}s ${entry.Avg_Tokens.toLocaleString()} ${entry.Runs} ${entry.Provider}
${index + 1}${entry.Model} +
+
+ ${entry.Success_Rate.toFixed(1)}% +
+
+
+
+ $${currentCost.toFixed(4)} +
+
${entry.Avg_Duration.toFixed(2)}s${entry.Avg_Tokens.toLocaleString()}${entry.Runs}${entry.Provider}