In [1]:
from utils.helpers import get_league_urls,fetch_and_map_league, fetch_all_league_seasons, get_athlete_info, fetch_events_for_league_season, process_event, get_odds_data, get_stat_data

from db import add_to_db, League, Athlete, Card, Fight, Odds, StatisticsForFight
import httpx
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
from typing import List, Dict, Tuple
import sqlite3




In [2]:
# Result of the get_league_urls helper (imported from utils.helpers); each entry
# is later passed to fetch_and_map_league, one call per entry.
league_urls = get_league_urls()

### Grab leagues and seasons per league

In [3]:
# Run fetch_and_map_league once per league URL on a 20-thread pool. Each
# successful call yields a (league_mapping, season_url) pair; the two halves are
# accumulated in parallel lists in completion order. Failures are printed and
# skipped so a single bad league does not stop the run.
mappings, all_season_urls = [], []
with ThreadPoolExecutor(max_workers=20) as pool:
    pending = {}
    for league_url in league_urls:
        pending[pool.submit(fetch_and_map_league, league_url)] = league_url

    for finished in as_completed(pending):
        source_url = pending[finished]
        try:
            league_mapping, season_url = finished.result()
            mappings.append(league_mapping)
            all_season_urls.append(season_url)
        except Exception as err:
            print(f"⚠️ Failed to fetch/map {source_url}: {err}")

In [4]:
# Hand the collected league mappings to db.add_to_db together with League
# (both imported from db).
add_to_db(mappings, League)

Successfully added 47 out of 47 records to leagues


### Fetch athlete info

In [5]:
# ─── 1) Configuration ──────────────────────────────────────────────────────────
# Athlete listing endpoint; every request in this cell and in fetch_page goes here.
BASE_URL = "https://sports.core.api.espn.com/v2/sports/mma/athletes"
# Query parameters sent with every listing request.
# NOTE(review): "limit" is presumably the page size — confirm against the API.
COMMON_PARAMS = {"lang": "en", "region": "us", "limit": 1000}
MAX_WORKERS = 10  # adjust up/down based on your bandwidth and the server’s rate limits

# ─── 2) Fetch page count ───────────────────────────────────────────────────────
# One request without a "page" parameter, made only to read "pageCount" from the body.
resp = httpx.get(BASE_URL, params=COMMON_PARAMS)
resp.raise_for_status()  # stop here on an HTTP error status
# Falls back to 0 when the response has no "pageCount" key, so no pages are fetched.
page_count = resp.json().get("pageCount", 0)
print(f"→ Detected {page_count} pages")

# ─── 3) Define a page-fetch helper ─────────────────────────────────────────────
def fetch_page(pg: int) -> List[str]:
    """
    Fetch one page of the athlete listing and return the ``$ref`` URLs on it.

    Args:
        pg: Page number, sent as the ``page`` query parameter alongside
            ``COMMON_PARAMS``.

    Returns:
        The ``$ref`` value of every item on the page that has one. Items with
        a missing or empty ``$ref`` are skipped rather than contributing
        ``None`` to the result.

    Raises:
        httpx.HTTPStatusError: If the response has an error status
            (raised by ``raise_for_status``).
    """
    params = {**COMMON_PARAMS, "page": pg}
    r = httpx.get(BASE_URL, params=params)
    r.raise_for_status()
    items = r.json().get("items", [])
    # item.get("$ref") yields None for items without the key; drop those so
    # callers never receive None where a URL is expected.
    return [ref for ref in (item.get("$ref") for item in items) if ref]

# ─── 4) Dispatch threads & collect results ─────────────────────────────────────
# Request pages 1..page_count concurrently and merge their URL lists in
# completion order. A failing page is reported and skipped.
all_athlete_urls = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
    page_by_future = {}
    for page_number in range(1, page_count + 1):
        page_by_future[pool.submit(fetch_page, page_number)] = page_number

    for done in as_completed(page_by_future):
        page_number = page_by_future[done]
        try:
            all_athlete_urls.extend(done.result())
        except Exception as err:
            print(f"❌ Page {page_number} failed: {err!r}")

print(f"✔ Retrieved {len(all_athlete_urls)} athlete URLs")

→ Detected 37 pages
✔ Retrieved 36049 athlete URLs


In [6]:
# Sanity check: how many athlete URLs the paginated fetch collected.
len(all_athlete_urls)

36049

In [10]:
# Fetch athlete records on a 150-thread pool and store each one as soon as its
# request completes. The except clause covers both the fetch and the insert.
# NOTE(review): the [30000:] slice skips the first 30,000 URLs — it looks like a
# resume offset from an interrupted run; confirm before a fresh full run.
with ThreadPoolExecutor(max_workers=150) as pool:
    url_by_future = {}
    for athlete_url in all_athlete_urls[30000:]:
        url_by_future[pool.submit(get_athlete_info, athlete_url)] = athlete_url

    for done in as_completed(url_by_future):
        athlete_url = url_by_future[done]
        try:
            add_to_db(done.result(), Athlete)
        except Exception as err:
            print(f"Error fetching athlete data for {athlete_url}: {err}")
    

### Find Card Info

In [4]:
# Pass the season URLs collected during the league fetch to the
# fetch_all_league_seasons helper; the result is consumed by get_all_event_urls,
# which iterates it as a list of {league: [seasons]} mappings.
league_seasons = fetch_all_league_seasons(all_season_urls)

In [5]:
def get_all_event_urls(
    league_seasons: List[Dict[str, List[str]]],
    max_workers: int = 20
) -> List[str]:
    """
    Fetch the event URLs of every (league, season) pair concurrently.

    Args:
        league_seasons: List of mappings whose keys are leagues and whose
            values are the lists of seasons for that league.
        max_workers: Upper bound on worker threads; the pool never uses more
            threads than there are (league, season) pairs.

    Returns:
        One flat list containing the URLs returned by every successful
        ``fetch_events_for_league_season`` call, in completion order.
        An empty list when there are no pairs to fetch.

    A failure for an individual pair is printed and skipped so that one bad
    season does not abort the whole run.
    """
    # 1) Flatten to a list of (league, season) tuples
    tasks: List[Tuple[str, str]] = [
        (league, season)
        for mapping in league_seasons
        for league, seasons in mapping.items()
        for season in seasons
    ]

    # With nothing to fetch, min(max_workers, len(tasks)) would be 0 and
    # ThreadPoolExecutor rejects max_workers <= 0 with ValueError.
    if not tasks:
        return []

    all_event_urls: List[str] = []
    # 2) Thread pool, capped at one thread per task
    with ThreadPoolExecutor(
        max_workers=min(max_workers, len(tasks))
    ) as executor:
        future_to_task = {
            executor.submit(fetch_events_for_league_season, league, season): (league, season)
            for league, season in tasks
        }
        # 3) Collect results as they come in
        for future in as_completed(future_to_task):
            league, season = future_to_task[future]
            try:
                all_event_urls.extend(future.result())
            except Exception as exc:
                # Log but don't kill the whole run
                print(f"⚠️ Error fetching {league} season {season}: {exc!r}")

    return all_event_urls


# Requires league_seasons from the fetch_all_league_seasons cell.
# Builds the flat list of event URLs using at most 25 worker threads.
event_urls = get_all_event_urls(league_seasons, max_workers=25)


Fetched 913 events for other 2022
Fetched 982 events for other 2008
Fetched 926 events for other 2007
Fetched 1305 events for other 2018
Fetched 1268 events for other 2019
Fetched 1589 events for other 2016Fetched 1479 events for other 2017

Fetched 1615 events for other 2012
Fetched 1547 events for other 2014
Fetched 1590 events for other 2015
Fetched 1467 events for other 2011
Fetched 1228 events for other 2009
Fetched 1660 events for other 2013
Fetched 1370 events for other 2010


In [7]:
# Sanity check: size of the flat event-URL list returned by get_all_event_urls.
len(event_urls)

28145

In [7]:
# Keep only the event URLs whose text contains the substring "ufc".
ufc_urls = list(filter(lambda event_url: 'ufc' in event_url, event_urls))

In [9]:
# These two lists stay empty: every card and fight is written to the database
# directly instead of being accumulated in memory.
card_mappings, event_mappings = [], []
max_workers = 150

# Run process_event for each event URL and persist its results as they arrive.
# NOTE(review): the [5000:] slice skips the first 5,000 URLs — it looks like a
# resume offset from an earlier partial run; confirm before a fresh full run.
with ThreadPoolExecutor(max_workers=max_workers) as pool:
    url_by_future = {
        pool.submit(process_event, event_url): event_url
        for event_url in event_urls[5000:]
    }
    for done in as_completed(url_by_future):
        event_url = url_by_future[done]
        try:
            card, fights = done.result()
            if card:
                add_to_db(card, Card)
            for fight in fights:
                # A failing insert is reported per fight so the remaining
                # fights of the same event are still attempted.
                try:
                    add_to_db(fight, Fight)
                except Exception as db_err:
                    print(f"⚠️ Mapping error for event_id={fight.get('event_id')} "
                          f"league={fight.get('league')}: {db_err}")
        except Exception as err:
            print(f"⚠️ process_event failed for URL {event_url}: {err}")

⚠️ process_event failed for URL http://sports.core.api.espn.com/v2/sports/mma/leagues/other/events/400252832?lang=en&region=us: _ssl.c:983: The handshake operation timed out
⚠️ process_event failed for URL http://sports.core.api.espn.com/v2/sports/mma/leagues/other/events/400452299?lang=en&region=us: _ssl.c:983: The handshake operation timed out
⚠️ process_event failed for URL http://sports.core.api.espn.com/v2/sports/mma/leagues/other/events/400252904?lang=en&region=us: _ssl.c:983: The handshake operation timed out
⚠️ process_event failed for URL http://sports.core.api.espn.com/v2/sports/mma/leagues/other/events/400540237?lang=en&region=us: _ssl.c:983: The handshake operation timed out
⚠️ process_event failed for URL http://sports.core.api.espn.com/v2/sports/mma/leagues/other/events/400420537?lang=en&region=us: _ssl.c:983: The handshake operation timed out
⚠️ process_event failed for URL http://sports.core.api.espn.com/v2/sports/mma/leagues/other/events/400252834?lang=en&region=us: _s

In [11]:
# Load the distinct, non-null odds URLs stored on the fights table.
# sqlite3 and pandas are imported in the notebook's first cell.
conn = sqlite3.connect('data/mma.db')
try:
    odds_urls = (
        pd.read_sql_query("SELECT odds_url FROM fights WHERE odds_url IS NOT null", conn)['odds_url']
        .unique()
        .tolist()
    )
finally:
    # Release the database handle even when the query raises
    # (for example when the fights table does not exist yet).
    conn.close()

In [12]:

# Sample odds endpoints kept for ad-hoc, single-URL checks of get_odds_data:
# test_url = "http://sports.core.api.espn.com/v2/sports/mma/leagues/ufc/events/600013389/competitions/401389440/odds?lang=en&region=us"
# test_url = "http://sports.core.api.espn.com/v2/sports/mma/leagues/ufc/events/600053676/competitions/401768764/odds?lang=en&region=us"
# odds_data = get_odds_data(test_url)

# Number of odds URLs queued for fetching.
len(odds_urls)

3551

In [13]:

# Fetch the odds data for every odds URL on a 25-thread pool and store each
# result as soon as its request completes.
with ThreadPoolExecutor(max_workers=25) as executor:
    futures = {executor.submit(get_odds_data, url): url for url in odds_urls}
    for future in as_completed(futures):
        url = futures[future]
        try:
            odds_data = future.result()
            add_to_db(odds_data, Odds)
        except Exception as e:
            # This handler covers both the fetch and the insert. The message
            # previously said "athlete data", which mislabelled odds failures;
            # {e!r} keeps the exception type visible when its text is empty.
            print(f"Error fetching or storing odds data for {url}: {e!r}")
    

Successfully added 1 out of 1 records to odds
Successfully added 1 out of 1 records to odds
Successfully added 4 out of 4 records to odds
Successfully added 2 out of 2 records to odds
Successfully added 2 out of 2 records to odds
Successfully added 4 out of 4 records to odds
Successfully added 2 out of 2 records to odds
Successfully added 1 out of 1 records to odds
Successfully added 2 out of 2 records to odds
Successfully added 1 out of 1 records to odds
Successfully added 1 out of 1 records to odds
Successfully added 1 out of 1 records to odds
Successfully added 1 out of 1 records to odds
Successfully added 1 out of 1 records to odds
Successfully added 2 out of 2 records to odds
Successfully added 1 out of 1 records to odds
Successfully added 2 out of 2 records to odds
Successfully added 1 out of 1 records to odds
Successfully added 2 out of 2 records to odds
Successfully added 4 out of 4 records to odds
Successfully added 2 out of 2 records to odds
Successfully added 2 out of 2 reco

In [14]:

# Load every fights row that has a statistics value for at least one fighter.
conn = sqlite3.connect('data/mma.db')
try:
    statistics = pd.read_sql_query("SELECT * FROM fights WHERE fighter_1_statistics IS NOT null OR fighter_2_statistics IS NOT null", conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

In [15]:
# Collect the statistics URLs from both fighter columns into one list.
# Stacking the columns before unique() removes duplicates across the two
# columns as well as within each, and dropna() discards missing values
# (None or NaN). Order is first appearance: fighter 1 values, then any new
# fighter 2 values.
statistics_urls = (
    pd.concat([statistics['fighter_1_statistics'], statistics['fighter_2_statistics']])
    .dropna()
    .unique()
    .tolist()
)

In [16]:
# Number of statistics URLs queued for fetching.
len(statistics_urls)

68127

In [18]:


# Fetch the per-fighter statistics for each URL on a 150-thread pool and store
# each result as soon as its request completes.
# NOTE(review): the [5000:] slice skips the first 5,000 URLs — it looks like a
# resume offset from an earlier partial run; confirm before a fresh full run.
with ThreadPoolExecutor(max_workers=150) as executor:
    futures = {executor.submit(get_stat_data, url): url for url in statistics_urls[5000:]}
    for future in as_completed(futures):
        url = futures[future]
        try:
            stat_data = future.result()
            add_to_db(stat_data, StatisticsForFight)
        except Exception as e:
            # This handler covers both the fetch and the insert. The message
            # previously said "athlete data", which mislabelled statistics
            # failures; {e!r} keeps the exception type visible when its text
            # is empty.
            print(f"Error fetching or storing statistics for {url}: {e!r}")


Skipping duplicate record in statistics_for_fights: (sqlite3.IntegrityError) UNIQUE constraint failed: statistics_for_fights.event_competition_athlete_id
[SQL: INSERT INTO statistics_for_fights (event_competition_athlete_id, event_id, competition_id, athlete_id, "knockDowns", "totalStrikesAttempted", "totalStrikesLanded", "sigStrikesAttempted", "sigStrikesLanded", "sigDistanceHeadStrikesAttempted", "sigDistanceHeadStrikesLanded", "sigDistanceBodyStrikesAttempted", "sigDistanceBodyStrikesLanded", "sigDistanceLegStrikesAttempted", "sigDistanceLegStrikesLanded", "sigClinchBodyStrikesAttempted", "sigClinchBodyStrikesLanded", "sigClinchHeadStrikesAttempted", "sigClinchHeadStrikesLanded", "sigClinchLegStrikesAttempted", "sigClinchLegStrikesLanded", "sigGroundHeadStrikesAttempted", "sigGroundHeadStrikesLanded", "sigGroundBodyStrikesAttempted", "sigGroundBodyStrikesLanded", "sigGroundLegStrikesAttempted", "sigGroundLegStrikesLanded", "takedownsAttempted", "takedownsLanded", "takedownsSlams", "

In [6]:
# Re-check of the statistics URL count currently held in memory.
len(statistics_urls)

22342

In [2]:
# WARNING: destructive maintenance snippet, intentionally left commented out.
# If uncommented, it issues DROP TABLE IF EXISTS for the cards table in data/mma.db.
# import sqlite3

# conn = sqlite3.connect('data/mma.db')

# cur = conn.cursor()

# cur.execute("DROP TABLE IF EXISTS cards")
# conn.close()