In [1]:
import datetime
import json
import sys, subprocess
import time
from itertools import islice
from pathlib import Path
from pprint import pprint

import cfbd
import networkx as nx
import requests
from cfbd.rest import ApiException

from config import API_KEY

In [2]:
# API access: every cfbd client in this notebook is built from this
# configuration, which carries the key imported from the local config module.
configuration = cfbd.Configuration(access_token=API_KEY)

In [None]:
# getting team data to cross ref
years = [2021, 2022, 2023, 2024, 2025]

# Maps season year -> {school name -> node attribute dict}.
# This is the only dictionary you need to create here.
node_attributes_by_year = {}

# One client is shared by every request and released when the block exits,
# instead of building a new, never-closed client on each loop iteration.
with cfbd.ApiClient(configuration) as api_client:
    teams_api = cfbd.TeamsApi(api_client)

    for year in years:
        print(f"Fetching all team data for attributes for {year}...")
        all_teams = teams_api.get_teams(year=year)
        print(f"Found {len(all_teams)} teams.")

        current_year_attrs = {}
        for team in all_teams:
            location = team.location
            current_year_attrs[team.school] = {
                # Use 'Unknown' as a default string if data is None
                'classification': str(team.classification) if team.classification else 'Unknown',
                'conference': str(team.conference) if team.conference else 'Unknown',
                # Use 0.0 as a default float if the location or coordinate is missing
                'latitude': float(location.latitude) if location and location.latitude else 0.0,
                'longitude': float(location.longitude) if location and location.longitude else 0.0
            }
        node_attributes_by_year[year] = current_year_attrs

print("Node attribute maps created successfully.")

In [None]:
# getting transfer portal data

# Every request is issued inside the `with` block so the client is still
# open while it is being used; previously the calls ran after the context
# manager had already exited.
with cfbd.ApiClient(configuration) as api_client:
    api_instance = cfbd.PlayersApi(api_client)

    api_response_2025 = api_instance.get_transfer_portal(year=2025)
    api_response_2024 = api_instance.get_transfer_portal(year=2024)
    api_response_2023 = api_instance.get_transfer_portal(year=2023)
    api_response_2022 = api_instance.get_transfer_portal(year=2022)
    api_response_2021 = api_instance.get_transfer_portal(year=2021)


In [None]:
# convert to graphml

# Transfer-portal responses fetched earlier, keyed by portal year.
data_by_year = {
    2025: api_response_2025,
    2024: api_response_2024,
    2023: api_response_2023,
    2022: api_response_2022,
    2021: api_response_2021
}

node_attr_map = node_attributes_by_year

# Attributes given to any school that is missing from that year's team data.
default_attrs = {
    'classification': 'Unknown',
    'conference': 'Unknown',
    'latitude': 0.0,
    'longitude': 0.0
}

# Build one directed graph (and write one GraphML file) per year.
# Nodes: school names (origin and destination)
# Edges: one origin -> destination edge per pair of schools; per-player details
#        are accumulated in parallel lists and `weight` counts the players.
for year, data in data_by_year.items():
    G = nx.DiGraph()
    print(f"Processing data for {year}...")
    for t in data:
        origin = t.origin.strip() if getattr(t, 'origin', None) else None
        dest = t.destination.strip() if getattr(t, 'destination', None) else None

        # add nodes if present, so a school with no matching counterpart
        # still appears as an isolated node
        if origin:
            G.add_node(origin)
        if dest:
            G.add_node(dest)

        # only create edges when both origin and destination exist
        if not (origin and dest):
            continue

        player = f"{getattr(t, 'first_name', '')} {getattr(t, 'last_name', '')}".strip()
        date = getattr(t, 'transfer_date', None)
        eligibility = getattr(t, 'eligibility', None)

        # First transfer between this pair: create the edge with empty
        # accumulators so both cases share the append logic below.
        if not G.has_edge(origin, dest):
            G.add_edge(origin, dest, players=[], positions=[], dates=[], ratings=[], stars=[], eligibility=[], weight=0)

        edge = G[origin][dest]
        edge['players'].append(player)
        edge['positions'].append(getattr(t, 'position', None))
        edge['dates'].append(date.isoformat() if date is not None else None)
        edge['ratings'].append(getattr(t, 'rating', None))
        edge['stars'].append(getattr(t, 'stars', None))
        # Keep a missing eligibility as None so it is serialized as an empty
        # string like every other missing field, not as the text "None".
        edge['eligibility'].append(str(eligibility) if eligibility is not None else None)
        edge['weight'] += 1

    # Serialize any list-valued (or None) edge attributes to strings for GraphML compatibility
    for u, v, attrs in G.edges(data=True):
        for k in list(attrs.keys()):
            val = attrs[k]
            if isinstance(val, list):
                # join list elements into a single string; convert None -> empty string
                attrs[k] = ' | '.join(['' if x is None else str(x) for x in val])
            elif val is None:
                attrs[k] = ''

    # Get the map of attributes for the specific year
    current_year_node_attrs = node_attr_map.get(year, {})

    # Create the final mapping, applying defaults to any node not in the map
    final_attrs_for_graph = {
        node: current_year_node_attrs.get(node, default_attrs) for node in G.nodes()
    }

    # Set all attributes at once.
    # NetworkX unpacks the inner dictionaries automatically.
    nx.set_node_attributes(G, final_attrs_for_graph)

    filename = f"transfer_portal_{year}.graphml"

    # write graphml
    nx.write_graphml(G, filename)
    print(f"Wrote {len(G.nodes())} nodes and {len(G.edges())} edges to {filename}")

In [None]:
# ============================================
# PART 1: RECRUITING GRAPHS WITH SCHOOL ATTRIBUTES
# ============================================

print("\n" + "="*70)
print("PART 1: COLLECTING RECRUITING DATA")
print("="*70 + "\n")

# First, collect school attributes for all years
print("Collecting school attributes for all years...")

# Maps year -> {school name -> attribute dict}. A year whose request fails
# is stored as an empty dict so later lookups fall back to defaults.
school_attrs_by_year = {}

# One client is shared by every request and released when the block exits,
# instead of building a new, never-closed client on each loop iteration.
with cfbd.ApiClient(configuration) as api_client:
    teams_api = cfbd.TeamsApi(api_client)

    for year in range(2000, 2027):
        print(f"  Fetching school data for {year}...")
        # Only the API call is guarded, so unrelated errors are not hidden.
        try:
            teams = teams_api.get_teams(year=year)
        except ApiException as e:
            print(f"    Error fetching teams for {year}: {e}")
            school_attrs_by_year[year] = {}
            continue

        year_attrs = {}
        for team in teams:
            location = team.location
            year_attrs[team.school] = {
                'classification': str(team.classification) if team.classification else 'Unknown',
                'conference': str(team.conference) if team.conference else 'Unknown',
                'latitude': float(location.latitude) if (location and location.latitude) else 0.0,
                'longitude': float(location.longitude) if (location and location.longitude) else 0.0
            }
        school_attrs_by_year[year] = year_attrs
        time.sleep(1)  # Rate limit protection

# Now build recruiting graphs
print("\nBuilding recruiting graphs...")

# Output directory is defined (and created) here so this cell works on a
# fresh kernel; it was previously only defined by a later cell.
RECRUITING_DIR = Path("./data") / "recruiting"
RECRUITING_DIR.mkdir(parents=True, exist_ok=True)

# One client is shared by every request and released when the block exits,
# instead of building a new, never-closed client on each loop iteration.
with cfbd.ApiClient(configuration) as api_client:
    recruit_api = cfbd.RecruitingApi(api_client)

    for year in range(2000, 2027):
        print(f"\n  Processing recruiting class {year}...")

        # Only the API call is guarded, so unrelated errors are not hidden.
        try:
            recruits = recruit_api.get_recruits(year=year)
        except ApiException as e:
            print(f"    Error processing {year}: {e}")
            continue

        # Hometown <-> school graph; a MultiGraph keeps one edge per recruit
        # even when several recruits share the same hometown and school.
        G = nx.MultiGraph()

        # Track node attributes
        hometown_node_attrs = {}
        school_node_attrs = {}

        for recruit in recruits:
            school = getattr(recruit, 'committed_to', None)
            city = getattr(recruit, 'city', None)
            state = getattr(recruit, 'state_province', None)
            hometown_info = getattr(recruit, 'hometown_info', None)

            # Build hometown key
            hometown_key = None
            if city and state:
                hometown_key = f"{city}, {state}"

            # Skip recruits that cannot be placed on both sides of the graph
            if not school or not hometown_key:
                continue

            # Add school node
            if school not in G:
                G.add_node(school, bipartite=0, type='School')
                # Get school attributes from our pre-fetched data
                school_node_attrs[school] = school_attrs_by_year.get(year, {}).get(
                    school,
                    {'classification': 'Unknown', 'conference': 'Unknown', 'latitude': 0.0, 'longitude': 0.0}
                )

            # Add hometown node (attributes come from the first recruit seen for it)
            if hometown_key not in G:
                G.add_node(hometown_key, bipartite=1, type='Hometown')
                hometown_node_attrs[hometown_key] = {
                    'latitude': float(hometown_info.latitude) if (hometown_info and hometown_info.latitude) else 0.0,
                    'longitude': float(hometown_info.longitude) if (hometown_info and hometown_info.longitude) else 0.0,
                    'fips': int(hometown_info.fips_code) if (hometown_info and hometown_info.fips_code) else 0,
                    'city': city,
                    'state': state
                }

            # Local names avoid `id` / `type` so the builtins are not shadowed
            # for the rest of the kernel session.
            recruit_id = getattr(recruit, 'id', 'Unknown')
            athlete_id = getattr(recruit, 'athlete_id', 'Unknown')
            name = getattr(recruit, 'name', 'Unknown')
            position = getattr(recruit, 'position', 'N/A')
            rating = getattr(recruit, 'rating', 0.0)
            stars = getattr(recruit, 'stars', 0)
            raw_recruit_type = getattr(recruit, 'recruit_type', None)
            recruit_type = str(raw_recruit_type) if raw_recruit_type else 'N/A'

            G.add_edge(
                hometown_key,
                school,
                id=str(recruit_id),
                athlete_id=str(athlete_id),
                player=name,
                position=position,
                rating=rating,
                stars=stars,
                recruit_type=recruit_type,
                weight=1
            )

        # Set node attributes
        nx.set_node_attributes(G, school_node_attrs)
        nx.set_node_attributes(G, hometown_node_attrs)

        # Serialize attributes for GraphML
        # We need keys=True to handle MultiGraph edges correctly
        for u, v, key, attrs in G.edges(data=True, keys=True):
            for k, val in attrs.items():
                # We no longer have lists, so we just check for None
                if val is None:
                    attrs[k] = ''

        # Save graph
        filename = RECRUITING_DIR / f"recruiting_network_{year}.graphml"
        nx.write_graphml(G, filename)
        print(f"    ✓ Saved {len(G.nodes())} nodes, {len(G.edges())} edges")

        time.sleep(1)  # Rate limit protection

print("\n✓ Recruiting graphs complete!")

In [11]:
# redoing recruit data

import cfbd
import pandas as pd
import networkx as nx
import datetime
from pathlib import Path
import time
from cfbd.rest import ApiException
from config import API_KEY

# API configuration shared by every client created below
configuration = cfbd.Configuration(access_token=API_KEY)

# Output locations: graphs go under data/recruiting, CSVs under data/supplemental
OUTPUT_DIR = Path("./data")
RECRUITING_DIR = OUTPUT_DIR / "recruiting"
PERFORMANCE_DIR = OUTPUT_DIR / "supplemental"

# Create both directories (and any missing parents); existing ones are left alone
RECRUITING_DIR.mkdir(parents=True, exist_ok=True)
PERFORMANCE_DIR.mkdir(parents=True, exist_ok=True)

# Pipeline banner
print("=" * 70, "CFB DATA COLLECTION PIPELINE", "=" * 70, sep="\n")

# ============================================
# PART 2: PLAYER PERFORMANCE DATA
# ============================================

print("\n" + "="*70)
print("PART 2: COLLECTING PLAYER PERFORMANCE DATA")
print("="*70 + "\n")

# One flat record per player-season, accumulated across all years.
all_performance_data = []

# One client is shared by every request and released when the block exits,
# instead of building a new, never-closed client on each loop iteration.
with cfbd.ApiClient(configuration) as api_client:
    performance_api = cfbd.MetricsApi(api_client)

    for year in range(2013, 2025):
        print(f"  Fetching performance data for {year}...")

        # Only the API call is guarded, so unrelated errors are not hidden.
        try:
            performance_data = performance_api.get_predicted_points_added_by_player_season(year=year)
        except ApiException as e:
            print(f"    Error fetching performance for {year}: {e}")
            continue

        for player in performance_data:
            # The PPA values are nested objects; only their `all` field is kept.
            avg_obj = getattr(player, 'average_ppa', None)
            tot_obj = getattr(player, 'total_ppa', None)
            all_performance_data.append({
                'player_id': getattr(player, 'id', None),
                'season': getattr(player, 'season', year),
                'name': getattr(player, 'name', None),
                'team': getattr(player, 'team', None),
                'position': getattr(player, 'position', None),
                'conference': getattr(player, 'conference', None),
                'averagePPA_all': getattr(avg_obj, 'all', None) if avg_obj else None,
                'totalPPA_all': getattr(tot_obj, 'all', None) if tot_obj else None,
            })

        print(f"    ✓ Collected {len(performance_data)} player records")
        time.sleep(1)  # Rate limit protection

# Save to CSV
if all_performance_data:
    df_performance = pd.DataFrame(all_performance_data)
    output_file = PERFORMANCE_DIR / "player_performance.csv"
    df_performance.to_csv(output_file, index=False)
    print(f"\n✓ Saved {len(df_performance)} performance records to {output_file}")
else:
    print("\n⚠ No performance data collected")

# ============================================
# PART 3: TEAM ROSTERS
# ============================================

print("\n" + "="*70)
print("PART 3: COLLECTING TEAM ROSTERS")
print("="*70 + "\n")

# One flat record per rostered player per year, accumulated across all years.
all_roster_data = []

# One client is shared by every request and released when the block exits,
# instead of building a new, never-closed client on each loop iteration.
with cfbd.ApiClient(configuration) as api_client:
    teams_api = cfbd.TeamsApi(api_client)

    for year in range(2000, 2025):
        print(f"  Fetching roster data for {year}...")

        # Only the API call is guarded, so unrelated errors are not hidden.
        try:
            roster_data = teams_api.get_roster(year=year)
        except ApiException as e:
            print(f"    Error fetching rosters for {year}: {e}")
            continue

        for player in roster_data:
            fname = getattr(player, 'first_name', None)
            lname = getattr(player, 'last_name', None)
            if fname and lname:
                name = f"{fname} {lname}"
            else:
                # Fall back to whichever part exists, or None if neither does
                name = fname or lname or None

            recruit_ids = getattr(player, 'recruit_ids', None)
            all_roster_data.append({
                'player_id': getattr(player, 'id', None),
                'name': name,
                'team': getattr(player, 'team', None),
                'position': getattr(player, 'position', None),
                'year': year,
                # str() each id first: join() raises TypeError on non-string items
                'recruit_ids': ','.join(map(str, recruit_ids)) if recruit_ids else None,
            })

        print(f"    ✓ Collected {len(roster_data)} roster records")
        time.sleep(1)  # Rate limit protection

# Save to CSV
if all_roster_data:
    df_rosters = pd.DataFrame(all_roster_data)
    output_file = PERFORMANCE_DIR / "rosters.csv"
    df_rosters.to_csv(output_file, index=False)
    print(f"\n✓ Saved {len(df_rosters)} roster records to {output_file}")
else:
    print("\n⚠ No roster data collected")

# ============================================
# PART 4: TEAM RATINGS (FPI + SRS)
# ============================================

print("\n" + "="*70)
print("PART 4: COLLECTING TEAM RATINGS")
print("="*70 + "\n")

# One record per (school, year) that has an FPI and/or an SRS rating.
all_ratings_data = []

# One client and one RatingsApi serve both the FPI and the SRS requests;
# the API object no longer lives inside the FPI `try` that SRS also relied on.
with cfbd.ApiClient(configuration) as api_client:
    ratings_api = cfbd.RatingsApi(api_client)

    for year in range(2000, 2025):
        print(f"  Fetching ratings for {year}...")

        # Fetch FPI
        fpi_data = {}
        try:
            fpi_ratings = ratings_api.get_fpi(year=year)

            for rating in fpi_ratings:
                team = getattr(rating, 'team', None)
                fpi_value = getattr(rating, 'fpi', None)
                conference = getattr(rating, 'conference', None)
                if team:
                    fpi_data[team] = {'fpi': fpi_value, 'conference': conference}

            print(f"    ✓ Collected FPI for {len(fpi_data)} teams")
            time.sleep(1)

        except ApiException as e:
            print(f"    Error fetching FPI for {year}: {e}")

        # Fetch SRS (attempted even when the FPI request failed)
        srs_data = {}
        try:
            srs_ratings = ratings_api.get_srs(year=year)

            for rating in srs_ratings:
                team = getattr(rating, 'team', None)
                srs_value = getattr(rating, 'rating', None)
                if team:
                    srs_data[team] = srs_value

            print(f"    ✓ Collected SRS for {len(srs_data)} teams")
            time.sleep(1)

        except ApiException as e:
            print(f"    Error fetching SRS for {year}: {e}")

        # Merge FPI and SRS. The union is sorted so the CSV row order is the
        # same on every run instead of following set iteration order.
        all_teams = sorted(fpi_data.keys() | srs_data.keys())
        for team in all_teams:
            fpi_entry = fpi_data.get(team, {})
            all_ratings_data.append({
                'school': team,
                'year': year,
                'fpi': fpi_entry.get('fpi', None),
                'srs': srs_data.get(team, None),
                'conference': fpi_entry.get('conference', None)
            })

# Save to CSV
if all_ratings_data:
    df_ratings = pd.DataFrame(all_ratings_data)
    output_file = PERFORMANCE_DIR / "team_performance.csv"
    df_ratings.to_csv(output_file, index=False)
    print(f"\n✓ Saved {len(df_ratings)} rating records to {output_file}")
else:
    print("\n⚠ No ratings data collected")

# ============================================
# SUMMARY
# ============================================

# Closing banner
print("\n" + "=" * 70, "DATA COLLECTION COMPLETE!", "=" * 70, sep="\n")

# Where the outputs were written
print(f"\nRecruiting graphs: {RECRUITING_DIR}")
print(f"Performance data: {PERFORMANCE_DIR}")

# CSV files produced by parts 2-4
print("\nFiles created:")
print("  - player_performance.csv", "  - rosters.csv", "  - team_performance.csv", sep="\n")
print("=" * 70)

CFB DATA COLLECTION PIPELINE

PART 2: COLLECTING PLAYER PERFORMANCE DATA

  Fetching performance data for 2013...
    ✓ Collected 2984 player records
  Fetching performance data for 2014...
    ✓ Collected 3052 player records
  Fetching performance data for 2015...
    ✓ Collected 3109 player records
  Fetching performance data for 2016...
    ✓ Collected 3252 player records
  Fetching performance data for 2017...
    ✓ Collected 3229 player records
  Fetching performance data for 2018...
    ✓ Collected 3490 player records
  Fetching performance data for 2019...
    ✓ Collected 3506 player records
  Fetching performance data for 2020...
    ✓ Collected 2381 player records
  Fetching performance data for 2021...
    ✓ Collected 3303 player records
  Fetching performance data for 2022...
    ✓ Collected 4992 player records
  Fetching performance data for 2023...
    ✓ Collected 4585 player records
  Fetching performance data for 2024...
    ✓ Collected 4129 player records

✓ Saved 42012

In [12]:
# Query the account's remaining API quota. The request is made inside the
# `with` block so the client is still open while it is used; previously it
# ran after the context manager had exited.
with cfbd.ApiClient(configuration) as api_client:
    info = cfbd.InfoApi(api_client)
    user_info = info.get_user_info()

# Bare last expression so the notebook displays the result.
user_info

UserInfo(patron_level=0, remaining_calls=770)