In [6]:
import networkx as nx
import os
from pathlib import Path
import pandas as pd
import numpy as np
import math
from collections import defaultdict

In [7]:
# Source directories: one GraphML file per season in each.
# NOTE(review): these are absolute, machine-specific paths — consider deriving them
# from a single configurable data directory so the notebook runs elsewhere.
xferpath = Path(r"C:\Users\User\Documents\cfb project\data\transferportal\raw")
recpath = Path(r"C:\Users\User\Documents\cfb project\data\recruiting\original")


def _load_graphml_dir(directory):
    """Read every ``*.graphml`` file in ``directory`` into a dict keyed by file stem.

    Files are visited in sorted order: ``Path.glob`` yields entries in whatever
    order the filesystem returns them, and every later cell iterates these dicts,
    so sorting keeps processing order and printed output reproducible.
    """
    return {
        file_path.stem: nx.read_graphml(file_path)
        for file_path in sorted(directory.glob('*.graphml'))
    }


xferfiles = _load_graphml_dir(xferpath)
recfiles = _load_graphml_dir(recpath)

print(f"\nSuccessfully loaded {len(xferfiles)} files from xfer and {len(recfiles)} files from rec.")



Successfully loaded 5 files from xfer and 27 files from rec.


In [3]:
# net degree calculation

# Per-season results, shaped as { year: { school: net_degree } }
all_nd = {}

for name, G_x in xferfiles.items():

    # File stems look like "transfer_portal_2024"; the trailing token is the season.
    try:
        year = int(name.split('_')[-1])
    except ValueError:
        # Only a non-numeric suffix can fail here, so catch just that instead of
        # swallowing every exception. Fall back to the raw stem as the key.
        # NOTE(review): a mix of str and int keys would presumably break the
        # sort_index() call below — confirm all stems end in a year.
        year = name

    print(f"--- Processing {year} ---")

    # Net degree = in-degree minus out-degree. An isolated node gives 0 - 0 = 0,
    # so no special case is needed.
    # NOTE(review): degrees count edges. If a raw edge bundles several players
    # (pipe-separated 'players' attribute), this measures school pairs rather
    # than players — confirm that is the intended metric.
    yearly_nd = {
        node: G_x.in_degree(node) - G_x.out_degree(node)
        for node in G_x.nodes()
    }

    all_nd[year] = yearly_nd

# Rows are seasons, columns are schools; a school absent from a season is NaN.
netdeg_df = pd.DataFrame.from_dict(all_nd, orient='index').sort_index()

print("\n--- Sample of Net Degrees (DataFrame) ---")
# Last 5 rows = most recent seasons
print(netdeg_df.tail())

--- Processing 2021 ---
--- Processing 2022 ---
--- Processing 2023 ---
--- Processing 2024 ---
--- Processing 2025 ---

--- Sample of Net Degrees (DataFrame) ---
      Missouri  UTSA  Arkansas State  Southern Miss  Miami  Wyoming  \
2021       -10     6              11              1     -1        0   
2022         1     4              -1              0     -5       -8   
2023        -5     4               5              5     -3       -3   
2024         2     5               5              7     -6       -3   
2025         1     2               6             -2      3        1   

      Eastern Michigan  Vanderbilt  Tulsa  Missouri Western State  ...  \
2021                -1          -6      2                    -1.0  ...   
2022                 1         -10      4                    -1.0  ...   
2023                 2          -6      0                     NaN  ...   
2024                 3          -1     -2                     1.0  ...   
2025                -4           2      

In [None]:
# Ten schools with the highest net degree in the 2024 season
season_2024 = netdeg_df.loc[2024]
ranked_2024 = season_2024.sort_values(ascending=False)
print(ranked_2024.head(10))

James Madison        16.0
New Mexico           15.0
Florida A&M          14.0
Memphis              12.0
Charlotte            11.0
McNeese              11.0
Massachusetts        10.0
Stephen F. Austin     9.0
Murray State          9.0
Incarnate Word        9.0
Name: 2024, dtype: float64


In [8]:
# split multi-player edges and impute missing ratings

# --------------------------------------------
# Step 1 - report how many players each raw graph bundles per edge
# --------------------------------------------

print("="*60)
print("ANALYZING CURRENT DATA STATE")
print("="*60 + "\n")

for name, graph in xferfiles.items():
    # One entry per edge that names at least one player: the number of
    # pipe-separated names found in its 'players' attribute.
    roster_sizes = []
    for _, _, edge_attrs in graph.edges(data=True):
        roster = edge_attrs.get('players', '')
        if not roster:
            continue
        roster_sizes.append(len(roster.split('|')))

    total_edges = len(graph.edges())
    total_players = sum(roster_sizes)
    multi_player_edges = sum(size > 1 for size in roster_sizes)

    print(f"{name}:")
    print(f"  Edges: {total_edges}")
    print(f"  Players: {total_players}")
    print(f"  Multi-player edges: {multi_player_edges}")
    print()

# --------------------------------------------
# Step 2 - split edges so each one carries a single player
# --------------------------------------------

print("="*60)
print("SPLITTING MULTI-PLAYER EDGES")
print("="*60 + "\n")

def are_same_person(name1, name2):
    """
    Decide whether two player names should be treated as one person.

    The comparison is case-insensitive and looks only at the last name, after
    dropping trailing generational suffixes (Jr, Sr, II, III, IV, V). First
    names are not compared, so two different players who share a surname also
    compare equal. Returns False when either name is empty or None.
    """
    generational_suffixes = {'jr', 'jr.', 'sr', 'sr.', 'ii', 'iii', 'iv', 'v'}

    def surname_of(raw_name):
        tokens = raw_name.strip().lower().split()
        # Peel suffixes off the end (handles stacked ones like "Name Jr. III"),
        # but always leave at least one token behind.
        while len(tokens) > 1 and tokens[-1] in generational_suffixes:
            del tokens[-1]
        return tokens[-1] if tokens else ""

    if name1 and name2:
        return surname_of(name1) == surname_of(name2)
    return False

# Edge attributes that hold one pipe-separated value per player, in player order.
PER_PLAYER_ATTRS = ('positions', 'dates', 'ratings', 'stars', 'eligibility')


def _split_pipe_field(raw, expected_len):
    """Split a pipe-separated attribute and right-pad it with '' to ``expected_len`` items.

    A blank or missing attribute yields ``expected_len`` empty strings. A field
    with more items than ``expected_len`` is returned unshortened.
    """
    values = [part.strip() for part in raw.split('|')] if raw else []
    values += [''] * (expected_len - len(values))
    return values


def _first_present(records, attr):
    """Return the first ``attr`` value in ``records`` that is neither blank nor the string 'None'."""
    for record in records:
        value = record[attr]
        if value and value != 'None':
            return value
    return ''


cleaned_graphs = {}

for name, graph in xferfiles.items():
    print(f"Processing {name}...")

    # A MultiDiGraph is required here: several players can move between the same
    # pair of schools, and a plain DiGraph keeps only ONE edge per (source, target)
    # — each further add_edge would overwrite the previous player's data.
    G_new = nx.MultiDiGraph()

    # Copy node attributes
    for node, attrs in graph.nodes(data=True):
        G_new.add_node(node, **attrs)

    edges_before = len(graph.edges())
    edges_after = 0
    players_processed = 0
    duplicates_merged = 0

    for source, target, data in graph.edges(data=True):
        players_str = data.get('players', '')
        if not players_str:
            # Edges without any player name are dropped from the cleaned graph.
            continue

        players = [p.strip() for p in players_str.split('|')]
        columns = {
            attr: _split_pipe_field(data.get(attr, ''), len(players))
            for attr in PER_PLAYER_ATTRS
        }

        # Group the edge's entries by identity: the first spelling seen becomes
        # the key, and later entries that are_same_person() matches are merged in.
        player_groups = {}
        for i, player in enumerate(players):
            players_processed += 1
            record = {attr: columns[attr][i] for attr in PER_PLAYER_ATTRS}

            existing = next(
                (known for known in player_groups if are_same_person(player, known)),
                None,
            )
            if existing is None:
                player_groups.setdefault(player, []).append(record)
            else:
                player_groups[existing].append(record)
                duplicates_merged += 1

        # One edge per unique player; merged duplicates contribute their first
        # non-blank value for each attribute.
        for player_name, records in player_groups.items():
            final_data = {'players': player_name}
            final_data.update(
                (attr, _first_present(records, attr)) for attr in PER_PLAYER_ATTRS
            )
            final_data['weight'] = 1

            G_new.add_edge(source, target, **final_data)
            edges_after += 1

    # Guard against silently losing edges again (e.g. if the graph type is changed back).
    assert G_new.number_of_edges() == edges_after, (
        f"{name}: graph holds {G_new.number_of_edges()} edges but {edges_after} were added"
    )

    print(f"  Edges: {edges_before} → {edges_after} (+{edges_after - edges_before})")
    print(f"  Players processed: {players_processed}")
    print(f"  Duplicates merged: {duplicates_merged}")
    print()

    cleaned_graphs[name] = G_new

# ============================================
# STEP 3: MIN-MAX NORMALIZE RATINGS
# ============================================

print("="*60)
print("NORMALIZING RATINGS")
print("="*60 + "\n")


def _parse_rating(raw):
    """Return ``raw`` as a finite float, or None if it is blank, 'None', unparsable or non-finite.

    Non-finite values are rejected because strings such as 'nan' or 'inf' parse
    successfully with float() and would otherwise corrupt the global min/max.
    """
    if not raw or raw == 'None':
        return None
    try:
        value = float(raw)
    except (TypeError, ValueError):
        return None
    return value if math.isfinite(value) else None


# First pass: gather every usable rating across all seasons so that one global
# scale is applied to every graph.
all_ratings = [
    rating
    for graph in cleaned_graphs.values()
    for _, _, data in graph.edges(data=True)
    for rating in [_parse_rating(data.get('ratings', ''))]
    if rating is not None
]

if not all_ratings:
    print("Warning: No valid ratings found to normalize!")
else:
    global_min = float(np.min(all_ratings))
    global_max = float(np.max(all_ratings))

    print(f"Original rating range: {global_min:.4f} to {global_max:.4f}")

    if global_max > global_min:
        rating_span = global_max - global_min

        # Second pass: rewrite each usable rating in place as a string in [0, 1].
        for name, graph in cleaned_graphs.items():
            normalized_count = 0
            for s, t, data in graph.edges(data=True):
                rating = _parse_rating(data.get('ratings', ''))
                if rating is None:
                    continue
                data['ratings'] = str((rating - global_min) / rating_span)
                normalized_count += 1
            print(f"{name}: Normalized {normalized_count} ratings")

        print(f"\nNew rating range: 0.0000 to 1.0000")
    else:
        # Every rating is identical, so min-max scaling is undefined. Leave the
        # values untouched rather than claiming a 0-1 range.
        print("Warning: All ratings are identical; ratings left unchanged.")

# --------------------------------------------
# Step 4 - write each cleaned graph to disk as GraphML
# --------------------------------------------

print("\n" + "="*60)
print("SAVING CLEANED GRAPHS")
print("="*60 + "\n")

# Destination folder; created together with any missing parents.
output_path = Path(r"C:\Users\User\Documents\cfb project\data\transferportal\cleaned")
output_path.mkdir(parents=True, exist_ok=True)

for name, graph in cleaned_graphs.items():
    # The file name mirrors the source stem with a "_cleaned" suffix.
    output_file = output_path / f"{name}_cleaned.graphml"
    nx.write_graphml(graph, output_file)

    node_total = graph.number_of_nodes()
    edge_total = graph.number_of_edges()
    print(f"Saved: {output_file}")
    print(f"  Nodes: {node_total}")
    print(f"  Edges: {edge_total}")
    print()

print("✓ All graphs cleaned and saved!")

# --------------------------------------------
# Step 5 - compare edge counts before and after cleaning
# --------------------------------------------

print("="*60)
print("FINAL SUMMARY")
print("="*60 + "\n")

summary_data = []
for name, original in xferfiles.items():
    cleaned = cleaned_graphs[name]

    orig_edges = original.number_of_edges()
    clean_edges = cleaned.number_of_edges()
    edge_change = clean_edges - orig_edges

    # Avoid dividing by zero for a graph that had no edges to begin with.
    if orig_edges > 0:
        percent_increase = edge_change / orig_edges * 100
    else:
        percent_increase = 0

    summary_data.append({
        'Graph': name,
        'Original Edges': orig_edges,
        'Cleaned Edges': clean_edges,
        'Change': edge_change,
        'Percent Increase': percent_increase
    })

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

ANALYZING CURRENT DATA STATE

transfer_portal_2021:
  Edges: 958
  Players: 1053
  Multi-player edges: 77

transfer_portal_2022:
  Edges: 1225
  Players: 1367
  Multi-player edges: 113

transfer_portal_2023:
  Edges: 1413
  Players: 1607
  Multi-player edges: 140

transfer_portal_2024:
  Edges: 2340
  Players: 2654
  Multi-player edges: 224

transfer_portal_2025:
  Edges: 3296
  Players: 3765
  Multi-player edges: 337

SPLITTING MULTI-PLAYER EDGES

Processing transfer_portal_2021...
  Edges: 958 → 1051 (+93)
  Players processed: 1053
  Duplicates merged: 2

Processing transfer_portal_2022...
  Edges: 1225 → 1364 (+139)
  Players processed: 1367
  Duplicates merged: 3

Processing transfer_portal_2023...
  Edges: 1413 → 1603 (+190)
  Players processed: 1607
  Duplicates merged: 4

Processing transfer_portal_2024...
  Edges: 2340 → 2646 (+306)
  Players processed: 2654
  Duplicates merged: 8

Processing transfer_portal_2025...
  Edges: 3296 → 3751 (+455)
  Players processed: 3765
  Duplic

In [6]:
# checking missing attributes

# ============================================
# FUNCTION TO ANALYZE MISSING ATTRIBUTES
# ============================================

def analyze_missing_attributes(graph, graph_name):
    """
    Print and return a report of absent or blank node and edge attributes.

    A value counts as missing when its key is absent, or when the value is
    None, '', 0.0 or 'Unknown'.

    Returns a dict with:
      'node_attrs' / 'edge_attrs': set of every attribute name seen.
      'node_missing' / 'edge_missing': for each attribute missing at least once,
          a dict holding 'count', 'percentage' and up to five examples under
          'sample_nodes' / 'sample_edges'.
    """
    def is_blank(attrs, key):
        if key not in attrs:
            return True
        value = attrs[key]
        return value is None or value == '' or value == 0.0 or value == 'Unknown'

    def find_gaps(labelled_attrs, attr_names, population, sample_key):
        # labelled_attrs: list of (label, attribute-dict) pairs to scan.
        gaps = {}
        for attr in attr_names:
            absent = [label for label, attrs in labelled_attrs if is_blank(attrs, attr)]
            if absent:
                gaps[attr] = {
                    'count': len(absent),
                    'percentage': (len(absent) / population) * 100,
                    sample_key: absent[:5]  # first five examples only
                }
        return gaps

    print(f"{'='*60}")
    print(f"Analyzing: {graph_name}")
    print(f"{'='*60}\n")

    # ---- nodes ----
    print("NODE ATTRIBUTES:")
    print("-" * 40)

    node_records = list(graph.nodes(data=True))
    all_node_attrs = set().union(*(attrs.keys() for _, attrs in node_records))

    print(f"Total unique node attributes found: {len(all_node_attrs)}")
    print(f"Attributes: {sorted(all_node_attrs)}\n")

    node_missing_summary = find_gaps(
        node_records, all_node_attrs, len(graph.nodes()), 'sample_nodes'
    )

    if not node_missing_summary:
        print("✓ All nodes have all attributes!")
    else:
        print("Missing Node Attributes Summary:")
        for attr, info in sorted(node_missing_summary.items()):
            print(f"  {attr}:")
            print(f"    Missing in {info['count']}/{len(graph.nodes())} nodes ({info['percentage']:.1f}%)")
            print(f"    Sample nodes: {info['sample_nodes']}")

    print()

    # ---- edges ----
    print("EDGE ATTRIBUTES:")
    print("-" * 40)

    # Edges are labelled by their (source, target) pair.
    edge_records = [((s, t), attrs) for s, t, attrs in graph.edges(data=True)]
    all_edge_attrs = set().union(*(attrs.keys() for _, attrs in edge_records))

    print(f"Total unique edge attributes found: {len(all_edge_attrs)}")
    print(f"Attributes: {sorted(all_edge_attrs)}\n")

    edge_missing_summary = find_gaps(
        edge_records, all_edge_attrs, len(graph.edges()), 'sample_edges'
    )

    if not edge_missing_summary:
        print("✓ All edges have all attributes!")
    else:
        print("Missing Edge Attributes Summary:")
        for attr, info in sorted(edge_missing_summary.items()):
            print(f"  {attr}:")
            print(f"    Missing in {info['count']}/{len(graph.edges())} edges ({info['percentage']:.1f}%)")
            print(f"    Sample edges: {info['sample_edges']}")

    print("\n")

    return {
        'node_attrs': all_node_attrs,
        'edge_attrs': all_edge_attrs,
        'node_missing': node_missing_summary,
        'edge_missing': edge_missing_summary
    }


# --------------------------------------------
# Run the missing-attribute report over every loaded graph
# --------------------------------------------

all_results = {}

# Transfer-portal graphs are reported first, then the recruiting graphs.
graph_sections = (
    ("TRANSFER PORTAL GRAPHS", xferfiles),
    ("RECRUITING GRAPHS", recfiles),
)

for section_title, graph_collection in graph_sections:
    print("\n" + "="*60)
    print(section_title)
    print("="*60 + "\n")

    for name, graph in graph_collection.items():
        all_results[name] = analyze_missing_attributes(graph, name)


# --------------------------------------------
# One summary row per graph: attribute counts and how many have gaps
# --------------------------------------------

print("\n" + "="*60)
print("OVERALL SUMMARY")
print("="*60 + "\n")

summary_data = [
    {
        'Graph': graph_name,
        'Node Attrs': len(report['node_attrs']),
        'Edge Attrs': len(report['edge_attrs']),
        'Missing Node Attrs': len(report['node_missing']),
        'Missing Edge Attrs': len(report['edge_missing'])
    }
    for graph_name, report in all_results.items()
]

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))


TRANSFER PORTAL GRAPHS

Analyzing: transfer_portal_2021

NODE ATTRIBUTES:
----------------------------------------
Total unique node attributes found: 4
Attributes: ['classification', 'conference', 'latitude', 'longitude']

Missing Node Attributes Summary:
  classification:
    Missing in 19/264 nodes (7.2%)
    Sample nodes: ['Missouri Western State', 'Winston-Salem State', 'City College of San Francisco', 'Mississippi Gulf Coast C.C.', 'Texas-Permian Basin']
  conference:
    Missing in 19/264 nodes (7.2%)
    Sample nodes: ['Missouri Western State', 'Winston-Salem State', 'City College of San Francisco', 'Mississippi Gulf Coast C.C.', 'Texas-Permian Basin']
  latitude:
    Missing in 23/264 nodes (8.7%)
    Sample nodes: ['Missouri Western State', 'Winston-Salem State', 'Northwestern', 'City College of San Francisco', 'Mississippi Gulf Coast C.C.']
  longitude:
    Missing in 23/264 nodes (8.7%)
    Sample nodes: ['Missouri Western State', 'Winston-Salem State', 'Northwestern', 'Ci