In [None]:
import networkx as nx
import os
from pathlib import Path
import pandas as pd
import numpy as np
import math

In [None]:
# setup: read every GraphML file found in the transfer-portal and recruiting
# folders into two dicts keyed by file stem (file name without extension)

xferpath = Path(r"C:\Users\User\Documents\cfb project\data\transferportal\cleaned")
recpath = Path(r"C:\Users\User\Documents\cfb project\data\recruiting\final")
xferfiles, recfiles = {}, {}

# transfer portal graphs
for file_path in xferpath.glob('*.graphml'):
    name = file_path.stem
    xferfiles[name] = G_x = nx.read_graphml(file_path)

# recruiting graphs
for file_path in recpath.glob('*.graphml'):
    name = file_path.stem
    recfiles[name] = G_r = nx.read_graphml(file_path)

print(f"\nSuccessfully loaded {len(xferfiles)} files from xfer and {len(recfiles)} files from rec.")


In [None]:
# net degree (xfer)

import networkx as nx
import pandas as pd
import numpy as np
from pathlib import Path

# Read each '*_cleaned.graphml' transfer-portal graph; the dict key is the
# file stem with the '_cleaned' text removed
xferpath = Path(r"C:\Users\User\Documents\cfb project\data\transferportal\cleaned")
xferfiles = {}

for file_path in xferpath.glob('*_cleaned.graphml'):
    name = file_path.stem.replace('_cleaned', '')
    xferfiles[name] = G = nx.read_graphml(file_path)

print(f"Loaded {len(xferfiles)} transfer portal files\n")

# TODO: calculate net degree for each team in each season (not implemented yet; this cell currently only loads the data)



In [None]:
# npv calculation with uncertainty bounds (xfer)

import networkx as nx
import pandas as pd
import numpy as np
from pathlib import Path

# Transfer-portal graphs for the NPV analysis: one entry per
# '*_cleaned.graphml' file, keyed by the stem with '_cleaned' stripped out
xferpath = Path(r"C:\Users\User\Documents\cfb project\data\transferportal\cleaned")
xferfiles = {}

for file_path in xferpath.glob('*_cleaned.graphml'):
    name = file_path.stem.replace('_cleaned', '')
    xferfiles[name] = G = nx.read_graphml(file_path)

print(f"Loaded {len(xferfiles)} transfer portal files\n")

# ============================================
# ANALYZE MISSING DATA PATTERNS
# ============================================
# For every season graph, count the transfers (edges) whose 'ratings'
# attribute is missing and break those counts down by the edge's 'stars'
# attribute. All five star levels are reported so that the per-level counts
# plus 'Unknown' always add up to the total number of missing ratings.

print("="*70)
print("MISSING DATA ANALYSIS")
print("="*70 + "\n")

STAR_LEVELS = (1, 2, 3, 4, 5)


def _star_level(stars):
    """Return the star level (int in 1-5) held in an edge's 'stars' value.

    Returns None when the value is absent ('' / 'None' / falsy), cannot be
    converted to a number, or falls outside the 1-5 range.
    """
    if not stars or stars in ['', 'None']:
        return None
    try:
        star_val = int(float(stars))
    except (TypeError, ValueError, OverflowError):
        # Non-numeric text, NaN or infinity: no usable star level.
        return None
    return star_val if star_val in STAR_LEVELS else None


missing_analysis = []

for name, G in xferfiles.items():
    # Graph names are expected to end in '_<year>'.
    year = int(name.split('_')[-1])

    total_edges = G.number_of_edges()
    missing_rating = 0
    has_rating = 0
    # Star-presence counters; they are not part of the summary built below.
    missing_stars = 0
    has_stars = 0

    # Star-level breakdown of edges without / with a rating.
    missing_by_star = {level: 0 for level in STAR_LEVELS}
    missing_by_star['unknown'] = 0
    present_by_star = {level: 0 for level in STAR_LEVELS}

    for u, v, data in G.edges(data=True):
        rating = data.get('ratings', '')
        stars = data.get('stars', '')
        star_val = _star_level(stars)

        # Check rating presence
        if not rating or rating in ['', '0.0', 'None']:
            missing_rating += 1
            # A missing rating with no usable star level goes to 'unknown'.
            missing_by_star[star_val if star_val is not None else 'unknown'] += 1
        else:
            has_rating += 1
            # Rated edges without a usable star level are simply not bucketed.
            if star_val is not None:
                present_by_star[star_val] += 1

        # Check stars presence
        if not stars or stars in ['', 'None']:
            missing_stars += 1
        else:
            has_stars += 1

    missing_pct = (missing_rating / total_edges * 100) if total_edges > 0 else 0

    record = {
        'Year': year,
        'Total Edges': total_edges,
        'Missing Ratings': missing_rating,
        'Missing %': missing_pct,
    }
    for level in STAR_LEVELS:
        record[f'Missing {level}-star'] = missing_by_star[level]
    record['Missing Unknown'] = missing_by_star['unknown']
    for level in STAR_LEVELS:
        record[f'Present {level}-star'] = present_by_star[level]
    missing_analysis.append(record)

    print(f"{year}:")
    print(f"  Total transfers: {total_edges}")
    print(f"  Missing ratings: {missing_rating} ({missing_pct:.1f}%)")
    print("  Missing by star level:")
    for level in STAR_LEVELS:
        print(f"    {level}-star: {missing_by_star[level]}")
    print(f"    Unknown: {missing_by_star['unknown']}")
    print()

missing_df = pd.DataFrame(missing_analysis)
print("\nSummary Table:")
print(missing_df.to_string(index=False))

# ============================================
# CALCULATE NPV WITH UNCERTAINTY BOUNDS
# ============================================

# Section banner: a blank line, then the title framed by two 70-character rules
banner_rule = "="*70
print("\n" + banner_rule)
print("NPV CALCULATION WITH CONFIDENCE INTERVALS")
print(banner_rule + "\n")

def _parse_rating(raw):
    """Return an edge's 'ratings' value as a float, or None if it is unusable.

    A rating is unusable (counted as missing) when it cannot be converted to
    a float (e.g. '', 'None', None, free text), when it is NaN or infinite,
    or when it equals zero.
    """
    try:
        value = float(raw)
    except (TypeError, ValueError):
        return None
    if not math.isfinite(value) or value == 0.0:
        return None
    return value


def _split_ratings(edges):
    """Return (usable ratings, number of missing ratings) for (u, v, data) edges."""
    ratings = []
    missing = 0
    for _, _, data in edges:
        rating = _parse_rating(data.get('ratings', ''))
        if rating is None:
            missing += 1
        else:
            ratings.append(rating)
    return ratings, missing


def calculate_npv_with_bounds(G, year, low_rating=0.75, high_rating=0.95):
    """
    Calculate NPV for each school with uncertainty bounds
    based on missing data.

    NPV for a node is the sum of the ratings on its incoming edges minus the
    sum of the ratings on its outgoing edges. Edges without a usable rating
    contribute nothing to NPV itself; they only widen the bounds.

    Parameters
    ----------
    G : directed graph (plain or multi) whose edges may carry a 'ratings'
        attribute. Parallel edges of a multigraph are each counted.
    year : not used by the calculation; accepted so existing calls keep working.
    low_rating, high_rating : ratings substituted for every missing value
        when computing the pessimistic / optimistic bounds.

    Returns
    -------
    dict mapping each node to a dict with keys:
        'npv'           rated incoming sum minus rated outgoing sum
        'net_degree'    rated incoming count minus rated outgoing count
        'in_count'      number of incoming edges with a usable rating
        'out_count'     number of outgoing edges with a usable rating
        'in_missing'    number of incoming edges without a usable rating
        'out_missing'   number of outgoing edges without a usable rating
        'total_missing' in_missing + out_missing
        'best_case'     NPV if missing incoming = high_rating and
                        missing outgoing = low_rating
        'worst_case'    NPV if missing incoming = low_rating and
                        missing outgoing = high_rating
        'uncertainty'   half the width of the [worst_case, best_case] range
    """
    school_stats = {}

    for node in G.nodes():
        # in_edges / out_edges yield one (u, v, data) tuple per edge for both
        # plain and multi digraphs, so no graph-type special case is needed.
        in_transfers, in_missing = _split_ratings(G.in_edges(node, data=True))
        out_transfers, out_missing = _split_ratings(G.out_edges(node, data=True))

        # Calculate NPV components
        in_sum = sum(in_transfers)
        out_sum = sum(out_transfers)
        npv = in_sum - out_sum

        # Uncertainty bounds: every missing rating is assumed to lie between
        # low_rating and high_rating.
        # Best case: missing incoming are high, missing outgoing are low
        # Worst case: missing incoming are low, missing outgoing are high
        best_case_npv = (in_sum + in_missing * high_rating) - (out_sum + out_missing * low_rating)
        worst_case_npv = (in_sum + in_missing * low_rating) - (out_sum + out_missing * high_rating)

        uncertainty = (best_case_npv - worst_case_npv) / 2

        school_stats[node] = {
            'npv': npv,
            # Counts below cover rated transfers only; unrated ones are
            # reported separately through the *_missing fields.
            'net_degree': len(in_transfers) - len(out_transfers),
            'in_count': len(in_transfers),
            'out_count': len(out_transfers),
            'in_missing': in_missing,
            'out_missing': out_missing,
            'total_missing': in_missing + out_missing,
            'uncertainty': uncertainty,
            'best_case': best_case_npv,
            'worst_case': worst_case_npv
        }

    return school_stats

# Calculate for most recent year
# Graph names are expected to end in '_<year>'; the largest year wins.
if not xferfiles:
    raise ValueError("No transfer portal graphs are loaded; check xferpath and the glob pattern")

most_recent_year = max(int(graph_name.split('_')[-1]) for graph_name in xferfiles)
most_recent_graph_name = f"transfer_portal_{most_recent_year}"
if most_recent_graph_name not in xferfiles:
    # Files that do not follow the 'transfer_portal_<year>' naming: fall back
    # to whichever loaded graph carries the most recent year suffix.
    most_recent_graph_name = next(
        graph_name for graph_name in xferfiles
        if int(graph_name.split('_')[-1]) == most_recent_year
    )
G_recent = xferfiles[most_recent_graph_name]

stats = calculate_npv_with_bounds(G_recent, most_recent_year)

# Convert to DataFrame: one row per school, ordered from highest to lowest NPV
results_df = pd.DataFrame.from_dict(stats, orient='index')
results_df['school'] = results_df.index
results_df = results_df.sort_values('npv', ascending=False)

print(f"Top 20 Schools by NPV ({most_recent_year}) with Uncertainty:")
print("="*70)
top20 = results_df.head(20)[['school', 'npv', 'net_degree', 'in_count', 'out_count', 
                               'total_missing', 'uncertainty', 'best_case', 'worst_case']]
print(top20.to_string(index=False))

print("\n" + "="*70)
print("KEY INSIGHTS:")
print("="*70)
print(f"\n1. Average uncertainty across all schools: ±{results_df['uncertainty'].mean():.2f}")
print(f"2. Max uncertainty: ±{results_df['uncertainty'].max():.2f} ({results_df.loc[results_df['uncertainty'].idxmax(), 'school']})")
print(f"3. Schools with >5 missing transfers: {len(results_df[results_df['total_missing'] > 5])}")

# Check ranking stability: compare each school's rank by NPV with its rank
# when schools are ordered by best-case NPV instead. Only the best-case
# ordering is examined here; the worst-case ordering is not.
print("\n4. Ranking Stability Analysis:")
results_df['rank'] = range(1, len(results_df) + 1)  # results_df is already sorted by npv
results_df_best = results_df.sort_values('best_case', ascending=False)
results_df_best['rank_best'] = range(1, len(results_df_best) + 1)

# The merge keeps the NPV ordering of results_df and replaces its index with
# a fresh 0..n-1 range; 'school' remains available as a column.
results_df = results_df.merge(results_df_best[['school', 'rank_best']], on='school')
results_df['rank_change'] = (results_df['rank'] - results_df['rank_best']).abs()

print(f"   Average rank change in best-case scenario: {results_df['rank_change'].mean():.1f} positions")
print(f"   Max rank change: {results_df['rank_change'].max():.0f} positions")
print(f"   Schools with rank change >5: {len(results_df[results_df['rank_change'] > 5])}")

print("\n5. Most Affected Schools (highest uncertainty):")
most_affected = results_df.nlargest(5, 'uncertainty')[['school', 'npv', 'total_missing', 'uncertainty', 'rank_change']]
print(most_affected.to_string(index=False))

Loaded 5 transfer portal files

MISSING DATA ANALYSIS

2021:
  Total transfers: 1051
  Missing ratings: 105 (10.0%)
  Missing by star level:
    2-star: 0
    3-star: 0
    4-star: 0
    5-star: 0
    Unknown: 104

2022:
  Total transfers: 1364
  Missing ratings: 123 (9.0%)
  Missing by star level:
    2-star: 0
    3-star: 0
    4-star: 0
    5-star: 0
    Unknown: 123

2023:
  Total transfers: 1603
  Missing ratings: 39 (2.4%)
  Missing by star level:
    2-star: 0
    3-star: 0
    4-star: 0
    5-star: 0
    Unknown: 39

2024:
  Total transfers: 2646
  Missing ratings: 213 (8.0%)
  Missing by star level:
    2-star: 0
    3-star: 0
    4-star: 0
    5-star: 0
    Unknown: 213

2025:
  Total transfers: 3751
  Missing ratings: 480 (12.8%)
  Missing by star level:
    2-star: 0
    3-star: 0
    4-star: 0
    5-star: 0
    Unknown: 480


Summary Table:
 Year  Total Edges  Missing Ratings  Missing %  Missing 2-star  Missing 3-star  Missing 4-star  Missing 5-star  Missing Unknown  Prese