# NBA Network-Based Team Strength Analysis

## Objective
Implement network-based strength propagation for team tier classification:
- **Iterative Strength Ratings**: Iterative, Elo-like update (surprise-weighted wins, re-normalized each pass) in the spirit of Massey/PageRank ratings
- **Network-Weighted SOS**: Strength of schedule using network ratings
- **Multi-Hop Common Opponents**: 3-level transitive relationships

## NBA-Specific Characteristics
- 30 teams, 82-game regular season
- Very dense graph (multiple matchups per pair)
- High connectivity allows deeper path exploration
- Parameters: `max_hops=3`, `recency_decay=0.95` (applied per week), `margin_cap=15` (points)

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats
from datetime import datetime, timedelta

# Show every column when displaying DataFrames, and use the seaborn whitegrid plot style
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8-whitegrid')

# NBA-specific config, read by the analysis cells below
CONFIG = {
    'sport': 'NBA',
    'max_hops': 3,          # Passed to the multi-hop common-opponent search
    'recency_decay': 0.95,  # Weekly decay factor for recency weighting of games
    'margin_cap': 15,       # Victory margins above this many points are clipped in the rating update
    'iterations': 100,      # Maximum passes of the iterative rating algorithm
    'tolerance': 0.001      # Stop iterating once the largest rating change falls below this
}

print(f"Config: {CONFIG}")

## Phase 1: Data Loading & Preparation

In [None]:
# Load NBA game results; the path is resolved relative to the parent of the
# current working directory.
data_file = Path().resolve().parent / 'data' / 'results' / 'nba_season_results.xlsx'
df = pd.read_excel(data_file)

# The code below calls .date() on game_date and subtracts dates to get a day
# count, so make sure the column is a real datetime even if the sheet stored
# the dates as text.
df['game_date'] = pd.to_datetime(df['game_date'])

print(f"Loaded {len(df)} NBA games")
print(f"Date range: {df['game_date'].min().date()} to {df['game_date'].max().date()}")
print(f"Columns: {list(df.columns)}")
df.head()

In [None]:
# Derive per-game outcome columns: signed home margin, winner, loser and
# absolute margin. Games where neither side is ahead get winner/loser = None.
df['home_margin'] = df['home_score'] - df['away_score']
home_won = df['home_margin'] > 0
away_won = df['home_margin'] < 0
df['winner'] = np.where(home_won, df['home_team'],
                        np.where(away_won, df['away_team'], None))
df['loser'] = np.where(home_won, df['away_team'],
                       np.where(away_won, df['home_team'], None))
df['margin'] = df['home_margin'].abs()

# Keep only games that produced a winner
games_with_result = df[df['winner'].notna()].copy()
n_dropped = len(df) - len(games_with_result)
print(f"Games with decisive result: {len(games_with_result)} (dropped {n_dropped} ties)")

# Every team that appears as either host or visitor
all_teams = set(df['home_team'].unique()).union(df['away_team'].unique())
print(f"Total teams: {len(all_teams)}")

# Each game involves two teams, hence the factor of two
games_per_team = (2 * len(df)) / len(all_teams)
print(f"Avg games per team: {games_per_team:.0f}")

## Phase 2: Build Network Graph

In [None]:
def build_team_network(games_df, recency_decay=0.95, teams=None):
    """
    Build a weighted directed graph (winner -> loser) from game results.

    Repeated results between the same winner and loser are aggregated into a
    single edge.

    Args:
        games_df: DataFrame with 'winner', 'loser', 'margin' and 'game_date'
            columns, one row per decided game.
        recency_decay: Per-week decay factor; a game played ``d`` days before
            the most recent game in ``games_df`` gets weight
            ``recency_decay ** (d / 7)``.
        teams: Optional iterable of team names to add as nodes. When omitted,
            the nodes are derived from the 'home_team', 'away_team', 'winner'
            and 'loser' columns that are present in ``games_df``, so the
            function no longer depends on a module-level ``all_teams``.

    Returns:
        nx.DiGraph whose edges carry:
            games: number of times the winner beat the loser
            avg_margin: plain mean margin over those games
            weighted_margin: recency-weighted mean margin
    """
    G = nx.DiGraph()

    if teams is None:
        teams = set()
        for column in ('home_team', 'away_team', 'winner', 'loser'):
            if column in games_df.columns:
                teams.update(games_df[column].dropna().unique())
    # Sorted insertion keeps node order (and seeded layouts) reproducible
    # instead of depending on set iteration order.
    G.add_nodes_from(sorted(teams, key=str))

    # Chronological order keeps edge insertion order the same as before
    games_sorted = games_df.sort_values('game_date')
    max_date = games_sorted['game_date'].max()

    # (winner, loser) -> running totals across all of their games
    edge_data = {}
    rows = zip(games_sorted['winner'], games_sorted['loser'],
               games_sorted['margin'], games_sorted['game_date'])
    for winner, loser, margin, game_date in rows:
        # Exponential decay measured in weeks before the latest game
        days_ago = (max_date - game_date).days
        recency_weight = recency_decay ** (days_ago / 7)

        totals = edge_data.setdefault((winner, loser), {
            'games': 0,
            'total_margin': 0,
            'weighted_margin': 0,
            'total_weight': 0,
        })
        totals['games'] += 1
        totals['total_margin'] += margin
        totals['weighted_margin'] += margin * recency_weight
        totals['total_weight'] += recency_weight

    for (winner, loser), totals in edge_data.items():
        avg_margin = totals['total_margin'] / totals['games']
        # With recency_decay == 0 every older game has weight 0; fall back to
        # the plain average rather than dividing by zero.
        if totals['total_weight'] > 0:
            weighted_avg = totals['weighted_margin'] / totals['total_weight']
        else:
            weighted_avg = avg_margin

        G.add_edge(winner, loser,
                   games=totals['games'],
                   avg_margin=avg_margin,
                   weighted_margin=weighted_avg)

    return G

# Construct the season graph using the configured decay factor
G = build_team_network(games_with_result, CONFIG['recency_decay'])

n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
total_out_degree = sum(degree for _, degree in G.out_degree())

print("Network Summary:")
print(f"  Nodes (teams): {n_nodes}")
print(f"  Edges (winner->loser pairs): {n_edges}")
print(f"  Avg out-degree: {total_out_degree / n_nodes:.1f}")
print(f"  Density: {nx.density(G):.3f}")

# Edges where the same winner beat the same loser more than once
multi_games = [
    (winner, loser, attrs['games'])
    for winner, loser, attrs in G.edges(data=True)
    if attrs['games'] > 1
]
print(f"\nMulti-game edges (same winner/loser): {len(multi_games)}")
if multi_games:
    first_winner, first_loser, first_count = multi_games[0]
    print(f"  Example: {first_winner} beat {first_loser} {first_count} times")

In [None]:
# Visualize the network
fig, ax = plt.subplots(figsize=(14, 14))

# Win percentage for coloring. Each edge aggregates several games, so the
# degrees are weighted by the 'games' attribute: a plain degree would count
# distinct opponents beaten / lost to, not games won / lost.
win_counts = dict(G.out_degree(weight='games'))
loss_counts = dict(G.in_degree(weight='games'))

win_pct = {}
for team in G.nodes():
    wins = win_counts.get(team, 0)
    played = wins + loss_counts.get(team, 0)
    # Teams without any decided game get the neutral mid-scale color
    win_pct[team] = wins / played if played else 0.5

# Node colors based on win%
node_colors = [win_pct.get(node, 0.5) for node in G.nodes()]

# Layout (seeded so repeated runs over the same graph look alike)
pos = nx.spring_layout(G, k=2, iterations=50, seed=42)

# Draw
nodes = nx.draw_networkx_nodes(G, pos, node_color=node_colors, cmap=plt.cm.RdYlGn,
                               node_size=600, alpha=0.8, vmin=0.2, vmax=0.8, ax=ax)
nx.draw_networkx_labels(G, pos, font_size=7, ax=ax)
nx.draw_networkx_edges(G, pos, alpha=0.1, arrows=True,
                       edge_color='gray', arrowsize=8, ax=ax)

ax.set_title(f'NBA Team Network (Edge: Winner -> Loser)\nNode color: Win %', fontsize=14)
fig.colorbar(nodes, ax=ax, label='Win %')
ax.axis('off')
plt.tight_layout()
plt.show()

## Phase 3: Iterative Strength Rating (Massey-Style)

In [None]:
def compute_iterative_strength(games_df, max_iterations=100, tolerance=0.001, margin_cap=15):
    """
    Compute iteratively propagated team strength ratings.

    Every team starts at 0.5. On each pass, every decided game moves rating
    from the loser to the winner in proportion to how unexpected the result
    was (given the ratings at the start of the pass) and to the capped margin
    of victory. After each pass the ratings are min-max rescaled to [0, 1].

    Args:
        games_df: DataFrame with 'home_team', 'away_team', 'winner', 'loser'
            and 'margin' columns. Rows without a winner are ignored.
        max_iterations: Maximum number of passes over the games.
        tolerance: Stop once the largest per-team rating change in a pass is
            below this value.
        margin_cap: Margins are clipped to this value before scaling; must be
            positive.

    Returns:
        (ratings, history): ``ratings`` maps team -> rating after the last
        completed pass; ``history`` lists the maximum rating change of each
        pass. Returns ``({}, [])`` when ``games_df`` contains no teams.

    Raises:
        ValueError: If ``margin_cap`` is not positive.
    """
    if margin_cap <= 0:
        raise ValueError("margin_cap must be positive")

    teams = set(games_df['home_team']) | set(games_df['away_team'])
    if not teams:
        return {}, []
    ratings = {team: 0.5 for team in teams}

    # The games never change between passes, so extract
    # (winner, loser, capped margin / cap) once instead of re-running
    # DataFrame.iterrows() on every iteration.
    decided = games_df[games_df['winner'].notna()]
    outcomes = [
        (winner, loser, min(margin, margin_cap) / margin_cap)
        for winner, loser, margin in zip(decided['winner'], decided['loser'], decided['margin'])
    ]

    # Smaller learning rate for NBA (more games)
    learning_rate = 0.05

    history = []

    for iteration in range(max_iterations):
        new_ratings = ratings.copy()

        for winner, loser, scaled_margin in outcomes:
            winner_rating = ratings[winner]
            loser_rating = ratings[loser]

            # Expected share of the winner; 0.5 when both ratings are zero
            total = winner_rating + loser_rating
            expected = winner_rating / total if total > 0 else 0.5

            surprise = 1 - expected
            adjustment = surprise * scaled_margin * learning_rate

            new_ratings[winner] += adjustment
            new_ratings[loser] -= adjustment

        # Normalize to [0, 1]
        min_r = min(new_ratings.values())
        max_r = max(new_ratings.values())
        if max_r > min_r:
            new_ratings = {t: (r - min_r) / (max_r - min_r) for t, r in new_ratings.items()}

        max_change = max(abs(new_ratings[t] - ratings[t]) for t in teams)
        history.append(max_change)

        # Adopt the new ratings before testing for convergence so that the
        # converged pass is the one returned, not the pass before it.
        ratings = new_ratings

        if max_change < tolerance:
            print(f"Converged at iteration {iteration + 1}")
            break

    return ratings, history

# Run the iterative rating pass over the full game table; rows without a
# winner are filtered out inside the function.
network_ratings, convergence_history = compute_iterative_strength(
    df,
    max_iterations=CONFIG['iterations'],
    tolerance=CONFIG['tolerance'],
    margin_cap=CONFIG['margin_cap'],
)

# One row per team, strongest first
ratings_df = pd.DataFrame(
    list(network_ratings.items()),
    columns=['team', 'network_rating'],
).sort_values('network_rating', ascending=False)

print("\nTop 10 Teams by Network Rating:")
print(ratings_df.head(10).to_string(index=False))

In [None]:
# Per-iteration maximum rating change on a log axis, with the stopping
# tolerance drawn as a dashed reference line
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(convergence_history)
ax.axhline(y=CONFIG['tolerance'], color='r', linestyle='--',
           label=f"Tolerance ({CONFIG['tolerance']})")
ax.set_xlabel('Iteration')
ax.set_ylabel('Max Rating Change')
ax.set_title('NBA: Convergence of Iterative Strength Algorithm')
ax.legend()
ax.set_yscale('log')
fig.tight_layout()
plt.show()

## Phase 4: Validate Ratings vs Simple Metrics

In [None]:
# Per-team record, scoring margin and opponent list from the raw game table
team_stats = []
for team in all_teams:
    at_home = df[df['home_team'] == team]
    on_road = df[df['away_team'] == team]

    # Margins from this team's point of view
    home_margins = at_home['home_score'] - at_home['away_score']
    road_margins = on_road['away_score'] - on_road['home_score']

    n_games = len(at_home) + len(on_road)
    n_wins = (home_margins > 0).sum() + (road_margins > 0).sum()
    net_points = home_margins.sum() + road_margins.sum()

    team_stats.append({
        'team': team,
        'games': n_games,
        'wins': n_wins,
        'simple_win_pct': n_wins / n_games if n_games > 0 else 0,
        'point_diff_avg': net_points / n_games if n_games > 0 else 0,
        'network_rating': network_ratings.get(team, 0.5),
        'opponents': [*at_home['away_team'], *on_road['home_team']],
    })

df_teams = pd.DataFrame(team_stats)


def _mean_opponent_value(opponents, lookup):
    """Average ``lookup`` over the opponents, using 0.5 for unknown teams."""
    return np.mean([lookup.get(opponent, 0.5) for opponent in opponents])


# Strength of schedule: mean opponent win% and mean opponent network rating
win_pct_map = df_teams.set_index('team')['simple_win_pct'].to_dict()
df_teams['simple_sos'] = df_teams['opponents'].apply(_mean_opponent_value, args=(win_pct_map,))
df_teams['network_sos'] = df_teams['opponents'].apply(_mean_opponent_value, args=(network_ratings,))

# The opponent lists were only needed for SOS; order teams strongest first
df_teams = df_teams.drop(columns='opponents').sort_values('network_rating', ascending=False)
df_teams.head(10)

In [None]:
# Correlations between the network-derived metrics and their simple counterparts
corr_win_pct = df_teams['network_rating'].corr(df_teams['simple_win_pct'])
corr_point_diff = df_teams['network_rating'].corr(df_teams['point_diff_avg'])
corr_sos = df_teams['network_sos'].corr(df_teams['simple_sos'])

print("Correlation Analysis:")
print(f"  Network Rating vs Win%: {corr_win_pct:.3f}")
print(f"  Network Rating vs Point Diff: {corr_point_diff:.3f}")
print(f"  Network SOS vs Simple SOS: {corr_sos:.3f}")

# One scatter panel per comparison: (x column, y column, x label, y label, title)
panels = [
    ('simple_win_pct', 'network_rating', 'Simple Win %', 'Network Rating',
     f'Network Rating vs Win % (r={corr_win_pct:.3f})'),
    ('point_diff_avg', 'network_rating', 'Avg Point Differential', 'Network Rating',
     f'Network Rating vs Point Diff (r={corr_point_diff:.3f})'),
    ('simple_sos', 'network_sos', 'Simple SOS', 'Network SOS',
     f'Network SOS vs Simple SOS (r={corr_sos:.3f})'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for panel_ax, (x_col, y_col, x_label, y_label, title) in zip(axes, panels):
    panel_ax.scatter(df_teams[x_col], df_teams[y_col], alpha=0.7)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(title)

plt.tight_layout()
plt.show()

## Phase 5: Network-Based Tier Classification

In [None]:
def _tier_label(value, elite_cut, bottom_cut):
    """Label a value 'Elite' at/above the upper cut, 'Bottom' at/below the lower cut, else 'Mid'."""
    if value >= elite_cut:
        return 'Elite'
    if value <= bottom_cut:
        return 'Bottom'
    return 'Mid'


# Quartile cut points for each metric
q75_simple = df_teams['simple_win_pct'].quantile(0.75)
q25_simple = df_teams['simple_win_pct'].quantile(0.25)
q75_network = df_teams['network_rating'].quantile(0.75)
q25_network = df_teams['network_rating'].quantile(0.25)

df_teams['simple_tier'] = df_teams['simple_win_pct'].apply(
    _tier_label, args=(q75_simple, q25_simple)
)
df_teams['network_tier'] = df_teams['network_rating'].apply(
    _tier_label, args=(q75_network, q25_network)
)

print("Tier Classification Comparison:")
print(f"\nSimple thresholds: Elite >= {q75_simple:.3f}, Bottom <= {q25_simple:.3f}")
print(f"Network thresholds: Elite >= {q75_network:.3f}, Bottom <= {q25_network:.3f}")

# Cross-tabulate the two labelings (with row/column totals)
agreement = pd.crosstab(df_teams['simple_tier'], df_teams['network_tier'], margins=True)
print("\nTier Agreement (rows=Simple, cols=Network):")
print(agreement)

# Teams whose two labels differ
tiers_differ = df_teams['simple_tier'].ne(df_teams['network_tier'])
disagreements = df_teams.loc[tiers_differ]
print(f"\nTeams with different classification ({len(disagreements)}):\n")
report_columns = ['team', 'simple_win_pct', 'network_rating', 'simple_sos', 'network_sos',
                  'simple_tier', 'network_tier']
print(disagreements[report_columns].to_string(index=False))

## Phase 6: Multi-Hop Common Opponent Analysis

NBA's dense schedule allows deeper path exploration (max_hops=3).

In [None]:
def find_common_opponent_paths(G, team_a, team_b, max_hops=3):
    """
    List the simple paths from team_a to team_b that pass through at least
    one intermediate team, ignoring edge direction.

    Paths are limited to ``max_hops + 1`` edges. Returns an empty list when
    either team is not a node of G.
    """
    if team_a not in G or team_b not in G:
        return []

    undirected = G.to_undirected()
    found = []
    try:
        for route in nx.all_simple_paths(undirected, team_a, team_b, cutoff=max_hops + 1):
            # A two-node route is the direct matchup, not a common opponent
            if len(route) <= 2:
                continue
            found.append(route)
    except nx.NetworkXNoPath:
        pass

    return found

def _net_avg_margin(G, team, opponent):
    """Return team's per-game average signed margin vs opponent over both edge directions (0.0 if they never met)."""
    signed_margin = 0.0
    games = 0
    if G.has_edge(team, opponent):
        won = G[team][opponent]
        n = won.get('games', 1)
        signed_margin += won.get('avg_margin', 0) * n
        games += n
    if G.has_edge(opponent, team):
        lost = G[opponent][team]
        n = lost.get('games', 1)
        signed_margin -= lost.get('avg_margin', 0) * n
        games += n
    return signed_margin / games if games else 0.0


def evaluate_matchup_via_paths(G, team_a, team_b, ratings, max_hops=3):
    """
    Evaluate two teams through their common-opponent network.

    For every path between the teams (see find_common_opponent_paths), each
    intermediate team contributes both teams' average margin against it,
    weighted by the intermediate's rating. A team that has both beaten and
    lost to an intermediate (a split series) is scored on its net per-game
    margin; previously only the wins were counted in that case.

    Args:
        G: Directed winner -> loser graph whose edges carry 'avg_margin' and
            'games' attributes.
        team_a, team_b: The two teams to compare.
        ratings: Mapping team -> rating; missing teams default to 0.5.
        max_hops: Forwarded to find_common_opponent_paths.

    Returns:
        (edge, n_paths, path_details). ``edge`` is the rating-weighted margin
        difference (positive favors team_a), ``n_paths`` the number of paths
        found, and ``path_details`` one dict per scored path with keys
        'path', 'hops', 'a_score' and 'b_score'. Returns ``(None, 0, [])``
        when there are no paths or the total weight is zero.
    """
    paths = find_common_opponent_paths(G, team_a, team_b, max_hops)

    if not paths:
        return None, 0, []

    a_total_score = 0
    b_total_score = 0
    total_weight = 0
    path_details = []

    for path in paths:
        intermediates = path[1:-1]
        a_path_score = 0
        b_path_score = 0
        path_weight = 0

        for intermediate in intermediates:
            int_rating = ratings.get(intermediate, 0.5)

            # Results against stronger intermediates count for more
            a_path_score += _net_avg_margin(G, team_a, intermediate) * int_rating
            b_path_score += _net_avg_margin(G, team_b, intermediate) * int_rating

            path_weight += int_rating

        # Paths whose intermediates all have zero rating carry no information
        if path_weight > 0:
            a_total_score += a_path_score
            b_total_score += b_path_score
            total_weight += path_weight

            path_details.append({
                'path': ' -> '.join(path),
                'hops': len(path) - 2,
                'a_score': a_path_score / path_weight,
                'b_score': b_path_score / path_weight
            })

    if total_weight == 0:
        return None, 0, []

    edge = (a_total_score - b_total_score) / total_weight
    return edge, len(paths), path_details

# Example matchup: df_teams is sorted by network_rating (descending), so the
# first and last rows are the highest- and lowest-rated teams.
top_team = df_teams.iloc[0]['team']
bottom_team = df_teams.iloc[-1]['team']

edge, num_paths, details = evaluate_matchup_via_paths(G, top_team, bottom_team, network_ratings, CONFIG['max_hops'])

print(f"Matchup Analysis: {top_team} vs {bottom_team}")
print(f"Paths found: {num_paths}")
# evaluate_matchup_via_paths returns edge=None when no usable path exists;
# formatting None with ':.2f' would raise a TypeError.
if edge is None:
    print("Common opponent edge: n/a (no common-opponent paths found)")
else:
    favored = top_team if edge > 0 else bottom_team
    print(f"Common opponent edge: {edge:.2f} (favors {favored})")

# Path breakdown by hops
if details:
    path_df = pd.DataFrame(details)
    print(f"\nPaths by hop count:")
    print(path_df.groupby('hops').size())
    print(f"\nSample paths:")
    for d in details[:3]:
        print(f"  {d['path']} ({d['hops']}-hop)")

## Phase 7: Coverage Analysis by Network Tier

In [None]:
# Handicap sizes (points) to evaluate
handicaps = [0, 5, 7, 9, 11, 13]
tier_types = ['simple_tier', 'network_tier']
tier_order = ['Elite', 'Mid', 'Bottom']

# Per-game cover flags for each side once it is given h points
for h in handicaps:
    df[f'home_covers_{h}pt'] = (df['spread_result_difference'] + h) >= 0
    df[f'away_covers_{h}pt'] = df['spread_result_difference'] <= h

# Aggregate games and covers for every (handicap, tier scheme, tier) cell
results = []
for h in handicaps:
    home_col = f'home_covers_{h}pt'
    away_col = f'away_covers_{h}pt'

    for tier_type in tier_types:
        for tier in tier_order:
            tier_teams = df_teams[df_teams[tier_type] == tier]['team'].tolist()

            games_played = 0
            covered = 0
            for team in tier_teams:
                # Home appearances first, then away appearances
                home_flags = df.loc[df['home_team'] == team, home_col]
                games_played += len(home_flags)
                covered += home_flags.sum()

                away_flags = df.loc[df['away_team'] == team, away_col]
                games_played += len(away_flags)
                covered += away_flags.sum()

            results.append({
                'handicap': h,
                'tier_type': tier_type,
                'tier': tier,
                'games': games_played,
                'covers': covered,
                'cover_pct': covered / games_played if games_played > 0 else 0
            })

df_coverage = pd.DataFrame(results)

print("Coverage % by Tier Type and Handicap:")
print("=" * 60)

# One handicap x tier table per tier scheme, in percent
for tier_type in tier_types:
    print(f"\n{tier_type.replace('_', ' ').title()}:")
    of_type = df_coverage[df_coverage['tier_type'] == tier_type]
    pivot = of_type.pivot(index='handicap', columns='tier', values='cover_pct')[tier_order]
    print((pivot * 100).round(1))

In [None]:
# Side-by-side coverage curves: one panel per tier scheme, one line per tier
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

colors = {'Elite': '#2ecc71', 'Mid': '#f39c12', 'Bottom': '#e74c3c'}

for ax, tier_type in zip(axes, ['simple_tier', 'network_tier']):
    of_type = df_coverage[df_coverage['tier_type'] == tier_type]

    # The dict preserves insertion order, so tiers plot as Elite, Mid, Bottom
    for tier, shade in colors.items():
        curve = of_type[of_type['tier'] == tier]
        ax.plot(curve['handicap'], curve['cover_pct'] * 100,
                marker='o', label=tier, color=shade, linewidth=2)

    # 50% reference line
    ax.axhline(y=50, color='gray', linestyle='--', alpha=0.5)
    ax.set_xlabel('Handicap Points')
    ax.set_ylabel('Coverage %')
    ax.set_title(f'{tier_type.replace("_", " ").title()}')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.suptitle('NBA: Coverage by Tier Classification Method', fontsize=14)
plt.tight_layout()
plt.show()

## Summary & Recommendations

In [None]:
# Text recap of the results computed above
banner = "=" * 70
print(banner)
print("NBA NETWORK STRENGTH ANALYSIS: SUMMARY")
print(banner)

print("\n1. DATA")
print(f"   - Games analyzed: {len(df)}")
print(f"   - Teams: {len(all_teams)}")
print(f"   - Network edges: {G.number_of_edges()}")
print(f"   - Avg games per team: {(len(df)*2)/len(all_teams):.0f}")

print("\n2. NETWORK RATING VALIDATION")
print(f"   - Correlation with Win%: {corr_win_pct:.3f}")
print(f"   - Correlation with Point Diff: {corr_point_diff:.3f}")
print(f"   - Network SOS vs Simple SOS: {corr_sos:.3f}")

print("\n3. TIER CLASSIFICATION")
tier_agreement = (df_teams['simple_tier'] == df_teams['network_tier']).mean() * 100
print(f"   - Agreement rate: {tier_agreement:.1f}%")
print(f"   - Disagreements: {len(disagreements)} teams")

print("\n4. MULTI-HOP ANALYSIS")
print(f"   - Max hops: {CONFIG['max_hops']}")
print(f"   - Sample paths found: {num_paths}")

print("\n5. COVERAGE BY TIER (9pt handicap - NBA standard)")
# Only the 9-point rows are reported here
nine_point = df_coverage[df_coverage['handicap'] == 9]
for tier_type in ['simple_tier', 'network_tier']:
    print(f"   {tier_type.replace('_', ' ').title()}:")
    of_type = nine_point[nine_point['tier_type'] == tier_type]
    for tier in ['Elite', 'Mid', 'Bottom']:
        pct = of_type.loc[of_type['tier'] == tier, 'cover_pct'].values[0] * 100
        print(f"      {tier}: {pct:.1f}%")

In [None]:
# Write the per-team table to CSV in the results directory
export_columns = [
    'team', 'games', 'wins', 'simple_win_pct', 'network_rating',
    'point_diff_avg', 'simple_sos', 'network_sos',
    'simple_tier', 'network_tier',
]
export_df = df_teams[export_columns].copy()

results_dir = Path().resolve().parent / 'data' / 'results'
export_file = results_dir / 'nba_network_ratings.csv'
export_df.to_csv(export_file, index=False)
print(f"Exported team ratings to: {export_file}")

export_df.head(10)