In [1]:
import subprocess
import os
from pathlib import Path

print("=" * 80)
print("CELL 1: CLONING GITHUB REPOSITORY")
print("=" * 80)

repo_url = "https://github.com/aravinds-kannappan/nfl-veteran-transition.git"
repo_name = "nfl-veteran-transition"

print(f"\n▶ Cloning repository: {repo_url}\n")

# This cell changes the working directory into the repo. On a re-run the
# relative path `repo_name` would no longer exist (we are already inside it),
# which would trigger a second, nested clone. Detect that case up front so the
# cell stays idempotent.
already_inside_repo = Path.cwd().name == repo_name and Path(".git").is_dir()

if already_inside_repo:
    print("✓ Repository already exists")
elif os.path.exists(repo_name):
    print("✓ Repository already exists")
    os.chdir(repo_name)
else:
    result = subprocess.run(
        ["git", "clone", repo_url],
        capture_output=True,
        text=True
    )
    if result.returncode != 0:
        print(f"✗ Error: {result.stderr}")
        # Fail here with git's own message instead of letting the chdir below
        # raise an unrelated FileNotFoundError.
        raise RuntimeError(f"git clone failed with return code {result.returncode}")
    print("✓ Repository cloned successfully")
    os.chdir(repo_name)

print(f"✓ Working directory: {os.getcwd()}\n")

# Create output directories (relative to the repository root we are now in)
output_dirs = ["outputs", "outputs/models", "outputs/analysis", "outputs/figures"]
for dir_path in output_dirs:
    Path(dir_path).mkdir(parents=True, exist_ok=True)

print("✓ Output directories created")
print("\n✓ CELL 1 COMPLETE\n")


CELL 1: CLONING GITHUB REPOSITORY

▶ Cloning repository: https://github.com/aravinds-kannappan/nfl-veteran-transition.git

✓ Repository cloned successfully
✓ Working directory: /content/nfl-veteran-transition

✓ Output directories created

✓ CELL 1 COMPLETE



In [2]:
import importlib
import subprocess
import sys

print("=" * 80)
print("CELL 2: INSTALLING DEPENDENCIES")
print("=" * 80)

# Names as pip knows them.
packages = [
    "pandas",
    "numpy",
    "scikit-learn",
    "matplotlib",
    "seaborn",
    "xgboost",
    "lightgbm"
]

# The pip distribution name is not always the importable module name:
# "scikit-learn" is imported as "sklearn", so a plain "-" -> "_" rewrite
# never finds it and triggers a reinstall on every run.
import_names = {"scikit-learn": "sklearn"}

print("\n▶ Installing required packages...\n")

for package in packages:
    module_name = import_names.get(package, package.replace("-", "_"))
    try:
        importlib.import_module(module_name)
        print(f"✓ {package} already installed")
    except ImportError:
        print(f"  Installing {package}...")
        # Run pip through the kernel's interpreter so the package lands in the
        # environment this notebook actually imports from.
        install = subprocess.run(
            [sys.executable, "-m", "pip", "install", "-q", package],
            capture_output=True,
            text=True
        )
        if install.returncode == 0:
            print(f"✓ {package} installed")
        else:
            # Report the failure instead of claiming success.
            print(f"✗ {package} failed to install:\n{install.stderr}")

print("\n✓ CELL 2 COMPLETE\n")

CELL 2: INSTALLING DEPENDENCIES

▶ Installing required packages...

✓ pandas already installed
✓ numpy already installed
  Installing scikit-learn...
✓ scikit-learn installed
✓ matplotlib already installed
✓ seaborn already installed
✓ xgboost already installed
✓ lightgbm already installed

✓ CELL 2 COMPLETE



In [12]:
print("=" * 80, "CELL 3: VERIFYING DATA & SETUP", "=" * 80, sep="\n")

import pandas as pd

csv_path = "data/processed/nfl_panel_for_python.csv"

print(f"\n▶ Loading data from: {csv_path}\n")

# Best-effort load: any failure is reported and the cell carries on, so the
# script check that follows still runs even without the data file.
try:
    df = pd.read_csv(csv_path)
    print("✓ Data loaded successfully")

    # Quick health summary of the loaded frame.
    print(f"  • Rows: {len(df.index):,}")
    print(f"  • Columns: {df.shape[1]}")
    print(f"  • Memory: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    print(f"  • Missing values: {df.isna().sum().sum()}")
    print(f"  • Duplicates: {df.duplicated().sum()}")

    print("\n✓ Sample data:")
    print(df.head())

except Exception as exc:
    print(f"✗ Error: {exc}")

# Check that every pipeline script is present relative to the working directory
print("\n▶ Verifying Python scripts:\n")

scripts = [
    f"src/python/{stem}.py"
    for stem in (
        "data_collection",
        "preprocessing",
        "feature_engineering",
        "modeling",
        "main",
    )
]

# Stays True only if every script is found.
all_exist = True
for script in scripts:
    if not os.path.exists(script):
        all_exist = False
        print(f"✗ {script} NOT FOUND")
        continue
    print(f"✓ {script}")

print("\n✓ All scripts found!" if all_exist else "\n✗ Some scripts missing!")

print("\n✓ CELL 3 COMPLETE\n")

CELL 3: VERIFYING DATA & SETUP

▶ Loading data from: data/processed/nfl_panel_for_python.csv

✓ Data loaded successfully
  • Rows: 599
  • Columns: 28
  • Memory: 0.28 MB
  • Missing values: 1170
  • Duplicates: 0

✓ Sample data:
      gsis_id  season  metric_raw  volume  success_rate  total_epa team  \
0  00-0019596    2019    0.021802     642           NaN        NaN   NE   
1  00-0019596    2021    0.184091     742           NaN        NaN   TB   
2  00-0019596    2022    0.054289     761           NaN        NaN   TB   
3  00-0021206    2015    0.011931     319           NaN        NaN  CLE   
4  00-0021206    2017    0.010371     437           NaN        NaN  NYJ   

  position_group  metric_secondary  mean_season  ...  new_team  rel_time  \
0             QB          0.419003     0.050641  ...        TB        -2   
1             QB          0.506739     0.014965  ...        TB         0   
2             QB          0.473062     0.004703  ...        TB         1   
3             Q

In [22]:
import subprocess
import sys
from pathlib import Path
from datetime import datetime
import traceback
import json
import os

# Export the dataset location through CSV_PATH; child processes inherit it
# (presumably the pipeline scripts read it — confirm in src/python).
os.environ["CSV_PATH"] = "data/processed/nfl_panel_for_python.csv"

# Pipeline stages as (script path, human-readable description), in run order
PYTHON_SCRIPTS = [
    ("src/python/data_collection.py", "Data Collection & Validation"),
    ("src/python/preprocessing.py", "Data Preprocessing & Standardization"),
    ("src/python/feature_engineering.py", "Feature Engineering"),
    ("src/python/modeling.py", "Predictive Modeling & Analysis"),
]

# Destinations reserved for the execution log and the run summary
LOG_FILE = Path("outputs", "pipeline_execution.log")
SUMMARY_FILE = Path("outputs", "pipeline_summary.json")

# Make sure the directory holding both files exists
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

print("=" * 80, "CELL 4: RUNNING COMPLETE PIPELINE", "=" * 80, sep="\n")

# Report whether the input data is where the pipeline expects it
csv_path = Path("data/processed/nfl_panel_for_python.csv")
if not csv_path.exists():
    print(f"\n✗ Data not found: {csv_path}")
    print("Make sure the CSV file is in the correct location!")
else:
    print(f"\n✓ Data found: {csv_path}")

def run_script(script_name, description, idx, total, timeout=3600):
    """Run one pipeline script in a child Python process and report the outcome.

    The child's stdout and stderr are captured and printed once it finishes.

    Args:
        script_name: Path of the script to run.
        description: Human-readable stage name used in the printed banners.
        idx: Position of this script in the pipeline (shown in the banner).
        total: Number of scripts in the pipeline (shown in the banner).
        timeout: Seconds to wait before the child process is killed.
            Defaults to 3600 (one hour).

    Returns:
        A ``(success, elapsed_seconds)`` tuple. ``success`` is True only when
        the script exits with return code 0. ``elapsed_seconds`` is 0 when the
        script is missing or could not be launched, and equals ``timeout``
        when the script timed out.
    """
    script_path = Path(script_name)

    print(f"\n{'=' * 80}")
    print(f"SCRIPT {idx}/{total}: {description}")
    print(f"{'=' * 80}\n")

    if not script_path.exists():
        print(f"✗ Script not found: {script_path}")
        return False, 0

    start_time = datetime.now()

    try:
        # sys.executable keeps the child on the same interpreter as the kernel.
        result = subprocess.run(
            [sys.executable, str(script_path)],
            capture_output=True,
            text=True,
            timeout=timeout
        )

        elapsed = (datetime.now() - start_time).total_seconds()

        # Print output
        if result.stdout:
            print(result.stdout)

        if result.stderr:
            print(f"STDERR:\n{result.stderr}")

        if result.returncode == 0:
            print(f"\n✓ {description} completed in {elapsed:.2f}s")
            return True, elapsed
        else:
            print(f"\n✗ {description} failed with return code {result.returncode}")
            return False, elapsed

    except subprocess.TimeoutExpired as exc:
        # Show whatever the script printed before it was killed. The captured
        # stream may be bytes here even though text=True was requested.
        partial_output = exc.stdout
        if isinstance(partial_output, bytes):
            partial_output = partial_output.decode(errors="replace")
        if partial_output:
            print(partial_output)
        print(f"✗ {description} timed out after {timeout}s")
        return False, timeout

    except Exception as e:
        # Best-effort: report launch problems instead of aborting the notebook.
        print(f"✗ Error: {str(e)}")
        return False, 0

# Execute scripts in order, stopping at the first failure
results = []
execution_times = {}

for idx, (script_name, description) in enumerate(PYTHON_SCRIPTS, 1):
    success, elapsed = run_script(script_name, description, idx, len(PYTHON_SCRIPTS))
    results.append((script_name, success, description))
    execution_times[script_name] = elapsed

    if not success:
        print(f"\nStopping pipeline due to {script_name} failure")
        break

# Scripts that never ran because an earlier one failed
skipped = PYTHON_SCRIPTS[len(results):]

# Calculate total time
total_time = sum(execution_times.values())

# Print summary
print("\n" + "=" * 80)
print("PIPELINE EXECUTION SUMMARY")
print("=" * 80)

successful = [r for r in results if r[1]]
failed = [r for r in results if not r[1]]

# Count against the full pipeline, not just the scripts that were attempted,
# so an early abort does not read as "0/1" with the remaining stages hidden.
print(f"\n✓ Successful: {len(successful)}/{len(PYTHON_SCRIPTS)}")
for script_name, success, description in successful:
    elapsed = execution_times.get(script_name, 0)
    print(f"  • {description} ({elapsed:.2f}s)")

if failed:
    print(f"\n✗ Failed: {len(failed)}/{len(PYTHON_SCRIPTS)}")
    for script_name, success, description in failed:
        print(f"  ✗ {description}")

if skipped:
    print(f"\n⚠ Skipped: {len(skipped)}/{len(PYTHON_SCRIPTS)}")
    for script_name, description in skipped:
        print(f"  - {description}")

print(f"\nTotal Pipeline Time: {total_time:.2f}s ({total_time/60:.2f} minutes)")

# all([]) is True, so an empty run must not be reported as a success.
all_successful = bool(results) and all(r[1] for r in results)

if all_successful:
    print("\n✓ PIPELINE COMPLETED SUCCESSFULLY!")
else:
    print("\n✗ PIPELINE COMPLETED WITH FAILURES")

# Persist the run summary to the file reserved for it in the configuration.
summary = {
    "finished_at": datetime.now().isoformat(),
    "all_successful": all_successful,
    "total_time_seconds": total_time,
    "scripts": [
        {
            "script": name,
            "description": desc,
            "success": ok,
            "elapsed_seconds": execution_times.get(name, 0),
        }
        for name, ok, desc in results
    ],
    "skipped": [name for name, _ in skipped],
}
SUMMARY_FILE.write_text(json.dumps(summary, indent=2), encoding="utf-8")
print(f"\n✓ Summary written to {SUMMARY_FILE}")

print("=" * 80)
print("\n✓ CELL 4 COMPLETE\n")

CELL 4: RUNNING COMPLETE PIPELINE

✓ Data found: data/processed/nfl_panel_for_python.csv

SCRIPT 1/4: Data Collection & Validation

PYTHON DATA LOADING AND VALIDATION PIPELINE

1. Loading nfl_panel_for_python.csv...
   ✓ Successfully loaded: 599 rows × 28 columns

2. Inspecting data structure...

   Column names and types:
     • gsis_id: object
     • season: int64
     • metric_raw: float64
     • volume: int64
     • success_rate: float64
     • total_epa: float64
     • team: object
     • position_group: object
     • metric_secondary: float64
     • mean_season: float64
     • sd_season: float64
     • z_score: float64
     • years_exp: int64
     • age: float64
     • prev_team: object
     • changed_team: int64
     • years_since_change: int64
     • transition_season: int64
     • new_team: object
     • rel_time: int64
     • post_transition: int64
     • phase: object
     • team_pos_avg: float64
     • team_quality: float64
     • team_pass_epa_per_play: float64
     • team

In [23]:
print("=" * 80)
print("CELL 5: DISPLAYING RESULTS")
print("=" * 80)

import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path


def _print_csv_table(csv_file, missing_message, max_rows=None):
    """Print a CSV file as an index-free text table.

    Args:
        csv_file: Path of the CSV file to show.
        missing_message: Text printed when the file does not exist.
        max_rows: If given, only the first ``max_rows`` rows are printed.

    Returns:
        The full DataFrame read from ``csv_file``, or None when it is missing.
    """
    if not csv_file.exists():
        print(missing_message)
        return None
    table = pd.read_csv(csv_file)
    shown = table if max_rows is None else table.head(max_rows)
    print("\n" + shown.to_string(index=False))
    print()
    return table


results_dir = Path("outputs/analysis")

# Check what files were created
print("\n▶ Output files created:\n")

if results_dir.exists():
    for file in sorted(results_dir.glob("*.csv")):
        size = file.stat().st_size / 1024
        print(f"✓ {file.name} ({size:.1f} KB)")
else:
    print("✗ No analysis directory found")

# Load and display model comparison
print("\n" + "=" * 80)
print("MODEL COMPARISON")
print("=" * 80)

# Bound to None when the file is absent, so a re-run never leaves a stale
# table from an earlier execution in the namespace.
model_comp_path = results_dir / "model_comparison.csv"
model_comparison = _print_csv_table(model_comp_path, "Model comparison file not found")

# Load and display feature importance (first 15 rows of the file, in file order)
print("=" * 80)
print("TOP 15 IMPORTANT FEATURES")
print("=" * 80)

feature_imp_path = results_dir / "feature_importance.csv"
feature_importance = _print_csv_table(
    feature_imp_path, "Feature importance file not found", max_rows=15
)

# List generated figures (file names only; the images are not rendered here)
print("=" * 80)
print("VISUALIZATIONS")
print("=" * 80)

figs_dir = Path("outputs/figures")
if figs_dir.exists():
    figures = sorted(figs_dir.glob("*.png"))
    print(f"\n✓ {len(figures)} figures generated:\n")

    for fig in figures:
        print(f"  • {fig.name}")
else:
    # Mirror the analysis-directory check above instead of printing nothing.
    print("✗ No figures directory found")

print("\n✓ CELL 5 COMPLETE\n")

CELL 5: DISPLAYING RESULTS

▶ Output files created:

✓ feature_importance.csv (0.4 KB)
✓ feature_importance_global.csv (0.5 KB)
✓ feature_importance_secondary.csv (0.5 KB)
✓ model_comparison.csv (0.3 KB)
✓ predictions.csv (26.6 KB)
✓ predictions_research_question.csv (5.3 KB)
✓ predictions_secondary.csv (23.0 KB)

MODEL COMPARISON

            Model       R2     RMSE      MAE
          XGBoost 0.280444 1.053901 0.844994
            Ridge 0.263358 1.066340 0.870440
            Lasso 0.257215 1.070776 0.864993
    Random Forest 0.256035 1.071627 0.869984
Gradient Boosting 0.248362 1.077139 0.881580

TOP 15 IMPORTANT FEATURES

               feature  importance
          team_pos_avg    0.293631
             total_epa    0.128901
                   age    0.101058
team_pass_epa_per_play    0.092759
          team_quality    0.060471
            metric_raw    0.058338
                volume    0.052367
           mean_season    0.046496
          success_rate    0.042894
team_rush_epa_per_