In [1]:
import polars as pl
# import preprocessing

# Read the weekly presnap CSVs for weeks 1-9 (the literal "NA" is parsed as
# null) and stack them vertically into a single frame.
weeks = [
    pl.read_csv(f"/Users/zekeweng/Dropbox/BigDataBowl/presnap/week_{week}.csv", null_values=["NA"])
    for week in range(1, 10)
]

df = pl.concat(weeks)

# offense = df.filter(pl.col("club") == pl.col("possessionTeam"))
# defense = df.filter(pl.col("club") == pl.col("defensiveTeam"))

### Man vs. Zone Coverage (`pff_manZone`): Play Counts and Class Weights

In [9]:
# Reduce to one row per play (taking the first pff_manZone value seen for the
# play), then count how many plays carry each label.
coverage_counts = (
    df.group_by(["gameId", "playId"])
    .agg(pl.col("pff_manZone").first())
    .group_by("pff_manZone")
    .agg(count=pl.len())
    .sort("pff_manZone")
)

print(coverage_counts)

total_samples = coverage_counts.get_column("count").sum()
num_classes = coverage_counts.height

# weight = total / (num_classes * class_count), rounded to two decimals;
# scaled_weight doubles that rounded value and rounds again.
weights = (
    coverage_counts
    .with_columns(weight=(total_samples / (num_classes * pl.col("count"))).round(2))
    .with_columns(scaled_weight=(pl.col("weight") * 2).round(2))
    .select("pff_manZone", "scaled_weight")
)

print(weights)

shape: (2, 2)
┌─────────────┬───────┐
│ pff_manZone ┆ count │
│ ---         ┆ ---   │
│ str         ┆ u32   │
╞═════════════╪═══════╡
│ Man         ┆ 3460  │
│ Zone        ┆ 9761  │
└─────────────┴───────┘
shape: (2, 2)
┌─────────────┬───────────────┐
│ pff_manZone ┆ scaled_weight │
│ ---         ┆ ---           │
│ str         ┆ f64           │
╞═════════════╪═══════════════╡
│ Man         ┆ 3.82          │
│ Zone        ┆ 1.36          │
└─────────────┴───────────────┘


### Types of Pass Coverage

In [10]:
# One row per play, labelled with the first defCoverage value seen for it.
per_play_coverage = df.group_by(["gameId", "playId"]).agg(
    pl.col("defCoverage").first()
)

# Number of plays per coverage label, sorted by label.
coverage_counts = (
    per_play_coverage
    .group_by("defCoverage")
    .agg(pl.len().alias("count"))
    .sort("defCoverage")
)

print(coverage_counts)

total_samples = coverage_counts.get_column("count").sum()
num_classes = coverage_counts.height

# total / (num_classes * class_count), rounded to two decimals before it is
# doubled and rounded a second time.
rounded_weight = (total_samples / (num_classes * pl.col("count"))).round(2)

weights = coverage_counts.select(
    pl.col("defCoverage"),
    (rounded_weight * 2).round(2).alias("scaled_weight"),
)

print(weights)


shape: (6, 2)
┌─────────────┬───────┐
│ defCoverage ┆ count │
│ ---         ┆ ---   │
│ str         ┆ u32   │
╞═════════════╪═══════╡
│ Cover-0     ┆ 492   │
│ Cover-1     ┆ 2968  │
│ Cover-2     ┆ 1702  │
│ Cover-3     ┆ 5010  │
│ Cover-6     ┆ 1243  │
│ Quarters    ┆ 1806  │
└─────────────┴───────┘
shape: (6, 2)
┌─────────────┬───────────────┐
│ defCoverage ┆ scaled_weight │
│ ---         ┆ ---           │
│ str         ┆ f64           │
╞═════════════╪═══════════════╡
│ Cover-0     ┆ 8.96          │
│ Cover-1     ┆ 1.48          │
│ Cover-2     ┆ 2.58          │
│ Cover-3     ┆ 0.88          │
│ Cover-6     ┆ 3.54          │
│ Quarters    ┆ 2.44          │
└─────────────┴───────────────┘


### Types of Offensive Plays (run / pass)

In [12]:
# Label each play with the first runPass value found among its rows, then
# count plays per label.
play_labels = df.group_by(["gameId", "playId"]).agg(pl.col("runPass").first())
offensive_plays = (
    play_labels
    .group_by("runPass")
    .agg(count=pl.len())
    .sort("runPass")
)

print(offensive_plays)

total_samples = offensive_plays.get_column("count").sum()
num_classes = offensive_plays.height

# Two-step rounding is intentional here: the per-class weight is rounded to
# two decimals first, and the doubled value is rounded again.
weights = (
    offensive_plays
    .with_columns(
        (total_samples / (num_classes * pl.col("count"))).round(2).alias("weight")
    )
    .select(
        "runPass",
        (pl.col("weight") * 2).round(2).alias("scaled_weight"),
    )
)

print(weights)


shape: (2, 2)
┌─────────┬───────┐
│ runPass ┆ count │
│ ---     ┆ ---   │
│ str     ┆ u32   │
╞═════════╪═══════╡
│ PASS    ┆ 8190  │
│ RUN     ┆ 5031  │
└─────────┴───────┘
shape: (2, 2)
┌─────────┬───────────────┐
│ runPass ┆ scaled_weight │
│ ---     ┆ ---           │
│ str     ┆ f64           │
╞═════════╪═══════════════╡
│ PASS    ┆ 1.62          │
│ RUN     ┆ 2.62          │
└─────────┴───────────────┘


### Presnap Motion and Shift Rates by Defensive Coverage (`defCoverage`)

In [None]:
# Step 1: collapse the rows of each play into a single row. Each flag becomes
# the max over the play's rows, and the play keeps the first defCoverage
# value seen for it.
# NOTE(review): assumes the three flags are boolean/0-1 per row, so the max
# means "any row was flagged" -- confirm against the preprocessing step.
grouped = df.group_by(["gameId", "playId"], maintain_order=True).agg([
    pl.col("defCoverage").first().alias("defCoverage"),
    pl.col("inMotionAtBallSnap").max().alias("inMotionAtBallSnap"),
    pl.col("shiftSinceLineset").max().alias("shiftSinceLineset"),
    pl.col("motionSinceLineset").max().alias("motionSinceLineset"),
])

# Step 2: average the per-play flags within each coverage. This aggregates
# `grouped`, not the raw `df`, so every play counts exactly once; averaging
# raw rows would weight each play by how many rows it has.
grouped.group_by("defCoverage", maintain_order=True).agg([
    pl.col("inMotionAtBallSnap").mean().alias("inMotionAtBallSnap"),
    pl.col("shiftSinceLineset").mean().alias("shiftSinceLineset"),
    pl.col("motionSinceLineset").mean().alias("motionSinceLineset"),
])

### Number of Presnap Frames (per play)

In [None]:
import polars as pl
import matplotlib.pyplot as plt

# Largest frameId recorded for each play, ordered by game and play.
result = (
    df.group_by(["gameId", "playId"])
    .agg(max_frameId=pl.col("frameId").max())
    .sort(["gameId", "playId"])
)

max_frame_ids = result.get_column("max_frameId").to_pandas()

# Histogram of the per-play maximum frameId, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(max_frame_ids, bins=200, edgecolor="black", alpha=0.7)
ax.set_xlabel("Frames", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
ax.set_title("Frequency of Presnap Frames per Play", fontsize=14)
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

In [None]:
# /Users/zekeweng/Dropbox/BigDataBowl/Model/Features/features_test.pt