# Assignment 5 — Task 1: Commentary Analysis

Abhinav Kumar
11/26/2025

In [1]:
from __future__ import annotations
import re
from dataclasses import dataclass, field
from typing import List, Dict, Tuple, Optional
from collections import defaultdict
import csv

In [2]:
def parse_time_any(s: str) -> Optional[int]:
    """Find the first ``M:SS`` or ``H:MM:SS`` timestamp in ``s`` and convert it to seconds.

    The timestamp may be wrapped in parentheses, e.g. ``(1:02:03)``.

    Args:
        s: Any string that may contain a clock-style timestamp.

    Returns:
        The timestamp as a whole number of seconds, or ``None`` when no
        timestamp is present.
    """
    found = re.search(r"\(?(\d{1,2}:)?\d{1,2}:\d{2}\)?", s.strip())
    if found is None:
        return None

    fields = [int(piece) for piece in found.group(0).strip("()").split(":")]
    if len(fields) not in (2, 3):
        return None

    # Fold left-to-right in base 60: [h, m, s] -> (h * 60 + m) * 60 + s.
    seconds = 0
    for value in fields:
        seconds = seconds * 60 + value
    return seconds

def fmt_time(sec: int) -> str:
    """Render a number of seconds as ``M:SS``, or ``H:MM:SS`` when the hour part is non-zero.

    Args:
        sec: Time in whole seconds.

    Returns:
        The formatted clock string.
    """
    total_minutes, seconds = divmod(sec, 60)
    hours, minutes = divmod(total_minutes, 60)
    if hours:
        return f"{hours}:{minutes:02d}:{seconds:02d}"
    return f"{minutes}:{seconds:02d}"


In [3]:
@dataclass
class Event:
    """A single timeline entry recorded for a player by analyze_commentary."""
    # Timestamp in seconds as produced by parse_time_any, or None when no
    # timestamp could be parsed for the commentary row.
    t: Optional[int]
    # The commentary text, or a synthetic "Reported total = N" message.
    text: str
    # Points credited for this entry; stays 0 for non-scoring notes.
    points: int = 0
    kind: str = "note"  # "score" or "note"

@dataclass
class PlayerStats:
    """Commentary-derived scoring total, timeline and notes for one roster player."""
    # Full player name exactly as listed in the roster.
    name: str
    # Running point total: detected scoring plays are added, and the total is
    # raised to any higher "has N points" figure quoted in the commentary.
    total: int = 0
    # Score/note events in the order the transcript rows were processed.
    timeline: List[Event] = field(default_factory=list)
    # Timestamp (seconds) of the first "first bucket / field goal" mention, if any.
    first_fg_time: Optional[int] = None
    # (timestamp in seconds or None, short label) pairs used in the summary text.
    notes: List[Tuple[Optional[int], str]] = field(default_factory=list)


In [4]:
# Ordered (pattern, points) scoring heuristics. detect_points returns the value
# of the FIRST rule that matches, so the more specific rules must stay ahead
# of the generic "makes/hits/scores" catch-all.
SCORING_RULES = [
    (re.compile(pattern, re.IGNORECASE), points)
    for pattern, points in (
        (r"\b(three|3-?pointer|from (?:deep|downtown))\b", 3),
        (r"\bfree throw\b", 1),
        (r"\b(layup|dunk|hook|floater|bank[s]?\s+it|midrange|baseline|puts it in|finishes)\b", 2),
        (r"\b(makes|hits|knocks down|buries|drills|scores)\b", 2),
    )
]

# "<player> has 12 points" -> running total quoted by the commentator.
HAS_POINTS_RE = re.compile(r"\bhas\s+(\d{1,2})\s+points?\b", re.IGNORECASE)
# Mentions of a player's first made basket.
FIRST_FG_RE = re.compile(r"\b(first|gets his first)\s+(bucket|field goal|fg)\b", re.IGNORECASE)
# "double team", "double-team", "double-teamed" (hyphen optional, no space variant).
DOUBLE_TEAM_RE = re.compile(r"\bdouble-?team(ed)?\b", re.IGNORECASE)
# Accolade / skill keywords; any match yields one "Accolade or skill mention" note.
ACCENTS_RE_LIST = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"\bMVP\b",
        r"\brebound(?:ing)?\b",
        r"\bfinishing\b|\baround the rim\b",
    )
]

def normalize_name_lookup(roster: List[str]) -> Dict[str, str]:
    """Build a lowercase alias -> full roster name lookup table.

    Every player is reachable by full name. Players with two or more name
    parts are also reachable by first name and by last name, but only when
    that short alias identifies exactly one player on the roster. An alias
    shared by several players (e.g. two players named "Thompson") is left out
    so that a play is never silently credited to the wrong player.

    Args:
        roster: Full player names. Empty or whitespace-only entries are ignored.

    Returns:
        Mapping from lowercase alias to the full name as written in ``roster``.
    """
    # Pass 1: record who claims each short alias, and which aliases are exact
    # names (full names and single-word names) that must always win.
    claimants: Dict[str, set] = {}
    exact_keys = set()
    for full in roster:
        full_lower = full.lower()
        parts = full_lower.split()
        if not parts:
            continue
        exact_keys.add(full_lower)
        if len(parts) == 1:
            exact_keys.add(parts[0])
            continue
        identity = " ".join(parts)
        for alias in (parts[0], parts[-1]):
            claimants.setdefault(alias, set()).add(identity)

    # Pass 2: fill the table in roster order, skipping ambiguous short aliases.
    lookup: Dict[str, str] = {}
    for full in roster:
        full_lower = full.lower()
        parts = full_lower.split()
        if not parts:
            continue
        lookup[full_lower] = full
        if len(parts) == 1:
            lookup[parts[0]] = full
            continue
        for alias in (parts[0], parts[-1]):
            if alias in exact_keys:
                # Never shadow another player's exact (single-word) name.
                continue
            if len(claimants[alias]) == 1:
                lookup[alias] = full
    return lookup

def find_named_player(text: str, name_lookup: Dict[str, str]) -> Optional[str]:
    """Return the full name of the first roster player mentioned in ``text``.

    Multi-word aliases (full names) are tried first, in the lookup's iteration
    order, using a case-insensitive substring test. If none is present, the
    text is split into alphabetic words and the first word that is a known
    alias decides the player.

    Args:
        text: One line of commentary.
        name_lookup: Lowercase alias -> full name mapping.

    Returns:
        The matched player's full name, or ``None`` when nobody is mentioned.
    """
    lowered = text.lower()

    full_name_hits = [
        player
        for alias, player in name_lookup.items()
        if " " in alias and alias in lowered
    ]
    if full_name_hits:
        return full_name_hits[0]

    word_hits = (
        name_lookup[word]
        for word in re.findall(r"[A-Za-z]+", lowered)
        if word in name_lookup
    )
    return next(word_hits, None)

def detect_points(text: str) -> int:
    """Return the point value of the first SCORING_RULES pattern found in ``text``.

    Rules are tried in list order; 0 is returned when none of them matches.
    """
    matching_values = (pts for rx, pts in SCORING_RULES if rx.search(text))
    return next(matching_values, 0)


In [5]:
def analyze_commentary(
    transcript: List[Dict[str, str]],
    roster: List[str]
) -> Dict[str, PlayerStats]:
    """Attribute commentary lines to roster players and accumulate their stats.

    Each row is credited to at most one player (the one find_named_player
    returns for its text). For that player the row may add notes (first field
    goal, double-teams, accolade/skill mention), a scoring event worth
    detect_points(text), and a "has N points" reported total, which raises
    the running total when it is higher.

    Args:
        transcript: Rows with "time" and "text" keys. A missing or ``None``
            value is treated as an empty string.
        roster: Full player names to track.

    Returns:
        Mapping from full player name to that player's PlayerStats. Every
        roster entry is present, including players never mentioned.
    """
    name_lookup = normalize_name_lookup(roster)
    per_player = {p: PlayerStats(p) for p in roster}

    for row in transcript:
        raw_time = row.get("time") or ""
        text = row.get("text") or ""

        # Compare against None explicitly: a timestamp of 0 seconds ("00:00")
        # is valid and must not fall through to the text-based fallback.
        t = parse_time_any(raw_time)
        if t is None:
            t = parse_time_any(text)

        p = find_named_player(text, name_lookup)
        if not p:
            continue
        stats = per_player[p]

        if FIRST_FG_RE.search(text):
            if stats.first_fg_time is None:
                stats.first_fg_time = t
            stats.notes.append((t, "First field goal"))

        if DOUBLE_TEAM_RE.search(text):
            stats.notes.append((t, "Facing double-teams"))

        # At most one accolade note per row, however many keywords match.
        if any(rx.search(text) for rx in ACCENTS_RE_LIST):
            stats.notes.append((t, "Accolade or skill mention"))

        pts = detect_points(text)
        if pts:
            stats.total += pts
            stats.timeline.append(Event(t, text, pts, "score"))

        m = HAS_POINTS_RE.search(text)
        if m:
            reported = int(m.group(1))
            # A quoted total only ever raises the tally; it never lowers it.
            if reported > stats.total:
                stats.total = reported
            stats.timeline.append(Event(t, f"Reported total = {reported}", 0, "note"))

    return per_player


In [6]:
def leaders_by_commentary(per_player: Dict[str, PlayerStats]) -> List[PlayerStats]:
    """Return every player tied for the highest positive point total.

    Args:
        per_player: Mapping from player name to stats.

    Returns:
        The leaders in the mapping's iteration order, or an empty list when
        the mapping is empty or nobody has more than zero points.
    """
    best = max((stats.total for stats in per_player.values()), default=0)
    if best <= 0:
        return []
    return [stats for stats in per_player.values() if stats.total == best]

def summarize_player(stats: PlayerStats) -> str:
    """Compose a one-paragraph, commentary-based summary for a player.

    The summary always opens with the point total, then optionally adds the
    first field goal time, a back-to-back-threes remark (first pair of
    consecutive timed three-pointers at most 90 seconds apart), and up to
    three notes ordered by timestamp (untimed notes sort last).

    Args:
        stats: The player's accumulated stats.

    Returns:
        The sentences joined by single spaces.
    """
    sentences: List[str] = [
        f"{stats.name}: {stats.total} points (based on commentary in the chosen segment; may be incomplete)."
    ]

    if stats.first_fg_time is not None:
        sentences.append(f"First field goal at {fmt_time(stats.first_fg_time)}.")

    timed_threes = [
        ev for ev in stats.timeline
        if ev.kind == "score" and ev.points == 3 and ev.t is not None
    ]
    for earlier, later in zip(timed_threes, timed_threes[1:]):
        if later.t - earlier.t <= 90:
            sentences.append(
                f"Hits back-to-back threes around {fmt_time(earlier.t)}–{fmt_time(later.t)}."
            )
            break

    def _note_order(entry):
        # Notes without a timestamp are pushed to the end.
        when = entry[0]
        return 10**9 if when is None else when

    for when, label in sorted(stats.notes, key=_note_order)[:3]:
        if when is None:
            sentences.append(f"{label}.")
        else:
            sentences.append(f"{label} ({fmt_time(when)}).")

    return " ".join(sentences)

def answer_top_scorer_query(transcript, roster) -> str:
    """Answer "who scored the most?" for the given transcript.

    Args:
        transcript: Commentary rows accepted by analyze_commentary.
        roster: Full player names to track.

    Returns:
        A header naming the leader(s) (alphabetical by name), one summary
        paragraph per leader, and a closing caveat; or a fixed message when
        no player has a positive total.
    """
    leaders = leaders_by_commentary(analyze_commentary(transcript, roster))
    if not leaders:
        return "The commentary in this segment does not clearly establish a leading scorer."

    ranked = sorted(leaders, key=lambda s: s.name)
    labels = [f"{s.name} ({s.total} points)" for s in ranked]
    summaries = [summarize_player(s) for s in ranked]

    return "".join([
        "Top scorer(s) by commentary in this segment: ",
        ", ".join(labels),
        "\n\n",
        "\n\n".join(summaries),
        "\n\nNote: This analysis uses commentary only and may not match the official box score.",
    ])

def answer_player_query(transcript, roster, player_name: str) -> str:
    """Summarise one player, looked up by full name or last name.

    Matching ignores case, surrounding whitespace, and leading/trailing
    punctuation or quotes, so a name taken from free text such as
    ``"lebron james?"`` still resolves. The first roster entry that matches
    wins.

    Args:
        transcript: Commentary rows accepted by analyze_commentary.
        roster: Full player names to track.
        player_name: The name as typed by the user.

    Returns:
        The player's summary, or a "no data" message when nobody matches.
    """
    def _norm(value: str) -> str:
        # Trim whitespace, then sentence punctuation/quotes, then fold case.
        return value.strip().strip(".,!?;:'\"").strip().lower()

    wanted = _norm(player_name)
    per_player = analyze_commentary(transcript, roster)

    target = None
    for name, stats in per_player.items():
        pieces = name.split()
        # Guard against an empty roster entry, which has no last name.
        last_name = pieces[-1] if pieces else ""
        if _norm(name) == wanted or _norm(last_name) == wanted:
            target = stats
            break

    if target is None:
        return f"No clear commentary-based data found for {player_name} in this segment."
    return summarize_player(target)

def answer_query(query: str, transcript, roster) -> str:
    """Route a free-text question to the matching answer function.

    Leading-scorer phrasing is checked first, so a query containing both a
    scorer phrase and "analyze the player ..." is answered as a top-scorer
    question.

    Args:
        query: The user's question (matched case-insensitively).
        transcript: Commentary rows accepted by analyze_commentary.
        roster: Full player names to track.

    Returns:
        The answer text, or a fixed message for unsupported questions.
    """
    lowered = query.lower()

    scorer_phrases = ("scored the most", "leading scorer", "top scorer")
    if any(phrase in lowered for phrase in scorer_phrases):
        return answer_top_scorer_query(transcript, roster)

    hit = re.search(r"analyze\s+the\s+player\s+(.+)", lowered)
    if hit is None:
        return "I only support queries about the leading scorer or 'analyze the player <name>' for now."
    return answer_player_query(transcript, roster, hit.group(1).strip())


In [7]:
# Full names of the players the analysis recognises in the commentary.
# The list is in two groups, presumably one per team (Cleveland first, then
# Golden State) — verify against the official rosters.
# Note: two entries share the surname "Thompson", so a bare "Thompson" in the
# commentary does not identify a single player.
ROSTER = [
    "LeBron James",
    "Kyrie Irving",
    "Kevin Love",
    "JR Smith",
    "Tristan Thompson",
    "Richard Jefferson",
    "Iman Shumpert",
    "Matthew Dellavedova",
    "Channing Frye",

    "Stephen Curry",
    "Klay Thompson",
    "Draymond Green",
    "Harrison Barnes",
    "Andre Iguodala",
    "Shaun Livingston",
    "Festus Ezeli",
    "Leandro Barbosa",
    "Andrew Bogut",
]


In [8]:
def load_game7_csv(path: str):
    """Read the commentary CSV into a list of ``{"time", "text"}`` rows.

    Column 0 is used as the timestamp and column 3 as the commentary text,
    both stripped of surrounding whitespace. Rows with fewer than four
    columns, or with empty text, are dropped.

    Args:
        path: Location of the UTF-8 encoded CSV file.

    Returns:
        The kept rows, in file order.
    """
    with open(path, newline='', encoding="utf-8") as handle:
        stripped_pairs = (
            (record[0].strip(), record[3].strip())
            for record in csv.reader(handle)
            if len(record) >= 4
        )
        # Materialise inside the ``with`` block, while the file is still open.
        return [
            {"time": stamp, "text": line}
            for stamp, line in stripped_pairs
            if line
        ]

# Transcript CSV; a relative path, so it is resolved against the kernel's
# current working directory.
csv_path = "[FULL GAME] Cleveland Cavaliers vs. Golden State Warriors  2016 NBA Finals Game 7  NBA on ESPN.csv"

# Load every usable row, then display the row count and the first five rows.
TRANSCRIPT = load_game7_csv(csv_path)
len(TRANSCRIPT), TRANSCRIPT[:5]


(927,
 [{'time': '00:01',
   'text': 'welcome to Oakland California for Game seven of the 2016 NBA Finals where the'},
  {'time': '00:08',
   'text': 'Golden State Warriors had a comfortable 3-1 series lead two straight losses at their repeat'},
  {'time': '00:14',
   'text': 'hopes in some jeopardy meanwhile in Cleveland the Cavalier fans have hope'},
  {'time': '00:21',
   'text': 'again that this could be the year that the title drought comes to an end'},
  {'time': '00:26',
   'text': 'LeBron James and the Cavs with victories in games five and six to even this series at seven names of peace can they'}])

In [9]:
def filter_transcript_by_time(transcript, start_sec: int, end_sec: int):
    """Keep the rows whose "time" falls inside ``[start_sec, end_sec]`` (inclusive).

    Rows whose "time" value cannot be parsed by parse_time_any are dropped.

    Args:
        transcript: Rows with a "time" key.
        start_sec: Lower bound in seconds.
        end_sec: Upper bound in seconds.

    Returns:
        A new list holding the matching rows (the same dict objects), in
        their original order.
    """
    def _within_window(row) -> bool:
        stamp = parse_time_any(row["time"])
        return stamp is not None and start_sec <= stamp <= end_sec

    return [row for row in transcript if _within_window(row)]

# Restrict the analysis to rows timestamped within the first ten minutes
# (0 to 600 seconds, both ends inclusive), then display the row count and the
# first five rows of the segment.
TRANSCRIPT_SEGMENT = filter_transcript_by_time(TRANSCRIPT, 0, 10 * 60)
len(TRANSCRIPT_SEGMENT), TRANSCRIPT_SEGMENT[:5]


(153,
 [{'time': '00:01',
   'text': 'welcome to Oakland California for Game seven of the 2016 NBA Finals where the'},
  {'time': '00:08',
   'text': 'Golden State Warriors had a comfortable 3-1 series lead two straight losses at their repeat'},
  {'time': '00:14',
   'text': 'hopes in some jeopardy meanwhile in Cleveland the Cavalier fans have hope'},
  {'time': '00:21',
   'text': 'again that this could be the year that the title drought comes to an end'},
  {'time': '00:26',
   'text': 'LeBron James and the Cavs with victories in games five and six to even this series at seven names of peace can they'}])

In [10]:
# Demo 1: leading-scorer question. The phrase "scored the most" routes the
# query to answer_top_scorer_query. The printed label and the query string
# passed below differ slightly ("... in this game segment." vs "... in this game").
print("Q: Analyze the player that scored the most in this game segment.\n")
print(answer_query(
    "Analyze the player that scored the most in this game",
    TRANSCRIPT_SEGMENT,   # or TRANSCRIPT
    ROSTER
))

Q: Analyze the player that scored the most in this game segment.

Top scorer(s) by commentary in this segment: Harrison Barnes (3 points), Kevin Love (3 points), Klay Thompson (3 points)

Harrison Barnes: 3 points (based on commentary in the chosen segment; may be incomplete).

Kevin Love: 3 points (based on commentary in the chosen segment; may be incomplete). Accolade or skill mention (7:50).

Klay Thompson: 3 points (based on commentary in the chosen segment; may be incomplete).

Note: This analysis uses commentary only and may not match the official box score.


In [11]:
# Demo 2: single-player question. "analyze the player <name>" routes the
# query to answer_player_query with the lower-cased name.
print("Q: Analyze the player LeBron James.\n")
print(answer_query(
    "Analyze the player LeBron James",
    TRANSCRIPT_SEGMENT,   # or TRANSCRIPT
    ROSTER
))


Q: Analyze the player LeBron James.

LeBron James: 2 points (based on commentary in the chosen segment; may be incomplete). Accolade or skill mention (7:16).
