<a href="https://colab.research.google.com/github/ank2809/NLGS-BoxScore/blob/main/NLGS_BoxScore_ank2145.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Only if running in Colab




In [None]:
# Mount Google Drive inside the Colab runtime so the dataset files stored on
# Drive can be opened by the cells below. Skip this cell outside of Colab.
from google.colab import drive
drive.mount('/content/drive')

### **1. Data Processing**

All cells in this section have to be run for the model to work.

## 1.1 Reading the Data

The source can be 'rotowire' (professional summaries) or 'sbnation' (fan-written summaries). I primarily used RotoWire.

The path points to the directory containing the data.

In [None]:
import json

In [None]:
def read_data(source, path):
  """Load the train/validation/test splits of a box-score corpus.

  Args:
    source: dataset name, one of 'rotowire' or 'sbnation'. It is also the
      name of the sub-directory that holds the three JSON files.
    path: directory containing the `source` sub-directory, e.g.
      'drive/MyDrive/'. The trailing slash is optional.

  Returns:
    A (data_train, data_valid, data_test) tuple with the parsed JSON content
    of train.json, valid.json and test.json.

  Raises:
    OSError: if one of the files cannot be opened.
    json.JSONDecodeError: if one of the files is not valid JSON.
  """
  import os  # local import keeps this cell self-contained

  def load_split(split):
    # Every split is resolved against `path` (the validation and test files
    # used to ignore it), and the file handle is closed after parsing.
    with open(os.path.join(path, source, split + '.json')) as handle:
      return json.load(handle)

  return load_split('train'), load_split('valid'), load_split('test')

## 1.2 Parsing the Data

In [None]:
def extract_data(map, category, key):
  """Return the box-score entry map[category][key] as an int.

  The entry is expected to be a string. Anything that is not purely numeric
  (for example 'N/A' for a player who did not play) is reported as 0.
  """
  raw = map[category][key]
  return int(raw) if raw.isnumeric() else 0

This is the primary function to create the labelled datasets from the raw data. X and Y are the datasets for the first stage of the model.
X2 and Y2 are the datasets for the second stage of the model.

In [None]:
# Maps a player's name as it appears in the box score to the spelling used
# everywhere else in this notebook.
common_fix = {'Stephen Curry': 'Steph Curry'}


def _parse_game_day(day):
  """Split a 'MM_DD_YY' string into (sortable 'YY_MM_DD' key, month, day, year)."""
  parts = day.split('_')
  month, day_of_month, year = parts[0], parts[1], parts[2]
  date = year + '_' + month + '_' + day_of_month
  return date, int(month), int(day_of_month), int(year)


def _is_regular_season(month, day):
  """Return False for May-September, October 1-25 and April 15 onwards."""
  if 4 < month < 10:
    return False
  if month == 10 and day <= 25:
    return False
  if month == 4 and day >= 15:
    return False
  return True


def _season_key(month, year):
  """Return the season label ('14/15', '15/16', '16/17') or 'po' if none matches."""
  if year == 14 or (year == 15 and month < 6):
    return '14/15'
  if year == 15 or (year == 16 and month < 6):
    return '15/16'
  if year == 16 or (year == 17 and month < 6):
    return '16/17'
  return 'po'


def create_datasets(players, mentioned, X, Y, data, X2, Y2, max_rows = 10):
  """Build the labelled datasets for both stages from raw game records.

  Stage 1 (SELECT): one row per player per game is appended to X, in the form
  [name, PTS, REB, AST, STL, BLK, TO, season_key], and the matching binary
  label (1 = the player is reported in the summary) is appended to Y.

  Stage 2 (GENERATE): for every game, X2 receives a list whose first element
  is [winner, loser, winner_pts, loser_pts, date, season_key], followed by
  the X rows that were labelled 1. Y2 receives the game summary as one string.
  The rows stored in X2 are the very same list objects stored in X, so later
  in-place edits of X rows are visible through X2 as well.

  Args:
    players: set of player names seen so far (updated in place).
    mentioned: dict name -> number of summaries that report the player
      (updated in place).
    X, Y, X2, Y2: output lists, extended in place.
    data: iterable of game records as loaded from the JSON files.
    max_rows: maximum number of games to add in this call. It is kept small
      by default because the second stage takes hours on the full data.
      Pass None to process every game.

  Returns:
    The number of games added to X2/Y2 by this call.
  """
  categories = ['PTS', 'REB', 'AST', 'STL', 'BLK', 'TO']
  games = 0

  for game in data:
    date, month, day, year = _parse_game_day(game['day'])
    if not _is_regular_season(month, day):
      continue

    # Stop once exactly max_rows games have been collected. The former
    # `count % max_rows` test stopped one game early and produced nothing
    # at all for max_rows = 1.
    if max_rows is not None and games >= max_rows:
      break
    games += 1

    # Uncomment these lines if you want to watch the function run
    # if games % 100 == 0:
    #   print(games)

    # The key and date are essential to map the box score data to the static
    # and dynamic data.
    key = _season_key(month, year)

    team1 = game['home_city'] + ' ' + game['home_name']
    team2 = game['vis_city'] + ' ' + game['vis_name']
    team1_pts = game['home_line']['TEAM-PTS']
    team2_pts = game['vis_line']['TEAM-PTS']

    # team1 must be the winner. Compare the scores as numbers: compared as
    # strings, "99" would rank above "105".
    if int(team2_pts) > int(team1_pts):
      team1, team2 = team2, team1
      team1_pts, team2_pts = team2_pts, team1_pts

    tokens = game['summary']
    mentions = set(tokens)
    Y2.append(' '.join(tokens))
    x2 = [[team1, team2, team1_pts, team2_pts, date, key]]

    box_score = game['box_score']
    for k, name in box_score['PLAYER_NAME'].items():
      if len(X) != len(Y):
        # Sanity check: rows and labels must stay aligned.
        print(games)
        break

      name = common_fix.get(name, name)

      if name not in players:
        mentioned[name] = 0
        players.add(name)

      x = [name]

      minutes = box_score['MIN'][k]
      if not minutes.isnumeric():
        # Player did not play in the game
        x.extend([0] * len(categories))
        x.append(key)
        X.append(x)
        Y.append(0)
        continue

      # Every part of the player's name has to appear as a summary token.
      name_in_summary = all(part in mentions for part in name.split(' '))

      selected = False
      for category in categories:
        value = extract_data(box_score, category, k)
        x.append(value)
        # The player counts as reported when the name and at least one of
        # the stat values both occur among the summary tokens.
        if name_in_summary and str(value) in mentions:
          selected = True
      x.append(key)
      X.append(x)

      if selected:
        # We use mentioned to track how often players are included in summaries
        mentioned[name] += 1
        Y.append(1)
        x2.append(x)
      else:
        Y.append(0)
    X2.append(x2)

  return games

In [None]:
# Load the three splits, then build the Stage 1 (X, Y) and Stage 2 (X2, Y2)
# datasets from the training split only.
data_train, data_valid, data_test = read_data('rotowire', 'drive/MyDrive/') ## REPLACE WITH CORRECT PATH

# Accumulators that create_datasets fills in place.
players, mentioned = set(), dict()
X, Y = [], []
X2, Y2 = [], []

index = 0
index = create_datasets(players, mentioned, X, Y, data_train, X2, Y2)

## Uncomment these lines if we want to include all the data. Train itself has about
## 3200 games. The full set is about 4600 games.

# index = create_datasets(players, mentioned, X, Y, data_valid, X2, Y2)
# index = create_datasets(players, mentioned, X, Y, data_test, X2, Y2)


## 1.3 Create Placeholders for Static/Dynamic Data

This ensures the code runs without errors even if we don't use the static/dynamic data. If we end up using the data, it will be stored in these dictionaries.

In [None]:
# Empty placeholders for the optional static data (draft, all_stars) and
# dynamic data (final_streak, final_record, final_stats). Each one is an
# independent dictionary.
draft, all_stars = {}, {}
final_streak, final_record, final_stats = {}, {}, {}

### 2. **Create Static Data**

This section is optional and should only be run if we want to include static data in both stages of the model. If it is to be used, all cells in this section
must be run.

In [None]:
pip install basketball_reference_scraper

In [None]:
from basketball_reference_scraper.drafts import get_draft_class
from basketball_reference_scraper.box_scores import get_all_star_box_score

This creates a map of rookies/All-Stars per season, that can be accessed using the season and the player name.

In [None]:
conferences = ['East', 'West']
# Season labels, in the same format as the season keys stored in the datasets.
# They are defined here so that this section can run before (or without) the
# dynamic-data section, which is the only other place that defines them.
seasons = ['14/15', '15/16', '16/17']

for offset, season in enumerate(seasons):
  # Rookies of the season: every name in the 'PLAYER' column of the draft
  # class of 2014 + offset.
  draft[season] = set(get_draft_class(2014 + offset)['PLAYER'])

  # NOTE(review): the All-Star box score is requested for the same year as the
  # draft class (2014 + offset) - confirm this is the intended game for the season.
  # A dedicated name is used so the global `players` set from section 1 is
  # not overwritten.
  all_star_box = get_all_star_box_score(2014 + offset)
  all_stars[season] = set()
  for conference in conferences:
    for player in all_star_box[conference]['PLAYER']:
      # Normalise the spelling so it matches the names used in the datasets.
      all_stars[season].add(common_fix.get(player, player))

Optional Cells to view the data so far

In [None]:
draft

In [None]:
all_stars

### 3. **Create Dynamic Data**

This section is optional and should only be run if we want to include dynamic data in the second stage of the model. If it is to be used, all cells in this section must be run.

The next function records, for every season, each team's result (win or loss) by date and each player's points, rebounds and assists by date.

In [None]:
cat = ['PTS', 'REB', 'AST']
seasons = ['14/15', '15/16', '16/17']
def create_streaks(data, streaks, stats):
  """Record per-date team results and per-date player stats for each season.

  Args:
    data: iterable of game records as loaded from the JSON files.
    streaks: dict updated in place as streaks[season][team][date] = 1 for a
      win and -1 for a loss.
    stats: dict updated in place as stats[season][player][date] = list of
      values in the order of `cat`, or None when the player did not play.

  Dates are stored as 'YY_MM_DD' strings. Games dated May-September,
  October 1-25 or April 15 onwards (preseason/playoffs) are skipped.
  """
  for game in data:
    # If you want to keep track of how many games have been processed so far,
    # wrap `data` in enumerate() and print the index every 100 games.

    winner = game['home_city'] + ' ' + game['home_name']
    loser = game['vis_city'] + ' ' + game['vis_name']

    # Ensures `winner` is the winning team and `loser` is the losing team.
    # The scores are compared as numbers: compared as strings, "99" would
    # rank above "105".
    if int(game['vis_line']['TEAM-PTS']) > int(game['home_line']['TEAM-PTS']):
      winner, loser = loser, winner

    date = game['day'].split('_')
    year = date[2]
    month = date[0]
    day = date[1]
    date = year + '_' + month + '_' + day

    check = int(month)
    check2 = int(day)
    check3 = int(year)

    # We don't include preseason or playoff games
    if (check < 10 and check > 4) or (check == 10 and check2 <= 25) or (check == 4 and check2 >= 15):
      continue
    key = 'po'
    if check3 == 14 or (check3 == 15 and check < 6):
      key = '14/15'
    elif check3 == 15 or (check3 == 16 and check < 6):
      key = '15/16'
    elif check3 == 16 or (check3 == 17 and check < 6):
      key = '16/17'

    season_results = streaks.setdefault(key, dict())
    season_results.setdefault(winner, dict())[date] = 1
    season_results.setdefault(loser, dict())[date] = -1

    season_stats = stats.setdefault(key, dict())

    box_score = game['box_score']
    for k, name in box_score['PLAYER_NAME'].items():
      name = common_fix.get(name, name)
      player_log = season_stats.setdefault(name, dict())

      # MIN is a string. A non-numeric value or zero minutes both mean the
      # player has no stats for this game (the old `min == 0` test compared
      # a string with an int and could never be true).
      minutes = box_score['MIN'][k]
      if (not minutes.isnumeric()) or int(minutes) == 0:
        player_log[date] = None
        continue

      player_log[date] = [extract_data(box_score, c, k) for c in cat]

In [None]:
# Collect team results and player stats over all three splits, in the order
# train, validation, test.
streaks, stats = dict(), dict()
for split in (data_train, data_valid, data_test):
  create_streaks(split, streaks, stats)

Optional Cells to view the data so far

In [None]:
streaks

The next function creates winning streaks and win-loss records based on the map of a team's results

In [None]:


# Walk through every team's results in date order, season by season, and
# store the running (wins, losses) record and the current streak per date.
# A positive streak counts consecutive wins, a negative one consecutive losses.
for season, teams in streaks.items():
  for team, dates in teams.items():
    team_streaks = final_streak.setdefault(team, dict())
    team_records = final_record.setdefault(team, dict())

    # The record and the streak start from scratch for every season.
    count = 0
    w = 0
    l = 0
    for date in sorted(dates.keys()):
      result = dates[date]

      # Same sign as the running streak: extend it. Otherwise start a new one.
      streak_continues = count * result > 0
      count = count + result if streak_continues else result

      if result > 0:
        w += 1
      else:
        l += 1

      team_records[date] = (w, l)
      team_streaks[date] = count

Optional Cells to view the data so far

In [None]:
final_record

In [None]:
final_streak

The following dynamic data is not used because it is not effective. You can still run the code to see how it's calculated.

In [None]:
# For every player and game date, store a pair (averages before the game,
# averages after the game). Each element is a (points, rebounds, assists)
# tuple of per-game season averages, or (None, None, None) while the player
# has not played yet in the season.
# A dedicated loop name is used so the global `players` set is not overwritten.
for season, season_players in stats.items():
  for player, dates in season_players.items():
    # Look the player up in final_stats (not final_streak, which is keyed by
    # team), so entries from earlier seasons are kept.
    player_stats = final_stats.setdefault(player, dict())

    games_played = 0
    totals = [0, 0, 0]
    averages = (None, None, None)
    for date in sorted(dates):
      result = dates[date]
      before = averages
      if result is not None:
        # `result` is assumed to hold points, rebounds and assists in that
        # order, as in ['PTS', 'REB', 'AST']; the totals keep the same order,
        # so rebounds and assists are no longer swapped.
        games_played += 1
        totals = [total + value for total, value in zip(totals, result)]
        averages = tuple(total / games_played for total in totals)
      player_stats[date] = (before, averages)
        

In [None]:
final_stats

### 4. **Stage 1 of Model: SELECT**

This section includes the specific pre-processing and running of the first stage of the model. Both stages of the model can be trained and evaluated independently.

This stage includes the binary classifier to pick which rows of the box_score should be included in the summary

Optional if you want to check the data before running the code.



In [None]:
X[0]

## 4.1 Data Processing

This section converts the player names to categorical ids before training the binary classifier on the input vectors.

Rather than using a random categorical variable, the categorical id increases with how often a player is mentioned in the summaries: players are ranked by mention count in ascending order, so players who are mentioned more often get higher ids.

In [None]:
# Counts how often a player is mentioned in the summary
# Groups the players by how often they are mentioned in the summaries:
# mention count -> list of players with that count.
reverse_mention = dict()
for player, count in mentioned.items():
  reverse_mention.setdefault(count, []).append(player)

In [None]:
# Distinct mention counts in ascending order.
keys = sorted(reverse_mention.keys())


In [None]:
# Creates mapping of player name to id.
# Creates the mapping of player name to id and its inverse. Players are
# numbered from 0 in order of ascending mention count.
ranked_players = [player for key in keys for player in reverse_mention[key]]
final_map = {player: player_id for player_id, player in enumerate(ranked_players)}
reverse_final_map = dict(enumerate(ranked_players))
# As before, index ends up equal to the number of ids handed out.
index = len(ranked_players)

In [None]:
# Add static data if present and replaces the player name with a categorical id
# Adds the static data if present and replaces the player name with its
# categorical id. Each row goes from
#   [name, PTS, REB, AST, STL, BLK, TO, season]
# to
#   [id, PTS, REB, AST, STL, BLK, TO, rookie, all_star]
for row in X:
  # The season key is the last element of the row.
  season = row.pop()
  name = row[0]
  # .get() keeps this cell working when the optional static-data section was
  # not run (empty placeholders) or when the season has no entry: both flags
  # are then 0 instead of raising a KeyError.
  rookie = 1 if name in draft.get(season, ()) else 0
  all_star = 1 if name in all_stars.get(season, ()) else 0
  row.extend([rookie, all_star])
  row[0] = final_map[name]

In [None]:
# Converts the data to an array before passing it to the classifier
import numpy as np
# Feature rows and labels as numpy arrays, as expected by the classifiers.
X_arr = np.asarray(X)
Y_arr = np.asarray(Y)
# Sanity check: both arrays should have the same number of rows.
print(X_arr.shape, Y_arr.shape)

## 4.2 Binary Classifier

In this stage we train several classifiers on the same data and compare their accuracy to find the best one. Random Forest was marginally the best classifier.

Train/Test split of 80/20

# **CITATIONS**

* https://maciejzalwert.medium.com/decision-tree-random-forest-and-xgboost-demystified-with-python-code-7060621eb783
* https://scikit-learn.org/stable/

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# split identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X_arr, Y_arr, test_size = 0.2, random_state=42)

For Decision Trees we try to find the max depth for optimal fitting

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Sweep max_depth over 1..20 and keep the depth with the highest accuracy on
# the held-out split.
# NOTE(review): the depth is chosen on the same split that is used to report
# the score, so max_score is an optimistic estimate.
max_score = 0
n_est = 0  # best max_depth found so far (despite the name, not a number of estimators)
for i in range (1, 21):
  classifier = DecisionTreeClassifier(max_depth = i)
  classifier.fit(X_train, Y_train)
  score = classifier.score(X_test, Y_test)
  if (score > max_score):
    max_score = score
    n_est = i
# Prints the best depth and its accuracy.
print(n_est, max_score)

For Random Forest we try to find the max depth and max features for optimal fitting

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Grid search over max_depth (1..20) and max_features; keeps the combination
# with the highest accuracy on the held-out split.
# NOTE(review): no random_state is passed to the forest, so the scores can
# differ between runs; the selection is also made on the reporting split.
max_features = ['sqrt', 'log2', None]
max_score = 0
n_est = 0  # best max_depth found so far (despite the name, not a number of estimators)
feature = ''  # best max_features setting found so far
for i in range (1, 21):

  for max_feature in max_features:
    classifier = RandomForestClassifier(max_depth = i, max_features=max_feature)
    classifier.fit(X_train, Y_train)
    score = classifier.score(X_test, Y_test)
    if score > max_score:
      max_score = score
      feature = max_feature
      n_est = i
# Prints the best depth, the best max_features setting and the accuracy.
print(n_est, feature, max_score)

For XGBoost we use the xgboost package's classifier with its default hyperparameters.

In [None]:
!pip install xgboost
import xgboost as xgb

In [None]:
# Train and score an XGBoost classifier with default hyperparameters.
# The model has to be bound to `classifier`: with the former misspelt name
# the two lines below silently re-trained and scored the last random forest.
classifier = xgb.XGBClassifier()
classifier.fit(X_train, Y_train)
classifier.score(X_test, Y_test)

## 4.3 Data Post-Processing

If we want to run both stages of the model we need to convert the categorical features back to player names for the text generation.

In [None]:
for i in range(len(X)):
  X[i].pop()
  X[i].pop()
  X[i][0] = reverse_final_map[X[i][0]]

### 5. Stage 2 of the Model (GENERATE)

This section includes the specific pre-processing and running of the second stage of the model. Both stages of the model can be trained and evaluated independently.

This stage includes the pre-trained large language model to generate text from a given box score.




Optional if you want to check the data before running the code.

In [None]:
X2[0]

## 5.1 Data Pre-processing

Here we convert box score data to dummy summaries so we can work in a sequence-to-sequence setting with only lexical data.

The following two functions create dummy sentences out of the box score data.

In [None]:
def get_record(record):
  """Format a (wins, losses) pair as ' (W-L) ', or return '' when it is None."""
  if record is None:
    return ''
  wins, losses = record[0], record[1]
  return " ({}-{}) ".format(wins, losses)

In [None]:
# If static and dynamic data is not being used, these parameters should be set to False
# (row index, label, minimum value to be reported, separator after the item)
# We always include points, rebounds, assists.
# We only include steals, blocks, turnovers when they're 5 or more.
# I found these are the optimal thresholds to reduce input size without
# losing model results.
_DUMMY_STAT_COLUMNS = (
  (1, ' points ', 0, ' , '),
  (2, ' rebounds ', 0, ' , '),
  (3, ' assists ', 0, ' , '),
  (4, ' steals ', 5, ' , '),
  (5, ' blocks ', 5, ' , '),
  (6, ' turnovers ', 5, ''),
)

def convert_dummy(teams, x2, use_static = True, use_dynamic = True):
  """Turn one game's selected box-score rows into a dummy text summary.

  Args:
    teams: [winner, loser, winner_pts, loser_pts, date, season_key].
    x2: list of player rows [name, PTS, REB, AST, STL, BLK, TO, ...].
    use_static: prefix player names with 'All-Star ' / 'rookie ' using the
      global `all_stars` and `draft` dictionaries.
    use_dynamic: add win-loss records and streaks from the global
      `final_record` and `final_streak` dictionaries.

  Returns:
    The dummy summary as a single string.

  Missing static or dynamic entries (sections not run, unknown season, team
  or date) are treated as "no information" instead of raising a KeyError.
  """
  winner, loser, winner_pts, loser_pts = teams[0], teams[1], teams[2], teams[3]
  date, season = teams[4], teams[5]

  if use_static:
    drft = draft.get(season, ())
    allstr = all_stars.get(season, ())
  else:
    drft = ()
    allstr = ()

  if use_dynamic:
    win_record = final_record.get(winner, {}).get(date)
    loss_record = final_record.get(loser, {}).get(date)
    win_streak = final_streak.get(winner, {}).get(date, 0)
    loss_streak = final_streak.get(loser, {}).get(date, 0)
  else:
    win_record = None
    loss_record = None
    win_streak = 0
    loss_streak = 0

  # Collect the pieces and join once at the end.
  parts = ["The " + winner + get_record(win_record) + " defeated the " + loser + get_record(loss_record) + " " + str(winner_pts) + " - " + str(loser_pts) + ' . ']

  if win_streak >= 2:
    parts.append(winner + " are on a " + str(win_streak) + " game winning streak. ")

  # Losing streaks are stored as negative numbers.
  if loss_streak <= -2:
    parts.append(loser + " are on a " + str(-loss_streak) + " game losing streak. ")

  for row in x2:
    name = row[0]
    # The All-Star tag takes precedence over the rookie tag.
    if name in allstr:
      name = 'All-Star ' + name
    elif name in drft:
      name = 'rookie ' + name

    parts.append(' ' + name + ' had ')
    for column, label, threshold, separator in _DUMMY_STAT_COLUMNS:
      if row[column] >= threshold:
        parts.append(str(row[column]) + label + separator)
    parts.append('.')
  return ''.join(parts)

In [None]:
# create a list of dummy summaries to be fed into the LLM
# One dummy summary per game: the first entry of each X2 item describes the
# teams, the remaining entries are the selected player rows. Static and
# dynamic data are switched off here.
X2_processed = [convert_dummy(game[0], game[1:], False, False) for game in X2]
print(len(X2_processed))

Optional if you want to see the results before running the second stage of the model

In [None]:
X2_processed[1]

In [None]:
Y2[1]

## 5.2 Large Language Model

Here we use a pre-trained large language model to generate human-like game summaries from our dummy summaries

# **CITATIONS**
* https://huggingface.co/docs/transformers/
* https://huggingface.co/transformers/v3.3.1/pretrained_models.html
* https://github.com/huggingface/evaluate
* https://docs.fast.ai/learner.html

In [None]:
# imports
!pip install ohmeow-blurr -q
!pip install bert-score -q
!pip install nltk

import pandas as pd
from fastai.text.all import *
from transformers import *
from blurr.text.data.all import *
from blurr.text.modeling.all import *
import nltk
nltk.download('punkt')

# START - SOURCE_REPO = https://github.com/ohmeow/blurr

In [None]:
def get_llm(base = 'BART', max_len = 256, max_tgt_len = 256, prefix = True, task = 'summarization', verbose = True, batch_size=2):
  """Build a fastai Learner around a pre-trained seq2seq model.

  The training data is read from the globals X2_processed (inputs) and Y2
  (target summaries).

  Args:
    base: 'BART' (facebook/bart-base) or 'T5' (t5-base).
    max_len: passed as max_length to the batch tokenizer and set as the
      generation 'max_length'.
    max_tgt_len: passed as max_tgt_length to the batch tokenizer.
    prefix: when False, the 'prefix' entry is removed from the generation
      arguments. Forced to False for T5 with task 'summarization'.
    task: passed to default_text_gen_kwargs.
    verbose: print the generation arguments and show a sample batch.
    batch_size: batch size of the dataloaders.

  Returns:
    (learn, fit_cbs): the Learner and the list of callbacks to pass when
    fitting. For any other `base` the function returns None.
    NOTE(review): a caller that unpacks the result into two names will fail
    on that None - confirm whether an exception would be preferable.
  """

  # Download pre-trained model
  if base == 'BART':
    pretrained_model_name = "facebook/bart-base"
    hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(pretrained_model_name, 
                                                                  model_cls=BartForConditionalGeneration)
  elif base == 'T5':
    if task == 'summarization':
      prefix = False
    pretrained_model_name = "t5-base"
    hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(pretrained_model_name, 
                                                                  model_cls=T5ForConditionalGeneration)
  else:
    return None

  # Initialize arguments for pre-trained model
  
  text_gen_kwargs = default_text_gen_kwargs(hf_config, hf_model, task=task)
  text_gen_kwargs['max_length'] = max_len
  if not prefix:
    # NOTE(review): raises KeyError if the defaults contain no 'prefix' entry.
    text_gen_kwargs.pop('prefix')
  if verbose:
    print('Args', text_gen_kwargs)
  
  hf_batch_tfm = Seq2SeqBatchTokenizeTransform(
    hf_arch, hf_config, hf_tokenizer, hf_model, max_length=max_len, max_tgt_length=max_tgt_len, text_gen_kwargs=text_gen_kwargs
  )
  # Metrics handed to Seq2SeqMetricsCallback below: ROUGE-1/2/L, BERTScore
  # precision/recall/F1 and BLEU.
  seq2seq_metrics = {
        'rouge': {
            'compute_kwargs': { 'rouge_types': ["rouge1", "rouge2", "rougeL"], 'use_stemmer': True },
            'returns': ["rouge1", "rouge2", "rougeL"]
        },
        'bertscore': {
            'compute_kwargs': { 'lang': 'en' },
            'returns': ["precision", "recall", "f1"]
        },
        'bleu': {"returns": "bleu"}
  } 

  # Use FastAi's dataloader to efficiently access the data during training
  # One row per game: column 'dummy' is the input, column 'summary' the target.
  df_X2 = pd.DataFrame(X2_processed, columns=['dummy'])
  df_Y2 = pd.DataFrame(Y2, columns=['summary'])
  df_data = pd.concat([df_X2, df_Y2], axis=1)
  
  blocks = (Seq2SeqTextBlock(batch_tokenize_tfm=hf_batch_tfm), noop)
  # RandomSplitter() is used with its defaults to create the train/validation split.
  dblock = DataBlock(blocks=blocks, get_x=ColReader('dummy'), get_y=ColReader('summary'), splitter=RandomSplitter())

  dls = dblock.dataloaders(df_data, bs=batch_size)

  if verbose:
    dls.show_batch(dataloaders=dls, max_n=2)

  ## Initialize pre-trained model

  model = BaseModelWrapper(hf_model)
  learn_cbs = [BaseModelCallback]
  # The metrics callback is returned to the caller rather than attached to
  # the Learner here.
  fit_cbs = [Seq2SeqMetricsCallback(custom_metrics=seq2seq_metrics)]

  learn = Learner(dls, 
                  model,
                  opt_func=ranger,
                  loss_func=CrossEntropyLossFlat(),
                  cbs=learn_cbs,
                  splitter=partial(blurr_seq2seq_splitter, arch=hf_arch)).to_fp16()

  # NOTE(review): freeze() presumably leaves only the last parameter group
  # trainable (fastai convention) - confirm against blurr_seq2seq_splitter.
  learn.create_opt() 
  learn.freeze()

  return learn, fit_cbs

In [None]:
learn, fit_cbs = get_llm(base = 'BART', max_len = 256, max_tgt_len = 256, prefix = True, task = 'summarization', verbose = False, batch_size=2)

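We use FastAI's fit_one_cycle method to train and evaluate the model. The data was already split into training and validation sets by the RandomSplitter used when building the dataloaders (80/20 with its default settings); fit_one_cycle trains on the training part and displays the metrics we pass to it on the validation part.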

In [None]:
# Train model
learning_rate = 3e-5
epochs = 1 # For ease of checking the code
learn.fit_one_cycle(epochs, lr_max=learning_rate, cbs=fit_cbs)

In [None]:
# See sample output
learn.show_results(learner=learn, max_n=2)

# END - SOURCE REPO = https://github.com/ohmeow/blurr

## 5.3 BERT Model

# START - SOURCE REPO = https://huggingface.co/docs/transformers/model_doc/bert-generation

I also experimented with a BERT model, but the loss never reduced so I never completed it. You can still try the training loop.

In [None]:
from transformers import BertGenerationEncoder, BertGenerationDecoder, EncoderDecoderModel, BertTokenizer

# Build an encoder-decoder model whose encoder and decoder are both
# initialised from the bert-base-uncased checkpoint. Token ids 101 and 102
# are passed as the BOS and EOS ids.
encoder = BertGenerationEncoder.from_pretrained("bert-base-uncased", bos_token_id=101, eos_token_id=102)
# The decoder gets cross-attention so it can attend to the encoder output.
decoder = BertGenerationDecoder.from_pretrained("bert-base-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
import torch

# Without an optimizer the gradients computed by backward() are never applied
# to the weights, so the loss cannot go down. AdamW with the same learning
# rate as the BART experiment is used here.
optimizer = torch.optim.AdamW(bert2bert.parameters(), lr=3e-5)

report_every = 20   # print the mean loss every `report_every` samples
max_tokens = 512    # bert-base-uncased cannot encode longer sequences

avg_loss = 0
count = 0           # samples that contributed to avg_loss since the last report
bert2bert.train()
for i in range(len(X2_processed)):
  # Truncate over-long texts instead of losing the whole sample to a RuntimeError.
  input_ids = tokenizer(X2_processed[i], add_special_tokens=True, truncation=True, max_length=max_tokens, return_tensors="pt").input_ids
  labels = tokenizer(Y2[i], add_special_tokens=True, truncation=True, max_length=max_tokens, return_tensors="pt").input_ids

  optimizer.zero_grad()
  try:
    loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
    loss.backward()
    optimizer.step()
    avg_loss += loss.item()
    count += 1
  except RuntimeError:
    # Best effort: skip a sample the model cannot process.
    pass

  if (i+1) % report_every == 0:
    print(i)
    # Guard against a window in which every sample failed.
    if count > 0:
      print('Loss', avg_loss/count)
    else:
      print('Loss', None)
    avg_loss = 0
    count = 0
  
  
  


In [None]:
# Generate from one sample. Index 1000 is used when it exists; with a small
# dataset (e.g. the default max_rows) the last available sample is used
# instead of failing with an IndexError.
sample_index = min(1000, len(X2_processed) - 1)
input_ids = tokenizer(X2_processed[sample_index], add_special_tokens=True, return_tensors="pt").input_ids
bert2bert.generate(input_ids, decoder_start_token_id=bert2bert.config.decoder.pad_token_id)

In [None]:
tokenizer.decode(bert2bert.generate(input_ids, decoder_start_token_id=bert2bert.config.decoder.pad_token_id)[0])

# END - SOURCE REPO = https://huggingface.co/docs/transformers/model_doc/bert-generation