In [None]:
# imports
import os
import json
import pandas as pd
%pip install SoccerNet
from SoccerNet.Downloader import SoccerNetDownloader
%pip install datasets
from datasets import load_dataset
from tqdm import tqdm
from collections import Counter
import re
import string
%pip install scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [None]:
# soccernet downloads

mySoccerNetDownloader = SoccerNetDownloader(LocalDirectory="./soccernet")

mySoccerNetDownloader.downloadGames(files=["Labels-v2.json"], split=["train", "valid", "test"]) # labels
# mySoccerNetDownloader.downloadDataTask(task="spotting-2023", split=["train", "valid", "test", "challenge"]) # highlight labels for 2023 challenge task
# mySoccerNetDownloader.downloadGames(files=["1_ResNET_TF2.npy", "2_ResNET_TF2.npy"], split=["train", "valid", "test"]) # visual features (not needed)

# mySoccerNetDownloader.password = "Password from NDA" # TODO: we need access
# videos:
# mySoccerNetDownloader.downloadGames(files=["1_224p.mkv", "2_224p.mkv"], split=["train", "valid", "test"]) # 224p resolution
# mySoccerNetDownloader.downloadGames(files=["1_720p.mkv", "2_720p.mkv"], split=["train", "valid", "test"]) # 720p resolution

In [17]:
"""
    helper function to convert position (milliseconds) to seconds and extract the half from gameTime

    args:
        position (str): the position in milliseconds as a string
        game_time (str): the gameTime string in the format "1 - mm:ss" or "2 - mm:ss"

    returns:
        tuple: a tuple containing:
            - time_in_seconds (float): the position converted to seconds
            - half (int): the half of the game (1 or 2)
"""
def position_to_seconds_and_half(position, game_time):
    time_in_seconds = float(position) / 1000
    half = int(game_time.split(" - ")[0]) # extract the half from gameTime
    return time_in_seconds, half

In [18]:
# initialize list to hold all data
all_data = []

for root, dirs, files in os.walk('soccernet'):
    for file in files:
        if file == "Labels-v2.json":
            file_path = os.path.join(root, file)
            with open(file_path, "r") as f:
                labels_data = json.load(f)

            match_name = labels_data.get("UrlLocal", "")  # match_name is from the UrlLocal key
            annotations = labels_data.get("annotations", []) # events from the "annotations" key

            for label_entry in annotations:
                # convert position value from milliseconds to seconds
                time_in_seconds, half = position_to_seconds_and_half(label_entry["position"], label_entry["gameTime"])
                match_name_with_half = f"{match_name}{half}" # match transcript "game" name
                label = label_entry["label"]
                all_data.append({
                    "time_in_seconds": time_in_seconds,
                    "match_name": match_name_with_half,
                    "label": label
                })

# convert the list to dataframe and save to csv
labels_df = pd.DataFrame(all_data)
labels_df.to_csv("labels_df.csv", index=False)
print(labels_df.head())

   time_in_seconds                                         match_name  \
0            0.000  england_epl/2014-2015/2015-02-21 - 18-00 Chels...   
1          133.295  england_epl/2014-2015/2015-02-21 - 18-00 Chels...   
2          149.168  england_epl/2014-2015/2015-02-21 - 18-00 Chels...   
3          156.098  england_epl/2014-2015/2015-02-21 - 18-00 Chels...   
4          182.775  england_epl/2014-2015/2015-02-21 - 18-00 Chels...   

              label  
0          Kick-off  
1  Ball out of play  
2          Throw-in  
3  Ball out of play  
4            Corner  


In [19]:
# load transcript dataset
transcript_dataset = load_dataset("SoccerNet/SN-echoes", 
                      name="whisper_v3", 
                      split="en") # load only the English split
transcripts_df = pd.DataFrame(transcript_dataset)

In [20]:
transcripts_df['label'] = 'non-highlight'

# update label based on time and game matching conditions
for idx, row in tqdm(transcripts_df.iterrows(), total=len(transcripts_df), desc="Processing rows"):
    matching_label = labels_df[
        (labels_df['time_in_seconds'] >= row['start_time']) &
        (labels_df['time_in_seconds'] <= row['end_time']) &
        (labels_df['match_name'] == row['game'])
    ]
    if not matching_label.empty: # found a match
        transcripts_df.at[idx, 'label'] = matching_label.iloc[0]['label']

transcripts_df.to_csv("transcripts_df.csv", index=False)
print(transcripts_df.head())

Processing rows: 100%|██████████| 679738/679738 [1:26:44<00:00, 130.62it/s]


   segment_index  start_time    end_time  \
0              0   30.000000   59.980000   
1              1   60.000000   72.440002   
2              2   73.260002   89.199997   
3              3   90.000000  106.180000   
4              4  106.180000  117.360001   

                                                text  \
0                         and everything is possible   
1  Felipe Luis for Azpilicueta and Zouma for Keji...   
2  César opened towards Eden Hazard, Hazard who w...   
3  Cuadrado left again for Hazard, there is Juan ...   
4  and there are quite a few, not only this seaso...   

                                                game          label  
0  england_epl/2014-2015/2015-02-21 - 18-00 Chels...  non-highlight  
1  england_epl/2014-2015/2015-02-21 - 18-00 Chels...  non-highlight  
2  england_epl/2014-2015/2015-02-21 - 18-00 Chels...  non-highlight  
3  england_epl/2014-2015/2015-02-21 - 18-00 Chels...  non-highlight  
4  england_epl/2014-2015/2015-02-21 - 18-00 Chel

In [None]:
# data analysis
label_counts = labels_df['label'].value_counts()
print(label_counts)
transcripts_label_counts = transcripts_df['label'].value_counts()
print(transcripts_label_counts)

# get unique values from both columns
games_in_transcripts = set(transcripts_df['game'].unique())
match_names_in_labels = set(labels_df['match_name'].unique())

# find differences
only_in_transcripts = games_in_transcripts - match_names_in_labels # values only in transcripts_df
only_in_labels = match_names_in_labels - games_in_transcripts # values only in labels_df
common_games = games_in_transcripts & match_names_in_labels # common values in both

# print results
print("Values only in transcript:")
for value in only_in_transcripts:
    print(value)

print("\nValues only in labels:")
for value in only_in_labels:
    print(value)

print("\nValues in both columns:")
for value in common_games:
    print(value)


# TODO: we are losing a lot of highlights, we need to investigate why

label
Ball out of play      31810
Throw-in              18918
Foul                  11674
Indirect free-kick    10521
Clearance              7896
Shots on target        5820
Shots off target       5256
Corner                 4836
Substitution           2839
Kick-off               2566
Direct free-kick       2200
Offside                2098
Yellow card            2047
Goal                   1703
Penalty                 173
Red card                 55
Yellow->red card         46
Name: count, dtype: int64
label
non-highlight         624183
Ball out of play       15204
Throw-in                9108
Foul                    6162
Indirect free-kick      5456
Clearance               4208
Shots on target         3124
Shots off target        2771
Corner                  2549
Substitution            1509
Kick-off                1469
Direct free-kick        1207
Offside                 1164
Yellow card             1093
Goal                     425
Penalty                   53
Red card              

In [25]:
filtered_transcripts_df = transcripts_df[transcripts_df['game'].isin(common_games)] # only keep games that have highlight labels AND a transcript
filtered_transcripts_df.to_csv("filtered_transcripts.csv", index=False)

In [26]:
# data analytics
filtered_transcripts_label_counts = filtered_transcripts_df['label'].value_counts()
print(filtered_transcripts_label_counts)

label
non-highlight         559933
Ball out of play       15204
Throw-in                9108
Foul                    6162
Indirect free-kick      5456
Clearance               4208
Shots on target         3124
Shots off target        2771
Corner                  2549
Substitution            1509
Kick-off                1469
Direct free-kick        1207
Offside                 1164
Yellow card             1093
Goal                     425
Penalty                   53
Red card                  29
Yellow->red card          24
Name: count, dtype: int64


In [40]:
# importance ranking for highlight labels (non-highlight', 'Ball out of play', 'Throw-in are ignored)
highlight_label_ranks = {
    "Goal": 1,
    "Penalty": 2,
    "Red card": 3,
    "Yellow->red card": 4,
    "Yellow card": 5,
    "Direct free-kick": 6,
    "Indirect free-kick": 7,
    "Foul": 8,
    "Substitution": 9,
    "Shots on target": 10,
    "Shots off target": 11,
    "Corner": 12,
    "Clearance": 13,
    "Offside": 14,
    "Kick-off": 15,
}

# helper function to determine majority label (if any)
def determine_label(labels):
    highlight_labels = [label for label in labels if label not in ['non-highlight', 'Ball out of play', 'Throw-in']] # checks for any label that is not non-highlight, ball out of play, or throw-in
    if highlight_labels:
        # find the most common label(s)
        label_counts = Counter(highlight_labels)
        most_common_labels = label_counts.most_common()
        max_count = most_common_labels[0][1]
        tied_labels = [label for label, count in most_common_labels if count == max_count]
        if len(tied_labels) > 1:
            # if there are tied labels, return the one with the highest rank (lowest rank number)
            tied_labels.sort(key=lambda x: highlight_label_ranks[x])
            return tied_labels[0]
        else:
            return most_common_labels[0][0] # no tie, return the most common label
    else:
        return 'non-highlight' # if no highlights, return non-highlight (set ball out of play, throw-in as non-highlight)

    

In [41]:
# helper function to combine rows in filtered_transcripts within a 10 second context window
def combine_into_10_seconds(df):
    combined_data = []
    
    for game, group in df.groupby('game'): # process each game separately
        group = group.sort_values('start_time').reset_index(drop=True)  # sort by start_time (prob not needed)
        temp_text = []
        temp_labels = []
        start_time = group.loc[0, 'start_time']
        current_end_time = start_time + 10
        
        for idx, row in group.iterrows():
            if row['start_time'] <= current_end_time: # within the 10 second window
                temp_text.append(row['text'])
                temp_labels.append(row['label'])
                current_end_time = max(current_end_time, row['end_time'])
            else:
                final_label = determine_label(temp_labels) # save the combined chunk (max 10 seconds)
                combined_data.append({
                    'start_time': start_time,
                    'end_time': current_end_time,
                    'text': " ".join(temp_text),
                    'game': game,
                    'label': final_label
                })
                # reset for the next chunk - first row outside the 10 second window
                temp_text = [row['text']]
                temp_labels = [row['label']]
                start_time = row['start_time']
                current_end_time = start_time + 10
        
        # save the last chunk (any leftover text after the last 10 second window)
        if temp_text:
            final_label = determine_label(temp_labels)
            combined_data.append({
                'start_time': start_time,
                'end_time': current_end_time,
                'text': " ".join(temp_text),
                'game': game,
                'label': final_label
            })
    
    return pd.DataFrame(combined_data)

In [42]:
# 10 second context window:
temp_df = filtered_transcripts_df.copy()
combined_10_seconds_df = combine_into_10_seconds(temp_df)
combined_10_seconds_df.to_csv('combined_10_seconds.csv', index=False)
print(combined_10_seconds_df.head())


   start_time    end_time                                               text  \
0   30.000000   59.980000                         and everything is possible   
1   60.000000   70.000000  Felipe Luis for Azpilicueta and Zouma for Keji...   
2   73.260002   83.260002  César opened towards Eden Hazard, Hazard who w...   
3   90.000000  100.000000  Cuadrado left again for Hazard, there is Juan ...   
4  106.180000  116.180000  and there are quite a few, not only this seaso...   

                                                game          label  
0  england_epl/2014-2015/2015-02-21 - 18-00 Chels...  non-highlight  
1  england_epl/2014-2015/2015-02-21 - 18-00 Chels...  non-highlight  
2  england_epl/2014-2015/2015-02-21 - 18-00 Chels...  non-highlight  
3  england_epl/2014-2015/2015-02-21 - 18-00 Chels...  non-highlight  
4  england_epl/2014-2015/2015-02-21 - 18-00 Chels...  non-highlight  


In [43]:
# data analytics
combined_10_seconds_label_counts = combined_10_seconds_df['label'].value_counts()
print(combined_10_seconds_label_counts)

label
non-highlight         51768
Indirect free-kick     4628
Foul                   3572
Clearance              2570
Shots on target        2192
Shots off target       1992
Corner                 1529
Substitution           1082
Direct free-kick       1009
Kick-off               1005
Yellow card             901
Offside                 613
Goal                    380
Penalty                  46
Red card                 24
Yellow->red card         21
Name: count, dtype: int64


In [47]:
# helper function to clean text
def clean_text(text):
    text = text.lower() # convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation)) # remove punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text) # remove special characters (if any)
    text = ' '.join(text.split()) # remove extra spaces
    return text

In [49]:
# Apply to your dataframe
combined_10_seconds_df_cleaned = combined_10_seconds_df.copy()
combined_10_seconds_df_cleaned['cleaned_text'] = combined_10_seconds_df['text'].apply(clean_text)
combined_10_seconds_df_cleaned.drop(columns=['text'], inplace=True)
combined_10_seconds_df_cleaned.to_csv('combined_10_seconds_cleaned.csv', index=False)
print(combined_10_seconds_df_cleaned.head())

   start_time    end_time                                               game  \
0   30.000000   59.980000  england_epl/2014-2015/2015-02-21 - 18-00 Chels...   
1   60.000000   70.000000  england_epl/2014-2015/2015-02-21 - 18-00 Chels...   
2   73.260002   83.260002  england_epl/2014-2015/2015-02-21 - 18-00 Chels...   
3   90.000000  100.000000  england_epl/2014-2015/2015-02-21 - 18-00 Chels...   
4  106.180000  116.180000  england_epl/2014-2015/2015-02-21 - 18-00 Chels...   

           label                                       cleaned_text  
0  non-highlight                         and everything is possible  
1  non-highlight  felipe luis for azpilicueta and zouma for keji...  
2  non-highlight  csar opened towards eden hazard hazard who was...  
3  non-highlight  cuadrado left again for hazard there is juan c...  
4  non-highlight  and there are quite a few not only this season...  


In [51]:
# fix massive class imbalance (remove 31000 non-highlights to make it roughly equal to total number of highlights)
non_highlights = combined_10_seconds_df_cleaned[combined_10_seconds_df_cleaned['label'] == 'non-highlight']
non_highlights_to_remove = non_highlights.sample(n=31000, random_state=8)
combined_10_seconds_df_cleaned_reduced = combined_10_seconds_df_cleaned.drop(non_highlights_to_remove.index)
combined_10_seconds_df_cleaned_reduced.to_csv('combined_10_seconds_cleaned_reduced.csv', index=False)
combined_10_seconds_df_cleaned_reduced.head()


Unnamed: 0,start_time,end_time,game,label,cleaned_text
3,90.0,100.0,england_epl/2014-2015/2015-02-21 - 18-00 Chels...,non-highlight,cuadrado left again for hazard there is juan c...
5,117.360001,147.619995,england_epl/2014-2015/2015-02-21 - 18-00 Chels...,non-highlight,fantastic figure for a side he has once been a...
6,150.0,160.0,england_epl/2014-2015/2015-02-21 - 18-00 Chels...,non-highlight,he is a player who is one of the best signings...
7,180.0,190.0,england_epl/2014-2015/2015-02-21 - 18-00 Chels...,Corner,he is a player who is one of the best signings...
9,227.119995,237.119995,england_epl/2014-2015/2015-02-21 - 18-00 Chels...,non-highlight,lets see why keitly stole that ball the barley...


In [52]:
# data analytics
combined_10_seconds_df_cleaned_reduced_counts = combined_10_seconds_df_cleaned_reduced['label'].value_counts()
print(combined_10_seconds_df_cleaned_reduced_counts)

label
non-highlight         20768
Indirect free-kick     4628
Foul                   3572
Clearance              2570
Shots on target        2192
Shots off target       1992
Corner                 1529
Substitution           1082
Direct free-kick       1009
Kick-off               1005
Yellow card             901
Offside                 613
Goal                    380
Penalty                  46
Red card                 24
Yellow->red card         21
Name: count, dtype: int64


In [None]:
# split the dataset into training, validation, and test sets and fit TfidfVectorizer to get tf-idf embeddings
X_train, X_temp, y_train, y_temp = train_test_split(combined_10_seconds_df_cleaned_reduced['cleaned_text'], combined_10_seconds_df_cleaned_reduced['label'], test_size=0.2, random_state=8)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=8)

tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

In [57]:
# random forest classifier (cross validation)
rf_model = RandomForestClassifier(n_estimators=100, random_state=8)
cv_scores = cross_val_score(rf_model, X_train_tfidf, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation Accuracy: Mean = {cv_scores.mean()}, Std = {cv_scores.std()}")

Cross-validation Accuracy: Mean = 0.4999852354938727, Std = 0.0014870430606247495


In [58]:
# random forest classifier (fit on entire training set)
rf_model.fit(X_train_tfidf, y_train)
y_pred = rf_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

                    precision    recall  f1-score   support

         Clearance       0.00      0.00      0.00       263
            Corner       0.00      0.00      0.00       148
  Direct free-kick       0.00      0.00      0.00        94
              Foul       0.19      0.01      0.02       348
              Goal       0.00      0.00      0.00        44
Indirect free-kick       0.23      0.15      0.18       464
          Kick-off       0.33      0.01      0.02       101
           Offside       0.50      0.03      0.06        61
           Penalty       0.00      0.00      0.00         2
          Red card       0.00      0.00      0.00         3
  Shots off target       0.00      0.00      0.00       198
   Shots on target       0.33      0.00      0.01       210
      Substitution       0.00      0.00      0.00       110
       Yellow card       0.36      0.06      0.10        87
  Yellow->red card       0.00      0.00      0.00         2
     non-highlight       0.53      0.99

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
