In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import yaml
import json

In [2]:
event_dict_file = Path("..", "resources", "data", "dictionary.yaml")
with open(event_dict_file, "r") as ed_file:
    event_dict = yaml.safe_load(ed_file)

In [3]:
print(event_dict)

{'event_type': {0: 'Announcement', 1: 'Attempt', 2: 'Corner', 3: 'Foul', 4: 'Yellow card', 5: 'Second yellow card', 6: 'Red card', 7: 'Substitution', 8: 'Free kick', 9: 'Offside', 10: 'Hand ball', 11: 'Penalty'}, 'event_type2': {12: 'Key Pass', 13: 'Failed through ball', 14: 'Sending off', 15: 'Own goal'}, 'side': {1: 'Home', 2: 'Away'}, 'shot_place': {1: 'Bit too high', 2: 'Blocked', 3: 'Bottom left corner', 4: 'Bottom right corner', 5: 'Centre of the goal', 6: 'High and wide', 7: 'Hits the bar', 8: 'Misses to the left', 9: 'Misses to the right', 10: 'Too high', 11: 'Top centre of the goal', 12: 'Top left corner', 13: 'Top right corner'}, 'shot_outcome': {1: 'On target', 2: 'Off target', 3: 'Blocked', 4: 'Hit the bar'}, 'location': {1: 'Attacking half', 2: 'Defensive half', 3: 'Centre of the box', 4: 'Left wing', 5: 'Right wing', 6: 'Difficult angle and long range', 7: 'Difficult angle on the left', 8: 'Difficult angle on the right', 9: 'Left side of the box', 10: 'Left side of the si

In [4]:
events_file = Path("..", "resources", "data", "events.csv")
events = pd.read_csv(events_file)

In [5]:
ginf_file = Path("..", "resources", "data", "ginf.csv")
ginf = pd.read_csv(ginf_file)

In [6]:
len(events["id_odsp"].unique())

9074

In [7]:
combined_data = events.merge(ginf, on="id_odsp", suffixes=("_events", "_ginf"))

In [8]:
combined_data.drop(columns=["adv_stats", "link_odsp"], inplace=True)
combined_data.drop(columns=[col for col in combined_data.columns if col.startswith("odd_")], inplace=True)
combined_data.columns

Index(['id_odsp', 'id_event', 'sort_order', 'time', 'text', 'event_type',
       'event_type2', 'side', 'event_team', 'opponent', 'player', 'player2',
       'player_in', 'player_out', 'shot_place', 'shot_outcome', 'is_goal',
       'location', 'bodypart', 'assist_method', 'situation', 'fast_break',
       'date', 'league', 'season', 'country', 'ht', 'at', 'fthg', 'ftag'],
      dtype='object')

In [9]:
pd.set_option("display.max_columns", None)

In [10]:
combined_data.describe(include="all")

Unnamed: 0,id_odsp,id_event,sort_order,time,text,event_type,event_type2,side,event_team,opponent,player,player2,player_in,player_out,shot_place,shot_outcome,is_goal,location,bodypart,assist_method,situation,fast_break,date,league,season,country,ht,at,fthg,ftag
count,941009,941009,941009.0,941009.0,941009,941009.0,214293.0,941009.0,941009,941009,880009,291310,51715,51738,227459.0,228498.0,941009.0,467067.0,229185.0,941009.0,229137.0,941009.0,941009,941009,941009.0,941009,941009,941009,941009.0,941009.0
unique,9074,941009,,,366076,,,,142,142,6118,5747,5093,4671,,,,,,,,,923,5,,5,142,142,,
top,ITbfCc8F/,UFot0hit1,,,Foul by Perparim Hetemaj (Chievo).,,,,Juventus,Genoa,cristiano ronaldo,gonzalo castro,adrian,gonzalo castro,,,,,,,,,2015-04-04,I1,,italy,Genoa,Genoa,,
freq,180,1,,,396,,,,12054,11981,1872,641,147,103,,,,,,,,,3454,227127,,227127,11676,11779,,
mean,,,53.858826,49.663663,,4.326575,12.233764,1.48117,,,,,,,5.733693,1.926555,0.025978,6.209073,1.624831,0.264332,1.281316,0.004876,,,2014.405073,,,,1.54314,1.158935
std,,,32.014268,26.488977,,2.995313,0.46885,0.499646,,,,,,,3.3261,0.797055,0.159071,5.421736,0.7404,0.655501,0.709394,0.069655,,,1.59443,,,,1.296907,1.136129
min,,,1.0,0.0,,1.0,12.0,1.0,,,,,,,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,,,2012.0,,,,0.0,0.0
25%,,,27.0,27.0,,2.0,12.0,1.0,,,,,,,2.0,1.0,0.0,2.0,1.0,0.0,1.0,0.0,,,2013.0,,,,1.0,0.0
50%,,,53.0,51.0,,3.0,12.0,1.0,,,,,,,5.0,2.0,0.0,3.0,1.0,0.0,1.0,0.0,,,2014.0,,,,1.0,1.0
75%,,,79.0,73.0,,8.0,12.0,2.0,,,,,,,9.0,3.0,0.0,11.0,2.0,0.0,1.0,0.0,,,2016.0,,,,2.0,2.0


In [11]:
for column_name, mapping in event_dict.items():
    print(column_name)
    mapping = {int(ev): ev_name for ev, ev_name in mapping.items()}
    print(mapping)

event_type
{0: 'Announcement', 1: 'Attempt', 2: 'Corner', 3: 'Foul', 4: 'Yellow card', 5: 'Second yellow card', 6: 'Red card', 7: 'Substitution', 8: 'Free kick', 9: 'Offside', 10: 'Hand ball', 11: 'Penalty'}
event_type2
{12: 'Key Pass', 13: 'Failed through ball', 14: 'Sending off', 15: 'Own goal'}
side
{1: 'Home', 2: 'Away'}
shot_place
{1: 'Bit too high', 2: 'Blocked', 3: 'Bottom left corner', 4: 'Bottom right corner', 5: 'Centre of the goal', 6: 'High and wide', 7: 'Hits the bar', 8: 'Misses to the left', 9: 'Misses to the right', 10: 'Too high', 11: 'Top centre of the goal', 12: 'Top left corner', 13: 'Top right corner'}
shot_outcome
{1: 'On target', 2: 'Off target', 3: 'Blocked', 4: 'Hit the bar'}
location
{1: 'Attacking half', 2: 'Defensive half', 3: 'Centre of the box', 4: 'Left wing', 5: 'Right wing', 6: 'Difficult angle and long range', 7: 'Difficult angle on the left', 8: 'Difficult angle on the right', 9: 'Left side of the box', 10: 'Left side of the six yard box', 11: 'Right 

In [12]:
for column_name, mapping in event_dict.items():
    mapping = {int(ev): ev_name for ev, ev_name in mapping.items()}
    combined_data[column_name] = combined_data[column_name].fillna(-1).astype(int)
    combined_data[column_name] = combined_data[column_name].map(mapping)
    combined_data[column_name] = combined_data[column_name].replace(-1, np.nan)

In [13]:
combined_data.describe(include="all")

Unnamed: 0,id_odsp,id_event,sort_order,time,text,event_type,event_type2,side,event_team,opponent,player,player2,player_in,player_out,shot_place,shot_outcome,is_goal,location,bodypart,assist_method,situation,fast_break,date,league,season,country,ht,at,fthg,ftag
count,941009,941009,941009.0,941009.0,941009,941009,214293,941009,941009,941009,880009,291310,51715,51738,227459,228498,941009.0,467067,229185,941009.0,229137,941009.0,941009,941009,941009.0,941009,941009,941009,941009.0,941009.0
unique,9074,941009,,,366076,11,4,2,142,142,6118,5747,5093,4671,13,4,,19,3,5.0,4,,923,5,,5,142,142,,
top,ITbfCc8F/,UFot0hit1,,,Foul by Perparim Hetemaj (Chievo).,Free kick,Key Pass,Home,Juventus,Genoa,cristiano ronaldo,gonzalo castro,adrian,gonzalo castro,Blocked,Off target,,Defensive half,right foot,,Open play,,2015-04-04,I1,,italy,Genoa,Genoa,,
freq,180,1,,,396,237932,167859,488224,12054,11981,1872,641,147,103,54082,92827,,125137,121939,773104.0,193747,,3454,227127,,227127,11676,11779,,
mean,,,53.858826,49.663663,,,,,,,,,,,,,0.025978,,,,,0.004876,,,2014.405073,,,,1.54314,1.158935
std,,,32.014268,26.488977,,,,,,,,,,,,,0.159071,,,,,0.069655,,,1.59443,,,,1.296907,1.136129
min,,,1.0,0.0,,,,,,,,,,,,,0.0,,,,,0.0,,,2012.0,,,,0.0,0.0
25%,,,27.0,27.0,,,,,,,,,,,,,0.0,,,,,0.0,,,2013.0,,,,1.0,0.0
50%,,,53.0,51.0,,,,,,,,,,,,,0.0,,,,,0.0,,,2014.0,,,,1.0,1.0
75%,,,79.0,73.0,,,,,,,,,,,,,0.0,,,,,0.0,,,2016.0,,,,2.0,2.0


In [14]:
value_counts = combined_data["text"].value_counts()
filtered_ixs = value_counts[value_counts < 2].index

In [15]:
filtered_combined_data = combined_data[combined_data["text"].isin(filtered_ixs)]

In [16]:
filtered_combined_data.shape

(279786, 30)

In [17]:
# If shot_place column is not na, and event_type column is equal to 'Foul', then drop those rows from the dataframe
filtered_combined_data = filtered_combined_data[
    (filtered_combined_data["shot_place"].isna()) | (filtered_combined_data["event_type"] != "Foul")
]

In [18]:
# Remove rows with minor events like 'Corner', 'Foul', 'Hand Ball', 'Substitution', 'Yellow card', 'Second yellow card' from 'event_type' column
filtered_combined_data = filtered_combined_data[
    ~filtered_combined_data["event_type"].isin(
        ["Corner", "Foul", "Hand Ball", "Substitution", "Yellow card", "Second yellow card"]
    )
]

In [19]:
filtered_combined_data.shape

(225187, 30)

In [20]:
filtered_combined_data["event_type"].value_counts()

event_type
Attempt      198472
Offside       17590
Free kick      4804
Hand ball      1745
Penalty        1588
Red card        988
Name: count, dtype: int64

In [21]:
filtered_combined_data.to_csv(Path("..", "resources", "filtered_combined_data.csv"), index=False)

In [22]:
filtered_combined_data.head()

Unnamed: 0,id_odsp,id_event,sort_order,time,text,event_type,event_type2,side,event_team,opponent,player,player2,player_in,player_out,shot_place,shot_outcome,is_goal,location,bodypart,assist_method,situation,fast_break,date,league,season,country,ht,at,fthg,ftag
0,UFot0hit/,UFot0hit1,1,2,Attempt missed. Mladen Petric (Hamburg) left f...,Attempt,Key Pass,Away,Hamburg SV,Borussia Dortmund,mladen petric,gokhan tore,,,High and wide,Off target,0,Left side of the box,left foot,Pass,Open play,0,2011-08-05,D1,2012,germany,Borussia Dortmund,Hamburg SV,3,1
11,UFot0hit/,UFot0hit12,12,14,Attempt missed. Shinji Kagawa (Borussia Dortmu...,Attempt,Key Pass,Home,Borussia Dortmund,Hamburg SV,shinji kagawa,mario gotze,,,Top right corner,Off target,0,Outside the box,right foot,Pass,Open play,0,2011-08-05,D1,2012,germany,Borussia Dortmund,Hamburg SV,3,1
13,UFot0hit/,UFot0hit14,14,17,"Goal! Borussia Dortmund 1, Hamburg 0. Kevin G...",Attempt,Key Pass,Home,Borussia Dortmund,Hamburg SV,kevin grosskreutz,mario gotze,,,Bottom right corner,On target,1,Left side of the box,left foot,Pass,Open play,0,2011-08-05,D1,2012,germany,Borussia Dortmund,Hamburg SV,3,1
14,UFot0hit/,UFot0hit15,15,19,Attempt blocked. Mats Hummels (Borussia Dortmu...,Attempt,,Home,Borussia Dortmund,Hamburg SV,mats hummels,,,,Blocked,Blocked,0,Outside the box,right foot,,Open play,0,2011-08-05,D1,2012,germany,Borussia Dortmund,Hamburg SV,3,1
18,UFot0hit/,UFot0hit19,19,25,Shinji Kagawa (Borussia Dortmund) hits the rig...,Attempt,Key Pass,Home,Borussia Dortmund,Hamburg SV,shinji kagawa,lukasz piszczek,,,Hits the bar,Hit the bar,0,Centre of the box,right foot,Pass,Open play,0,2011-08-05,D1,2012,germany,Borussia Dortmund,Hamburg SV,3,1


In [23]:
def generate_event_string(row: pd.Series) -> str:
    event_str = f"On minute {row['time']},"
    event_str += (
        f" the game is being played on {row['location']} in the field, and"
        if not pd.isna(row["location"]) or row["location"] != "Not recorded"
        else ""
    )
    event_str += f" {row['event_type']} happens" + (
        f" accompanied with {row['event_type2']}." if not pd.isna(row["event_type2"]) else "."
    )
    if row["is_goal"] == 1:
        event_str += f" This resulted in a goal for {row['event_team']}"
        event_str += (
            f", which was accompanied by an assist via {row['assist_method']}."
            if not pd.isna(row["assist_method"]) or row["assist_method"] != "None"
            else "."
        )

    if row["event_type"] != "Substitution":
        event_str += f" The primary player involved in the event was {row['player']}" + (
            f" who used his {row['bodypart']} for the shot" if not pd.isna(row["bodypart"]) else ""
        )
        event_str += (
            f" with the secondary player involved being {row['player2']}. " if not pd.isna(row["player2"]) else "."
        )
    else:
        event_str += f" The player {row['player_out']} is substituted" + (
            f" by {row['player_in']}. " if not pd.isna(row["player_in"]) else "."
        )
    if not pd.isna(row["shot_place"]):
        if row["shot_outcome"] == "Hit the bar":
            event_str += " The shot had hit the bar."
        elif row["shot_outcome"] == "Blocked":
            event_str += " The shot was blocked by the opponent team."
        elif row["shot_outcome"] == "On target":
            event_str += f" The shot was on target and was placed at the {row['shot_place']} of the goal."
        else:
            event_str += f" The shot was off target and was flying {row['shot_place']}."

    return event_str

In [24]:
training_sys_role = "You are a football commentator, who summarizes the major events of a game. "
training_sys_role += "You are given the contexts for the game, as well as the final results of the game. "
training_sys_role += (
    "But do not use the context or the results in your final summary of the game, just summarize the events."
)

In [25]:
grouped_data = filtered_combined_data.groupby("id_odsp", sort=False)
json_objects = []
for id_odsp, group_data in grouped_data:
    sorted_group = group_data.sort_values(by="time")
    first_row = sorted_group.iloc[0]

    context_str = f"It is a football match in the {first_row['season']} season of the first division league of {first_row['country']}. "
    context_str += f"The match is played in the home turf of {first_row['ht']} on {first_row['date']}, between home team {first_row['ht']} and away team {first_row['at']}."
    result_str = f"The final result of the game is {first_row['ftag']} goals for {first_row['at']} vs {first_row['fthg']} goals for {first_row['ht']}. "
    result_str += (
        f"The winner is the away team {first_row['at']}."
        if first_row["fthg"] < first_row["ftag"]
        else (
            f"The winner is the home team {first_row['ht']}."
            if first_row["fthg"] > first_row["ftag"]
            else "The match ended in a draw."
        )
    )
    negative_prompt = "I would again remind you not to use the context or the results in your summary of the game, rather just keep it limited to describing only the minute-by-minute events that happened in the game."

    events = list()
    commentaries = list()
    for _, row in sorted_group.iterrows():
        event_str = generate_event_string(row)
        events.append(event_str)
        commentaries.append(row["text"])

    json_object = {
        "messages": [
            {"role": "system", "content": training_sys_role},
            {"role": "user", "content": " ".join([context_str, " ".join(events), result_str, negative_prompt])},
            {"role": "assistant", "content": " ".join(commentaries)},
        ]
    }
    json_objects.append(json_object)

In [26]:
len(json_objects)

9074

In [27]:
json_objects[0]

{'messages': [{'role': 'system',
   'content': 'You are a football commentator, who summarizes the major events of a game. You are given the contexts for the game, as well as the final results of the game. But do not use the context or the results in your final summary of the game, just summarize the events.'},
  {'role': 'user',
   'content': 'It is a football match in the 2012 season of the first division league of germany. The match is played in the home turf of Borussia Dortmund on 2011-08-05, between home team Borussia Dortmund and away team Hamburg SV. On minute 2, the game is being played on Left side of the box in the field, and Attempt happens accompanied with Key Pass. The primary player involved in the event was mladen petric who used his left foot for the shot with the secondary player involved being gokhan tore.  The shot was off target and was flying High and wide. On minute 14, the game is being played on Outside the box in the field, and Attempt happens accompanied with

In [28]:
fine_tune_json = Path("..", "resources", "football_fine_tune.jsonl")
with open(fine_tune_json, "w") as file:
    for obj in json_objects[:7000]:
        file.write(json.dumps(obj) + "\n")

In [29]:
fine_tune_json = Path("..", "resources", "football_fine_tune_validation.jsonl")
with open(fine_tune_json, "w") as file:
    for obj in json_objects[7000:]:
        file.write(json.dumps(obj) + "\n")