In [146]:
import os
import sys
import json

import pandas as pd
import matplotlib.pyplot as plt

sys.path.append('../src')
from settings import YOUTUBE_DIR, TIMESTAMPS_DIR

In [2]:
with open(YOUTUBE_DIR / 'matches.jsonlines', 'r') as fin:
    lines = [json.loads(line) for line in fin]
df_matches = pd.DataFrame(lines)
df_matches['video_id'] = df_matches['video_url'].replace('https://www.youtube.com/watch\?v=', '', regex=True)
df_matches = df_matches.drop_duplicates()
df_matches = df_matches.sort_values('confidence', ascending=False)
df_matches[['match_id', 'video_id', 'radiant_name', 'dire_name', 'title', 'confidence']]

Unnamed: 0,match_id,video_id,radiant_name,dire_name,title,confidence
0,6679653510,G4UcPjn7YFQ,Talon,Polaris Esports,TALON vs POLARIS | ESL ONE MALAYSIA 2022,3
2,6678365022,ieh5x4uz-OE,BOOM Esports,Talon,BOOM vs TALON | ESL ONE MALAYSIA 2022 SEA QUAL...,3
1,6678583793,r4rbY6zliTQ,Polaris Esports,T1,T1 vs POLARIS | ESL ONE MALAYSIA 2022 SEA QUAL...,2
3,6677043559,csurev0fmXI,BOOM Esports,Team SMG,BOOM vs SMG | ESL ONE MALAYSIA 2022 SEA QUALIF...,2
5,6676051545,cXA5Hw2boLA,Team Spirit,Team Secret,SECRET vs TEAM SPIRIT | RIYADH MASTERS 2022,2
7,6674755794,bzFzhRdT3cA,Royal Never Give Up,Team Spirit,TEAM SPIRIT vs RNG | RIYADH MASTERS 2022,2
9,6674755794,MANLkT4D0b8,Royal Never Give Up,Team Spirit,LIQUID vs RNG | RIYADH MASTERS 2022,2
4,6676393091,XTngObUROMo,PSG.LGD,Team Spirit,TEAM SPIRIT vs PSG LGD | RIYADH MASTERS 2022,1
6,6675726202,qIOA87h5Msc,PSG.LGD,OG,OG vs PSG LGD | RIYADH MASTERS 2022,1
8,6674445805,84XMNFPtL68,PSG.LGD,Nigma Galaxy,NIGMA vs PSG LGD | RIYADH MASTERS 2022,1


In [139]:
events = []
for file in os.listdir(TIMESTAMPS_DIR):
    if file == '.DS_Store':
        continue
    
    with open(TIMESTAMPS_DIR / file, 'r') as fin:
        video_id = file.split('.')[0]
        for line in fin:
            event = json.loads(line)
            event['video_id'] = video_id
            events.append(event)

df_timestamps = pd.DataFrame(events)

In [140]:
df_timestamps['int_frame_id'] = df_timestamps['frame_id'].astype(int)
df_timestamps.sort_values('int_frame_id', inplace=True)
df_timestamps = df_timestamps[pd.notnull(df_timestamps['timestamp'])].copy()
df_timestamps['timestamp'] = df_timestamps['timestamp'].astype(int)
df_timestamps

Unnamed: 0,video_id,frame_id,text,timestamp,int_frame_id
3897,wUPhtY4sNP0,001,3.55,235,1
3898,wUPhtY4sNP0,002,355,235,2
3899,wUPhtY4sNP0,003,3.56,236,3
3900,wUPhtY4sNP0,004,3.57,237,4
3901,wUPhtY4sNP0,005,3.58,238,5
...,...,...,...,...,...
3109,G4UcPjn7YFQ,2217,3317,1997,2217
3110,G4UcPjn7YFQ,2218,3317,1997,2218
3111,G4UcPjn7YFQ,2219,3317,1997,2219
3113,G4UcPjn7YFQ,2220,3317,1997,2220


In [152]:
df_timestamps['timestamp'].diff().describe()

count     3818.000000
mean         0.461498
std       1836.317649
min     -25168.000000
25%       -248.000000
50%          1.000000
75%        255.000000
max      24628.000000
Name: timestamp, dtype: float64

In [141]:
def dota_clock_format(seconds):
    minutes = str(seconds // 60).zfill(2)
    secs = str(seconds % 60).zfill(2)
    return f'{minutes}:{secs}'

In [142]:
from sklearn.cluster import DBSCAN

model = DBSCAN(eps=5, min_samples=5)
cluster = model.fit_predict(df_timestamps['timestamp'].to_numpy().reshape(-1, 1))
df_timestamps['cluster'] = cluster
df_timestamps['clock'] = df_timestamps['timestamp'].apply(dota_clock_format)

In [143]:
df_timestamps

Unnamed: 0,video_id,frame_id,text,timestamp,int_frame_id,cluster,clock
3897,wUPhtY4sNP0,001,3.55,235,1,0,03:55
3898,wUPhtY4sNP0,002,355,235,2,0,03:55
3899,wUPhtY4sNP0,003,3.56,236,3,0,03:56
3900,wUPhtY4sNP0,004,3.57,237,4,0,03:57
3901,wUPhtY4sNP0,005,3.58,238,5,0,03:58
...,...,...,...,...,...,...,...
3109,G4UcPjn7YFQ,2217,3317,1997,2217,12,33:17
3110,G4UcPjn7YFQ,2218,3317,1997,2218,12,33:17
3111,G4UcPjn7YFQ,2219,3317,1997,2219,12,33:17
3113,G4UcPjn7YFQ,2220,3317,1997,2220,12,33:17


In [144]:
df_timestamps[df_timestamps['cluster'] == -1]

Unnamed: 0,video_id,frame_id,text,timestamp,int_frame_id,cluster,clock
923,ieh5x4uz-OE,252,21440,12880,252,-1,214:40
927,ieh5x4uz-OE,256,214,134,256,-1,02:14
3563,G4UcPjn7YFQ,666,11224,6744,666,-1,112:24
1385,ieh5x4uz-OE,714,216,136,714,-1,02:16
1405,ieh5x4uz-OE,734,214,134,734,-1,02:14
3753,G4UcPjn7YFQ,856,216,136,856,-1,02:16
226,ieh5x4uz-OE,1115,- 442,-282,1115,-1,-5:18
230,ieh5x4uz-OE,1119,11212,6732,1119,-1,112:12
233,ieh5x4uz-OE,1121,11814,7094,1121,-1,118:14
2030,G4UcPjn7YFQ,1236,41410,24850,1236,-1,414:10


In [121]:
df_timestamps = df_timestamps[df_timestamps['cluster'] != -1].copy()

In [130]:
df_highlights = df_timestamps.groupby(['video_id', 'cluster']).agg({'clock': ['first', 'last']})
df_highlights.sort_values(['video_id', ('clock', 'first')])
df_highlights.reset_index(inplace=True)
df_highlights.columns = ['video_id', 'cluster', 'start', 'end']
df_highlights

Unnamed: 0,video_id,cluster,start,end
0,G4UcPjn7YFQ,0,03:27,03:52
1,G4UcPjn7YFQ,1,00:45,01:07
2,G4UcPjn7YFQ,2,07:34,08:34
3,G4UcPjn7YFQ,3,14:00,14:01
4,G4UcPjn7YFQ,4,09:52,10:23
5,G4UcPjn7YFQ,5,10:45,11:18
6,G4UcPjn7YFQ,6,17:54,20:38
7,G4UcPjn7YFQ,7,12:22,12:27
8,G4UcPjn7YFQ,8,21:22,24:30
9,G4UcPjn7YFQ,9,26:05,25:54


In [136]:
df = pd.merge(
    df_matches[['match_id', 'video_id', 'radiant_name', 'dire_name', 'confidence']], 
    df_highlights, 
    on=['video_id']
)
df = df.sort_values(['match_id', 'start'])
df

Unnamed: 0,match_id,video_id,radiant_name,dire_name,confidence,cluster,start,end
36,6678365022,ieh5x4uz-OE,BOOM Esports,Talon,3,1,00:41,01:09
35,6678365022,ieh5x4uz-OE,BOOM Esports,Talon,3,0,03:17,04:00
49,6678365022,ieh5x4uz-OE,BOOM Esports,Talon,3,14,04:13,04:44
37,6678365022,ieh5x4uz-OE,BOOM Esports,Talon,3,2,08:47,09:07
39,6678365022,ieh5x4uz-OE,BOOM Esports,Talon,3,4,09:45,10:06
40,6678365022,ieh5x4uz-OE,BOOM Esports,Talon,3,5,11:03,11:31
42,6678365022,ieh5x4uz-OE,BOOM Esports,Talon,3,7,12:22,12:24
55,6678365022,ieh5x4uz-OE,BOOM Esports,Talon,3,22,12:33,13:03
38,6678365022,ieh5x4uz-OE,BOOM Esports,Talon,3,3,14:07,14:33
50,6678365022,ieh5x4uz-OE,BOOM Esports,Talon,3,16,15:24,15:35


# TODO

- 1 video contains 2-3 matches