In [13]:
%load_ext autoreload
%autoreload 2

import sys
from datetime import datetime, timedelta

import requests
import torch
import torch.nn.functional as F
from tenacity import retry, stop_after_attempt, wait_fixed
from loguru import logger
from transformers import DistilBertTokenizer, DistilBertModel

sys.path.append('../src')
from youtube import get_video_metadata
from settings import YOUTUBE_DIR

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Query Matches

In [14]:
@retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
def query_opendota(sql, **kwargs):
    """Run a SQL query against the OpenDota explorer endpoint and return its rows.

    Args:
        sql: SQL template; ``str.format`` placeholders are filled from ``kwargs``.
        **kwargs: Values substituted into the template.

    Returns:
        The ``rows`` entry of the JSON response.

    Raises:
        Any exception raised inside the call (HTTP error status, timeout,
        missing ``rows`` key, ...) triggers a retry: up to 3 attempts, 2 seconds
        apart, as configured on the ``retry`` decorator.
    """
    query = sql.format(**kwargs)
    # Log the query in its readable multi-line form before flattening it.
    logger.debug(query)
    # Collapse every run of whitespace (newlines, tabs, CRs, repeated spaces)
    # into a single space so the query is sent as one line.
    query = ' '.join(query.split())
    response = requests.get(
        'https://api.opendota.com/api/explorer',
        params={'sql': query},
        # Without a timeout a stalled connection would block forever and the
        # retry policy above would never get a chance to kick in.
        timeout=30,
    )
    response.raise_for_status()
    return response.json()['rows']

In [15]:
# SQL template for the OpenDota explorer.
# Selects matches together with the names/tags of both teams and the league
# name, restricted to a start_time window, newest first, capped at 100 rows.
# The {start_time} and {end_time} placeholders are filled in by
# query_opendota via str.format; they are embedded in `timestamp '...'`
# literals and converted to epoch seconds for comparison with start_time.
# NOTE(review): the LIMIT means a busy window can be truncated to its 100 most
# recent matches - confirm this is acceptable for the chosen window size.
query = '''
SELECT
    match_id,
    start_time,
    matches.leagueid,
    leagues.name as league,
    radiant_team_id,
    radiant_team.name as radiant_name,
    radiant_team.tag as radiant_tag,
    dire_team_id,
    dire_team.name as dire_name,
    dire_team.tag as dire_tag
FROM
    matches
    join teams as dire_team on matches.dire_team_id = dire_team.team_id
    join teams as radiant_team on matches.radiant_team_id = radiant_team.team_id
    join leagues on matches.leagueid = leagues.leagueid
WHERE
    start_time >= extract(epoch from timestamp '{start_time}')
    and start_time <= extract(epoch from timestamp '{end_time}')
ORDER BY
    start_time desc
LIMIT
    100
'''

In [16]:
# Smoke test of the query helper. Dates are written in ISO 8601 (YYYY-MM-DD)
# so the SQL timestamp literals cannot be misread as day-month vs month-day;
# this is the same 7 April - 7 May 2022 window as before.
query_opendota(query, start_time='2022-04-07', end_time='2022-05-07')[:3]

2022-08-04 12:18:46.157 | DEBUG    | __main__:query_opendota:4 - 
SELECT
    match_id,
    start_time,
    matches.leagueid,
    leagues.name as league,
    radiant_team_id,
    radiant_team.name as radiant_name,
    radiant_team.tag as radiant_tag,
    dire_team_id,
    dire_team.name as dire_name,
    dire_team.tag as dire_tag
FROM
    matches
    join teams as dire_team on matches.dire_team_id = dire_team.team_id
    join teams as radiant_team on matches.radiant_team_id = radiant_team.team_id
    join leagues on matches.leagueid = leagues.leagueid
WHERE
    start_time >= extract(epoch from timestamp '04-07-2022')
    and start_time <= extract(epoch from timestamp '05-07-2022')
ORDER BY
    start_time desc
LIMIT
    100



[{'match_id': 6558371729,
  'start_time': 1651880157,
  'leagueid': 14132,
  'league': 'BTC Joy Race',
  'radiant_team_id': 7121518,
  'radiant_name': 'Unknown Team',
  'radiant_tag': 'Unknown',
  'dire_team_id': 8254112,
  'dire_name': 'Omega Gaming',
  'dire_tag': 'Ωmega'},
 {'match_id': 6558357364,
  'start_time': 1651878656,
  'leagueid': 14166,
  'league': 'BTS Pro Series 11',
  'radiant_team_id': 8376426,
  'radiant_name': 'Wildcard Gaming',
  'radiant_tag': 'WC',
  'dire_team_id': 8604954,
  'dire_name': 'DogChamp',
  'dire_tag': 'Dog'},
 {'match_id': 6558318351,
  'start_time': 1651875582,
  'leagueid': 14166,
  'league': 'BTS Pro Series 11',
  'radiant_team_id': 8604954,
  'radiant_name': 'DogChamp',
  'radiant_tag': 'Dog',
  'dire_team_id': 8376426,
  'dire_name': 'Wildcard Gaming',
  'dire_tag': 'WC'}]

# Get Titles

In [17]:
# Load one video URL per line. Blank lines (e.g. a trailing newline at the end
# of the file) are skipped so no empty URL reaches the metadata lookup, and the
# encoding is pinned so the read does not depend on the platform default.
with open(YOUTUBE_DIR / 'urls.txt', 'r', encoding='utf-8') as fin:
    urls = [url for url in (line.strip() for line in fin) if url]
urls[:3], len(urls)

(['https://www.youtube.com/watch?v=G4UcPjn7YFQ',
  'https://www.youtube.com/watch?v=r4rbY6zliTQ',
  'https://www.youtube.com/watch?v=ieh5x4uz-OE'],
 1964)

In [18]:
# Pick one sample video and pull the two fields the search needs:
# the full title and the upload date (delivered as a YYYYMMDD string).
video_id, metadata = get_video_metadata(urls[40])
title = metadata['fulltitle']
upload_date = datetime.strptime(metadata['upload_date'], '%Y%m%d')
(title, upload_date)

('RNG vs XTREME - MATCH for ARLINGTON MAJOR! - DPC 2022 CN TOUR 3 SUMMER Dota 2 Highlights',
 datetime.datetime(2022, 7, 8, 0, 0))

# Search

In [19]:
# Tokenizer and encoder must come from the same checkpoint, so name it once.
CHECKPOINT = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(CHECKPOINT)
model = DistilBertModel.from_pretrained(CHECKPOINT)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Dota Digest Format

In [20]:
# Titles in this format read "<teams> - <comment> - <tournament> Dota 2 Highlights".
# Split on the spaced separator only, so a hyphen inside a team or tournament
# name (e.g. "2021-2022") does not produce extra fields and break the parse.
title_parts = [part.strip() for part in title.split(' - ')]
if len(title_parts) < 2:
    raise ValueError(f'Unexpected title format: {title!r}')
# First field is the pairing, last field is the tournament; anything in
# between is treated as the free-text comment.
teams = title_parts[0]
comment = ' - '.join(title_parts[1:-1])
tournament = title_parts[-1].replace(' Dota 2 Highlights', '')
teams, tournament

('RNG vs XTREME', 'DPC 2022 CN TOUR 3 SUMMER')

In [21]:
# Search window around the upload date: from two days before to one day after.
end_time = upload_date + timedelta(days=1)
start_time = upload_date - timedelta(days=2)
# ISO 8601 dates (YYYY-MM-DD) keep the SQL timestamp literals unambiguous,
# whereas MM-DD-YYYY can be read as DD-MM-YYYY depending on server settings.
matches = query_opendota(
    query,
    start_time=start_time.strftime('%Y-%m-%d'),
    end_time=end_time.strftime('%Y-%m-%d'),
)

2022-08-04 12:19:05.447 | DEBUG    | __main__:query_opendota:4 - 
SELECT
    match_id,
    start_time,
    matches.leagueid,
    leagues.name as league,
    radiant_team_id,
    radiant_team.name as radiant_name,
    radiant_team.tag as radiant_tag,
    dire_team_id,
    dire_team.name as dire_name,
    dire_team.tag as dire_tag
FROM
    matches
    join teams as dire_team on matches.dire_team_id = dire_team.team_id
    join teams as radiant_team on matches.radiant_team_id = radiant_team.team_id
    join leagues on matches.leagueid = leagues.leagueid
WHERE
    start_time >= extract(epoch from timestamp '07-06-2022')
    and start_time <= extract(epoch from timestamp '07-09-2022')
ORDER BY
    start_time desc
LIMIT
    100



## Tournament Only

In [22]:
# Unique league names in first-seen order. dict.fromkeys de-duplicates like a
# set but keeps a stable order, so indices into this list (and the displayed
# output) are reproducible across kernel restarts.
tournaments = list(dict.fromkeys(m['league'] for m in matches))
tournaments

['Ultras Dota Pro League ',
 'Destiny league',
 'DPC CN Division II Summer Tour - 2021/2022 presented by Perfect World Esports',
 'DPC NA Division II Summer Tour - 2021/2022 - ESL One Summer presented by Intel',
 'DPC SA Division I  Summer Tour – 2021/2022 by 4D Esports',
 'Chaos Realm League',
 'DPC SA Division II Summer Tour – 2021/2022 by 4D Esports',
 'DPC 2021-2022 Summer Tour (EEU) Division I by Beyond The Summit',
 'DPC CN Division I Summer Tour - 2021/2022 presented by Perfect World Esports',
 'DPC 2021-2022 Summer Tour (EEU) Division II by Beyond The Summit',
 'Nepal E-Sports Championship 2022',
 'Dota 2 Champions League 2021-2022 Season',
 'DPC SEA Division II Tour 3 - 2021/2022 by Beyond The Summit',
 'DPC SEA Division I Tour 3 - 2021/2022 by Beyond The Summit',
 'Titus Colosseum Cup',
 'Knights Arena']

In [23]:
# Row 0 is the tournament parsed from the video title; the remaining rows are
# the candidate league names.
inputs = tokenizer([tournament] + tournaments, return_tensors='pt', padding=True)
# Inference only: run the forward pass without autograd so no computation
# graph is built and kept alive by the returned activations.
with torch.no_grad():
    outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state

In [47]:
with torch.no_grad():
    # Mean-pool the token embeddings into one vector per string.
    # NOTE(review): the batch was tokenized with padding=True, so this mean
    # also averages over padding positions - consider weighting by the
    # attention mask.
    tournaments_embeddings = torch.mean(last_hidden_states, dim=1)
    # Row 0 is the query; compare it against every candidate row.
    similarities = F.cosine_similarity(tournaments_embeddings[0:1], tournaments_embeddings[1:], dim=1)
    idx_closest = torch.argmax(similarities)
    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    tournament_top_ids = torch.topk(similarities, k=min(5, similarities.numel())).indices
print()
for idx in tournament_top_ids:
    print(tournaments[idx])


DPC SA Division II Summer Tour – 2021/2022 by 4D Esports
DPC SA Division I  Summer Tour – 2021/2022 by 4D Esports
DPC SEA Division I Tour 3 - 2021/2022 by Beyond The Summit
DPC CN Division II Summer Tour - 2021/2022 presented by Perfect World Esports
Nepal E-Sports Championship 2022


In [48]:
# Best-scoring league name (argmax of the cosine similarities).
tournaments[idx_closest.item()]

'DPC SA Division II Summer Tour – 2021/2022 by 4D Esports'

## Full Title

In [49]:
# Query string in the same "<teams> | <tournament>" layout as the candidates.
formatted_title = ' | '.join((teams, tournament))
formatted_title

'RNG vs XTREME | DPC 2022 CN TOUR 3 SUMMER'

In [50]:
# Map every "<A> vs <B> | <league>" spelling of a match to its match_id.
# Each match contributes four spellings: team tags and team names, each in
# both side orders. A later match with an identical spelling overwrites the
# earlier match_id.
title_candidates = {}
for m in matches:
    match_id = m['match_id']
    league = m['league']
    pairings = (
        (m['radiant_tag'], m['dire_tag']),
        (m['radiant_name'], m['dire_name']),
    )
    for radiant, dire in pairings:
        title_candidates[f'{radiant} vs {dire} | {league}'] = match_id
        title_candidates[f'{dire} vs {radiant} | {league}'] = match_id
title_candidates_keys = list(title_candidates)

In [51]:
# Row 0 is the formatted video title; the remaining rows are the candidates.
inputs = tokenizer([formatted_title] + title_candidates_keys, return_tensors='pt', padding=True)
# Inference only: run the forward pass without autograd so no computation
# graph is built and kept alive by the returned activations.
with torch.no_grad():
    outputs = model(**inputs)
hidden_state = outputs.last_hidden_state

In [52]:
with torch.no_grad():
    # Mean-pool the token embeddings into one vector per string.
    # NOTE(review): the batch was tokenized with padding=True, so this mean
    # also averages over padding positions - consider weighting by the
    # attention mask.
    embeddings = torch.mean(hidden_state, dim=1)
    # Row 0 is the query; compare it against every candidate row.
    similarities = F.cosine_similarity(embeddings[0:1], embeddings[1:], dim=1)
    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    top_ids = torch.topk(similarities, k=min(5, similarities.numel())).indices

print(formatted_title)
print()
for idx in top_ids:
    print(title_candidates_keys[idx])

RNG vs XTREME | DPC 2022 CN TOUR 3 SUMMER

BOOM vs T1 | DPC SEA Division I Tour 3 - 2021/2022 by Beyond The Summit
T1 vs BOOM | DPC SEA Division I Tour 3 - 2021/2022 by Beyond The Summit
RNG vs XG | DPC CN Division I Summer Tour - 2021/2022 presented by Perfect World Esports
XG vs RNG | DPC CN Division I Summer Tour - 2021/2022 presented by Perfect World Esports
BOOM Esports vs T1 | DPC SEA Division I Tour 3 - 2021/2022 by Beyond The Summit


## Names

In [56]:
# Map every "<A> vs <B>" spelling of a match (tags and names, both side
# orders) to its match_id. A later match with an identical spelling
# overwrites the earlier match_id.
team_candidates = {}
for m in matches:
    league = m['league']
    # Optional league filter, currently disabled:
    # if league != tournaments[idx_closest]:
    # if league not in ((tournaments[idx] for idx in tournament_top_ids)):
        # continue
    match_id = m['match_id']
    pairings = (
        (m['radiant_tag'], m['dire_tag']),
        (m['radiant_name'], m['dire_name']),
    )
    for radiant, dire in pairings:
        team_candidates[f'{radiant} vs {dire}'] = match_id
        team_candidates[f'{dire} vs {radiant}'] = match_id
team_candidates_keys = list(team_candidates)

In [57]:
# Row 0 is the "<A> vs <B>" part of the video title; the rest are candidates.
inputs = tokenizer([teams] + team_candidates_keys, return_tensors='pt', padding=True)
# Inference only: run the forward pass without autograd so no computation
# graph is built and kept alive by the returned activations.
with torch.no_grad():
    outputs = model(**inputs)
hidden_state = outputs.last_hidden_state

In [58]:
with torch.no_grad():
    # Mean-pool the token embeddings into one vector per string.
    # NOTE(review): the batch was tokenized with padding=True, so this mean
    # also averages over padding positions - consider weighting by the
    # attention mask.
    embeddings = torch.mean(hidden_state, dim=1)
    # Row 0 is the query; compare it against every candidate row.
    similarities = F.cosine_similarity(embeddings[0:1], embeddings[1:], dim=1)
    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    top_ids = torch.topk(similarities, k=min(3, similarities.numel())).indices

# Index the matches once instead of rescanning the whole list for every hit.
matches_by_id = {m['match_id']: m for m in matches}

print(formatted_title)
print()
for idx in top_ids:
    candidate = team_candidates_keys[idx]
    print(candidate)
    # Resolve the candidate spelling back to the match it was generated from.
    match = matches_by_id.get(team_candidates[candidate])
    if match is None:
        continue
    radiant_name = match['radiant_name']
    dire_name = match['dire_name']
    league = match['league']
    found = f'{radiant_name} vs {dire_name} | {league}'
    print(found)
    print()

RNG vs XTREME | DPC 2022 CN TOUR 3 SUMMER

RNG vs XG
Royal Never Give Up vs Xtreme  Gaming | DPC CN Division I Summer Tour - 2021/2022 presented by Perfect World Esports

XG vs RNG
Royal Never Give Up vs Xtreme  Gaming | DPC CN Division I Summer Tour - 2021/2022 presented by Perfect World Esports

SG esports vs beastcoast
beastcoast vs SG esports | DPC SA Division I  Summer Tour – 2021/2022 by 4D Esports



### Use all strategies
1. Teams + Tournament
2. Teams -> Tournament
3. Tournament -> Teams

In [61]:
# Dimensions of the encoder output for the last tokenized batch.
hidden_state.size()

torch.Size([167, 13, 768])