In [None]:
#| hide
#| default_exp clean_datasets

# Clean datasets

> Clean raw datasets for the Shiny app.

In [None]:
#| export

import warnings
warnings.filterwarnings('ignore')

import json
import logging
import os
import requests
from glob import glob

import pandas as pd

# NOTE: Had to install the package with the following command for the import to work.
# python3 -m pip install -e '.[dev]'
from isl_2024.utils import *

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
# Resolve the directory this code is running from, then derive the project
# folders (logs, raw data, cleaned data) relative to its parent directory.
try:
    # This will work when running as a script
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # This will work when running in a Jupyter notebook
    # (`__file__` is not defined in an interactive kernel).
    script_dir = os.getcwd()

parent_dir = os.path.abspath(os.path.join(script_dir, os.pardir))
log_dir = os.path.join(parent_dir, 'logs')
data_dir = os.path.join(parent_dir, 'data')
# Built from `data_dir` so the path never mixes separators on Windows.
clean_data_dir = os.path.join(data_dir, 'clean')

# `exist_ok=True` makes the creation idempotent without a separate
# exists-check (which is racy), and raises a clear FileExistsError if one of
# the paths already exists as a regular file instead of silently skipping it.
for _required_dir in (log_dir, data_dir, clean_data_dir):
    os.makedirs(_required_dir, exist_ok=True)

# Append INFO-and-above records to logs/clean_datasets.log.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=os.path.join(log_dir, 'clean_datasets.log'),
    filemode='a',
)

# Matches

In [None]:
#| export

# Build the cleaned fixtures table (data/clean/matches.csv) from the raw
# matches.txt file.
#
# Only the last non-blank line of the file is parsed.
# NOTE(review): presumably the file is an append-only log with one JSON
# snapshot per line, newest last — confirm against the code that writes it.
with open(os.path.join(data_dir, 'matches.txt'), encoding='utf-8') as f:
    # Skip blank lines so a trailing empty line does not get parsed as JSON.
    snapshot_lines = [line for line in f if line.strip()]

if not snapshot_lines:
    raise ValueError('matches.txt contains no JSON snapshot to clean')

matches = json.loads(snapshot_lines[-1])['matches']

# One flat record per match.
# NOTE(review): participants[0] is treated as the home side and
# participants[1] as the away side — confirm the feed guarantees this order.
match_columns = ['start_at', 'end_at', 'home_team', 'away_team', 'score']
match_records = [
    {
        'start_at': match['start_date'],
        'end_at': match['end_date'],
        'home_team': match['participants'][0]['name'],
        'away_team': match['participants'][1]['name'],
        # The feed's 'winning_margin' field is stored as the score column.
        'score': match['winning_margin'],
    }
    for match in matches
]

# Passing `columns` keeps the expected schema even when `matches` is empty,
# so the datetime conversion below does not fail with a KeyError.
df = pd.DataFrame(match_records, columns=match_columns)
df['start_at'] = pd.to_datetime(df['start_at'])
df['end_at'] = pd.to_datetime(df['end_at'])
# 1-based identifier following the order of matches in the snapshot.
df.insert(0, 'match_id', range(1, len(df) + 1))

df.to_csv(os.path.join(clean_data_dir, 'matches.csv'), index=False)

print(df.shape)
df.head()

(84, 6)


Unnamed: 0,match_id,start_at,end_at,home_team,away_team,score
0,1,2024-09-13 19:30:00+05:30,2024-09-13 21:30:00+05:30,Mohun Bagan Super Giant,Mumbai City FC,2 - 2
1,2,2024-09-14 17:00:00+05:30,2024-09-14 19:00:00+05:30,Odisha FC,Chennaiyin FC,2 - 3
2,3,2024-09-14 19:30:00+05:30,2024-09-14 21:30:00+05:30,Bengaluru FC,East Bengal FC,1 - 0
3,4,2024-09-15 19:30:00+05:30,2024-09-15 21:30:00+05:30,Kerala Blasters FC,Punjab FC,
4,5,2024-09-16 19:30:00+05:30,2024-09-16 21:30:00+05:30,Mohammedan SC,NorthEast United FC,


# Live stats

In [None]:
# Load the raw live-stats file for a single match (66796.txt) so the cells
# below can inspect individual lines. `lines`, `first` and `last` are only
# assigned if the file exists.
files = glob(os.path.join(data_dir, 'live_stats', '66796.txt'))
for file in files:
    with open(file, encoding='utf-8') as f:
        lines = list(f)
    first, last = lines[0], lines[-1]

In [None]:
# Number of lines read from the live-stats file; the cells below parse
# individual lines as standalone JSON documents.
len(lines)

171

In [None]:
# Team-level 'touches' stats for the first team (teams[0]) in the line at
# index 78. NOTE(review): 78 looks like an arbitrary mid-file sample — confirm.
json.loads(lines[78])['teams'][0]['stats']['touches']

{'total': 279,
 'total_passes': 188,
 'good_passes': 154,
 'bad_passes': 34,
 'pass_accuracy_percentage': 82.0,
 'interceptions': 3,
 'blocks': 3,
 'tackles': 6,
 'successful_tackles': 6,
 'unsuccessful_tackles': 0,
 'successful_tackle_percentage': 1.0,
 'clearance': 10,
 'aerial_clearance': 3,
 'saves': 2,
 'cleansheet': 0,
 'take_on_total': 6,
 'take_on_successful': 2,
 'take_on_unsuccessful': 4,
 'successful_take_on_percentage': 0.0,
 'last_man_tackle_successful': 0,
 'last_man_tackle_unsuccessful': 0,
 'possession_lost': 55,
 'aerial_duel': {'total': 4, 'won': 1, 'lost': 3, 'percentage': 0.25},
 'ground_duel': {'total': 39, 'won': 22, 'lost': 17, 'percentage': 0.56},
 'pass_length': None}

In [None]:
# Player-level 'touches' stats for the entry at players[3] of the first team,
# taken from the line at index 20 (an early line of the file).
json.loads(lines[20])['teams'][0]['players'][3]['touches']

{'total': 13,
 'total_passes': 12,
 'good_passes': 10,
 'bad_passes': 2,
 'successful_passes': None,
 'pass_accuracy_percentage': 83.0,
 'pass_compared_percentage': 20.0,
 'touch_compared_percentage': 15.000001,
 'interceptions': 0,
 'blocks': 0,
 'tackles': 0,
 'successful_tackles': 0,
 'unsuccessful_tackles': 0,
 'successful_tackle_percentage': 0.0,
 'clearance': 0,
 'aerial_clearance': 0,
 'saves': 0,
 'take_on_total': 0,
 'take_on_successful': 0,
 'take_on_unsuccessful': 0,
 'successful_take_on_percentage': 0.0,
 'last_man_tackle_successful': 0,
 'last_man_tackle_unsuccessful': 0,
 'interceptions_won': 0,
 'possession_lost': 2,
 'aerial_duel': {'total': 1, 'won': 0, 'lost': 1, 'percentage': 0.0},
 'ground_duel': {'total': 0, 'won': 0, 'lost': 0, 'percentage': 0.0},
 'pass_length': None}

In [None]:
# Same lookup on the last line of the file.
# NOTE(review): assumes players[3] is the same player in every line — verify
# that the player ordering is stable before comparing values across lines.
json.loads(lines[-1])['teams'][0]['players'][3]['touches']

{'total': 51,
 'total_passes': 43,
 'good_passes': 36,
 'bad_passes': 7,
 'successful_passes': None,
 'pass_accuracy_percentage': 84.0,
 'pass_compared_percentage': 14.0,
 'touch_compared_percentage': 10.0,
 'interceptions': 1,
 'blocks': 0,
 'tackles': 1,
 'successful_tackles': 1,
 'unsuccessful_tackles': 0,
 'successful_tackle_percentage': 1.0,
 'clearance': 3,
 'aerial_clearance': 2,
 'saves': 0,
 'take_on_total': 0,
 'take_on_successful': 0,
 'take_on_unsuccessful': 0,
 'successful_take_on_percentage': 0.0,
 'last_man_tackle_successful': 0,
 'last_man_tackle_unsuccessful': 0,
 'interceptions_won': 1,
 'possession_lost': 7,
 'aerial_duel': {'total': 3, 'won': 1, 'lost': 2, 'percentage': 0.33},
 'ground_duel': {'total': 1, 'won': 1, 'lost': 0, 'percentage': 1.0},
 'pass_length': None}