In [None]:
# packages
from datetime import datetime
import os
import pandas as pd
from pathlib import Path
import concurrent.futures
import json
import warnings
import inflection


In [None]:
# config
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)


In [None]:
# utilities

####
def read_and_parse_file(file_path, start_index, end_index):
    """Load a ``.js`` feed file and parse the JSON embedded in it.

    The file content is stripped of surrounding whitespace, then the first
    ``start_index`` characters and the last ``end_index`` characters are
    removed, and what remains is parsed as JSON.

    Args:
        file_path: Path of the file to read.
        start_index: Number of leading characters to drop.
        end_index: Number of trailing characters to drop; ``0`` keeps the
            whole tail.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the remaining text is not valid JSON.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read().strip()

    # ``text[start:-0]`` would be an empty string, so map 0 to "no cut".
    stop = -end_index if end_index else None
    return json.loads(text[start_index:stop])


def read_json_files(folder_path, data_type, start_index, end_index):
    """Parse every ``*{data_type}.js`` file found under *folder_path*.

    Files are discovered recursively and parsed concurrently with
    ``read_and_parse_file``.

    Args:
        folder_path: Root folder to search.
        data_type: Suffix of the file stem to match (the glob pattern is
            ``*{data_type}.js``).
        start_index: Number of leading characters to drop from each file.
        end_index: Number of trailing characters to drop from each file.

    Returns:
        A list with one parsed JSON value per file, ordered by sorted file
        path so repeated runs produce the same order.
    """
    # Sorting removes the dependency on directory-walk order.
    file_paths = sorted(Path(folder_path).rglob(f"*{data_type}.js"))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # ``map`` yields results in submission order (``as_completed`` yields
        # them in whichever order the threads happen to finish).
        return list(executor.map(
            lambda path: read_and_parse_file(path, start_index, end_index),
            file_paths))


def read_csv_files(folder_path):
    """Read every ``*.csv`` file under *folder_path* into one DataFrame.

    Files are discovered recursively, read concurrently with
    ``pd.read_csv`` and concatenated with a fresh 0..n-1 index.

    Args:
        folder_path: Root folder to search.

    Returns:
        The concatenation of all files, in sorted file-path order so the row
        order is reproducible between runs.

    Raises:
        ValueError: If no CSV file is found under *folder_path*.
    """
    file_paths = sorted(Path(folder_path).rglob("*.csv"))
    if not file_paths:
        # pd.concat([]) would raise the same exception type with a message
        # that does not mention the folder.
        raise ValueError(f"No CSV files found under {str(folder_path)!r}")

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # ``map`` keeps submission order, unlike ``as_completed``.
        dataframes = list(executor.map(pd.read_csv, file_paths))

    return pd.concat(dataframes, ignore_index=True)


def combine_squad_data(json_list):
    """Split each squad-feed object into one entry per side.

    For every object in *json_list*, two dictionaries are emitted, first the
    ``"squadA"`` value and then the ``"squadB"`` value, each stored under the
    single key ``"squad"``. A missing side becomes an empty list.

    Args:
        json_list: Iterable of dict-like objects.

    Returns:
        A list twice as long as the input, of ``{"squad": ...}`` dicts.
    """
    return [
        {"squad": match.get(side, [])}
        for match in json_list
        for side in ("squadA", "squadB")
    ]


def combine_innings_data(innings_1_json_data_list, innings_2_json_data_list):
    """Unwrap and chain the first- and second-innings payloads.

    Args:
        innings_1_json_data_list: Iterable of objects holding an
            ``"Innings1"`` entry.
        innings_2_json_data_list: Iterable of objects holding an
            ``"Innings2"`` entry.

    Returns:
        A list with every ``"Innings1"`` value followed by every
        ``"Innings2"`` value.

    Raises:
        KeyError: If an object lacks the expected key.
    """
    first_innings = [item["Innings1"] for item in innings_1_json_data_list]
    second_innings = [item["Innings2"] for item in innings_2_json_data_list]
    return first_innings + second_innings


def convert_json_list_to_df(json_data_list, key_name):
    """Build one DataFrame from the *key_name* entry of each JSON object.

    Args:
        json_data_list: Iterable of dict-like objects.
        key_name: Key whose value is passed to ``pd.DataFrame`` for each
            object.

    Returns:
        All per-object frames concatenated with a fresh 0..n-1 index.
    """
    frames = [pd.DataFrame(entry[key_name]) for entry in json_data_list]
    return pd.concat(frames, ignore_index=True)

####

###


def create_winning_team_id_column(match_df):
    """Add a ``winning_team_id`` column derived from the result comment.

    For each row, the id of the first-batting team is used when that team's
    name occurs in ``Comments``; otherwise the second-batting team's id is
    used when its name occurs. Rows where neither name occurs, or where the
    comment or a name is missing / not a string / blank, get ``''``.

    NOTE(review): when a comment mentions both team names, the first-batting
    team takes precedence -- confirm this matches the feed's comment format.

    Args:
        match_df: DataFrame with the columns ``Comments``,
            ``FirstBattingTeamName``, ``FirstBattingTeamID``,
            ``SecondBattingTeamName`` and ``SecondBattingTeamID``.

    Returns:
        The same DataFrame object, modified in place.
    """
    def _mentioned(team_name, text):
        # A blank name is a substring of every string, so it must not match.
        return (isinstance(team_name, str) and bool(team_name)
                and team_name in text)

    columns = ('Comments', 'FirstBattingTeamName', 'FirstBattingTeamID',
               'SecondBattingTeamName', 'SecondBattingTeamID')
    winners = []

    for comments, first_name, first_id, second_name, second_id in zip(
            *(match_df[column] for column in columns)):
        winner = ''
        # Missing comments arrive as None/NaN; ``in`` would raise on them.
        if isinstance(comments, str):
            if _mentioned(first_name, comments):
                winner = first_id
            elif _mentioned(second_name, comments):
                winner = second_id
        winners.append(winner)

    # Assign positionally (not by index label) so duplicate index labels
    # cannot cause one row's winner to overwrite another's.
    match_df['winning_team_id'] = pd.Series(winners, dtype=object).to_numpy()
    return match_df

###


def map_between_teamid_title_count(matches_df):
    """Count how many finals each team has won.

    Only rows with ``is_final == 1`` are considered. Rows whose
    ``winning_team_id`` is missing (NaN/None) are not counted.

    Args:
        matches_df: DataFrame with ``is_final`` and ``winning_team_id``
            columns.

    Returns:
        A DataFrame with the columns ``team_id`` and ``num_finals_won``, one
        row per winning team in order of first appearance. The two columns
        are present even when there are no finals.
    """
    finals = matches_df[matches_df['is_final'] == 1]

    # One grouped pass instead of re-filtering the frame once per team.
    wins = finals.groupby('winning_team_id', sort=False).size()

    teams_df = (wins.rename('num_finals_won')
                .rename_axis('team_id')
                .reset_index())
    return teams_df[['team_id', 'num_finals_won']]


def map_btwn_teamid_playoff_count(matches_df):
    """Count, per team, the matches played with ``match_type`` 1 or 2.

    Args:
        matches_df: DataFrame with ``FirstBattingTeamID``,
            ``SecondBattingTeamID`` and ``match_type`` columns.

    Returns:
        A DataFrame with one row per distinct team id found in either id
        column (order of first appearance) and the columns ``team_id`` and
        ``playoff_matches_played``; teams with no such match get 0.
    """
    id_columns = ['FirstBattingTeamID', 'SecondBattingTeamID']

    teams = pd.concat([matches_df[column] for column in id_columns]).unique()
    teams_df = pd.DataFrame({'team_id': teams})

    # Count appearances on either side of the qualifying matches in one
    # pass rather than updating a boolean mask for every row.
    playoffs = matches_df[matches_df['match_type'].isin([1, 2])]
    appearances = pd.concat(
        [playoffs[column] for column in id_columns]).value_counts()

    teams_df['playoff_matches_played'] = (
        teams_df['team_id'].map(appearances).fillna(0).astype(int))
    return teams_df


def extract_team_season_info(match_summary):
    """Collect the seasons each team played, grouped by competition.

    ``CompetitionName`` is split at its last space: the text before it is the
    competition name and the first four characters of the last token are the
    season.

    Args:
        match_summary: DataFrame with ``CompetitionName``,
            ``FirstBattingTeamID`` and ``SecondBattingTeamID`` columns.

    Returns:
        A nested mapping ``{team_id: {competition_name: {season, ...}}}``.
    """
    team_season_dict = {}

    # Iterate the columns positionally; looking values up by index label
    # breaks when the frame carries duplicate index labels.
    rows = zip(match_summary['CompetitionName'],
               match_summary['FirstBattingTeamID'],
               match_summary['SecondBattingTeamID'])

    for full_name, first_team_id, second_team_id in rows:
        competition_name, _, last_token = full_name.rpartition(' ')
        season_number = last_token[:4]

        # Both sides of the match took part in this competition season.
        for team_id in (first_team_id, second_team_id):
            competitions = team_season_dict.setdefault(team_id, {})
            competitions.setdefault(competition_name, set()).add(season_number)

    return team_season_dict


def create_dataframe_from_dict(data_dict):
    """Flatten a ``{team_id: {competition_name: seasons}}`` mapping.

    Args:
        data_dict: Nested mapping of team id to competition name to an
            iterable of seasons.

    Returns:
        A DataFrame with one row per (team, competition) pair and the
        columns ``team_id``, ``competition_name`` and ``seasons`` (the
        seasons materialised as a list).
    """
    rows = [
        {'team_id': team, 'competition_name': competition,
         'seasons': list(played)}
        for team, per_competition in data_dict.items()
        for competition, played in per_competition.items()
    ]
    return pd.DataFrame(rows)


In [None]:
# get match result data from matchSchedule files of data feeds ~0.0s
# Every '*matchSchedule.js' file under the feed folder is parsed after
# dropping its first 14 and last 2 characters (presumably a JavaScript
# wrapper around the JSON payload -- TODO confirm against a sample file).
directory = './data/Data_Feeds/'
data_type = 'matchSchedule'
json_data_list = read_json_files(directory, data_type, 14, 2)
# The 'Result' entry of each parsed file becomes rows of match_result.
key_name = 'Result'
match_result = convert_json_list_to_df(json_data_list, key_name)
print(match_result.shape)
# print(match_result.info())


In [None]:
# get match player data from squad feeds ~3.5s
# Every '*squad.js' file under the squad folder is parsed after dropping its
# first 8 and last 2 characters (presumably a JavaScript wrapper around the
# JSON payload -- TODO confirm against a sample file).
directory = './data/Squad_Feeds/'
data_type = 'squad'
json_data_list = read_json_files(directory, data_type, 8, 2)
# Turn each file's 'squadA' / 'squadB' entries into separate {'squad': ...}
# objects so both sides can be loaded through the same key.
json_data_list = combine_squad_data(json_data_list)

key_name = 'squad'
match_player = convert_json_list_to_df(json_data_list, key_name)
print(match_player.shape)
# print(match_player.info())


In [None]:
# get match balls data from other tournament data ~2s
# All '*.csv' files found recursively under the folder are concatenated into
# a single frame.
directory = './data/other-tournaments-data/'
match_balls = read_csv_files(directory)

print(match_balls.shape)
# print(match_balls.info())


In [None]:
# get match summary data from matchSummary files of data feeds ~3.5s
# Every '*matchSummary.js' file under the feed folder is parsed after
# dropping its first 22 and last 2 characters (presumably a JavaScript
# wrapper around the JSON payload -- TODO confirm against a sample file).
directory = './data/Data_Feeds/'
data_type = 'matchSummary'
json_data_list = read_json_files(directory, data_type, 22, 2)
# The 'MatchSummary' entry of each parsed file becomes rows of match_summary.
key_name = 'MatchSummary'
match_summary = convert_json_list_to_df(json_data_list, key_name)
print(match_summary.shape)
print(match_summary.info())


In [None]:
# # get other match data from innings1 and innings2 files ~65s
# innings_1_json_data_list = read_json_files(
#     './data/Data_Feeds/', 'innings1', 10, 2)
# innings_2_json_data_list = read_json_files(
#     './data/Data_Feeds/', 'innings2', 10, 2)

# innings_json_data_list = combine_innings_data(
#     innings_1_json_data_list, innings_2_json_data_list)

# match_batting_card = convert_json_list_to_df(
#     innings_json_data_list, 'BattingCard')
# match_extras = convert_json_list_to_df(innings_json_data_list, 'Extras')
# match_fall_of_wickets = convert_json_list_to_df(
#     innings_json_data_list, 'FallOfWickets')
# match_wagon_wheel = convert_json_list_to_df(
#     innings_json_data_list, 'WagonWheel')
# match_partnership_scores = convert_json_list_to_df(
#     innings_json_data_list, 'PartnershipScores')
# match_partnership_break = convert_json_list_to_df(
#     innings_json_data_list, 'PartnershipBreak')
# match_bowling_card = convert_json_list_to_df(
#     innings_json_data_list, 'BowlingCard')
# match_manhattan_graph = convert_json_list_to_df(
#     innings_json_data_list, 'ManhattanGraph')
# match_manhattan_wickets = convert_json_list_to_df(
#     innings_json_data_list, 'ManhattanWickets')
# match_over_history = convert_json_list_to_df(
#     innings_json_data_list, 'OverHistory')
# match_wagon_wheel_summary = convert_json_list_to_df(
#     innings_json_data_list, 'WagonWheelSummary')
# match_batting_htoh = convert_json_list_to_df(
#     innings_json_data_list, 'battingheadtohead')
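# match_bowling_htoh = convert_json_list_to_df(
#     innings_json_data_list, 'battingheadtohead')
# NOTE(review): the bowling frame above reuses the 'battingheadtohead' key,
# so it would duplicate match_batting_htoh -- confirm the bowling key name in
# the innings feed before re-enabling this cell.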


In [None]:
# add columns to data
# Derive each match's winner from its result comment; this adds the
# 'winning_team_id' column to match_result.
match_result = create_winning_team_id_column(match_result)
print(match_result.winning_team_id)
# Flag rows by MatchDateOrder: value 1 sets is_title, values 1-4 set
# is_playoff (both stored as 0/1 integers).
# NOTE(review): presumably MatchDateOrder counts fixtures back from the final
# and is numeric in the feed -- confirm; string values would never match.
match_result['is_title'] = (match_result['MatchDateOrder'] == 1).astype(int)
match_result['is_playoff'] = (
    match_result['MatchDateOrder'].isin([1, 2, 3, 4])).astype(int)


In [None]:
# update column names of data
# Run every column label of the three frames through inflection.underscore
# so later cells can use snake_case names such as 'first_batting_team_id'.
match_result.columns = match_result.columns.map(inflection.underscore)
match_summary.columns = match_summary.columns.map(inflection.underscore)
match_player.columns = match_player.columns.map(inflection.underscore)

print(match_player.info())


In [None]:
# remove anomaly in data
# Rewrite the one long-form competition name to its abbreviated form; the
# next cell splits competition_name on spaces and expects "<NAME> <SEASON>".
match_summary['competition_name'] = match_summary['competition_name'].replace(
    {'BIG BASH LEAGUE 2018-19': 'BBL 2018-19'})


In [None]:
# get temporary match df
# Join the per-match team ids (match_summary) with the per-match flags and
# winner (match_result) on match_id.
match_df = pd.merge(
    match_summary[['match_id', 'competition_name',
                   'first_batting_team_id', 'second_batting_team_id']],
    match_result[['match_id', 'is_title', 'is_playoff', 'winning_team_id']],
    on='match_id')
# Split "<competition> <season>" at the LAST space, as
# extract_team_season_info does: the season is the first four characters of
# the last token and everything before it is the competition name. Taking
# tokens [1] and [0] instead returns the wrong season for multi-word
# competition names and raises IndexError when there is no space.
match_df['season'] = match_df.competition_name.apply(
    lambda x: x.rpartition(' ')[2][:4])
match_df.competition_name = match_df.competition_name.apply(
    lambda x: x.rpartition(' ')[0])
print(match_df)


In [None]:
# generate team df from temporary match df
# One output row per (team, competition): the seasons the team appeared in,
# the title matches it won, and the playoff matches it took part in.
teams_data = []

# Walk team ids in order of first appearance so the row order of teams_df is
# the same on every run (iterating a set gives no such guarantee).
team_ids = pd.concat([match_df['first_batting_team_id'],
                      match_df['second_batting_team_id']]).unique().tolist()

for team_id in team_ids:
    team_matches = match_df[(match_df['first_batting_team_id'] == team_id) | (
        match_df['second_batting_team_id'] == team_id)]
    competitions = team_matches['competition_name'].unique().tolist()

    for competition in competitions:
        competition_matches = team_matches[team_matches['competition_name'] == competition]
        season_year_list = competition_matches['season'].unique().tolist()
        # Compare the 0/1 flags explicitly so both masks are boolean.
        titles_count = int(((competition_matches['is_title'] == 1) & (
            competition_matches['winning_team_id'] == team_id)).sum())
        playoffs_count = int((competition_matches['is_playoff'] == 1).sum())

        team_data = {
            'team_id': team_id,
            'competition_name': competition,
            'seasons_played': season_year_list,
            'titles': titles_count,
            'playoffs': playoffs_count
        }

        teams_data.append(team_data)

# Naming the columns keeps the schema intact even when no rows were produced.
teams_df = pd.DataFrame(teams_data, columns=[
    'team_id', 'competition_name', 'seasons_played', 'titles', 'playoffs'])

print(teams_df)


In [None]:
# Attach team name / image / code to every (team, competition) row.
# match_player is built from squad lists, so it presumably repeats each
# team's attributes on many rows -- verify. Keep one row per team_id before
# merging; otherwise every teams_df row is repeated once per matching
# match_player row.
team_info = match_player[[
    'team_id', 'team_name', 'team_image', 'team_code']].drop_duplicates(
        subset='team_id')
# validate raises MergeError if team_info ever stops being unique per team.
teams_df1 = pd.merge(teams_df, team_info, on='team_id',
                     validate='many_to_one')
print(teams_df1)
