In [None]:
# packages
from datetime import datetime
import os
import pandas as pd
from pathlib import Path
import concurrent.futures
import json
import warnings
import inflection


def read_and_parse_file(file_path, start_index, end_index):
    with open(file_path, "r") as file:
        js_object = file.read()
    data = js_object.strip()[start_index:-end_index]
    json_data = json.loads(data)
    return json_data


def read_json_files(folder_path, data_type, start_index, end_index):
    json_data_list = []
    file_paths = list(Path(folder_path).rglob(f"*{data_type}.js"))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(read_and_parse_file, path, start_index, end_index)
            for path in file_paths
        ]

        for future in concurrent.futures.as_completed(futures):
            json_data = future.result()
            json_data_list.append(json_data)

    return json_data_list


def read_csv_files(folder_path):
    dataframes = []
    file_paths = list(Path(folder_path).rglob("*.csv"))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(pd.read_csv, path, dtype={"column20": str, "column21": str})
            for path in file_paths
        ]

        for future in concurrent.futures.as_completed(futures):
            df = future.result()
            dataframes.append(df)

    final_dataframe = pd.concat(dataframes, ignore_index=True)
    return final_dataframe


def combine_squad_data(json_list):
    comined_json_list = []

    for json_obj in json_list:
        squad_a = json_obj.get("squadA", [])
        squad_b = json_obj.get("squadB", [])

        divided_json_1 = {"squad": squad_a}
        divided_json_2 = {"squad": squad_b}

        comined_json_list.append(divided_json_1)
        comined_json_list.append(divided_json_2)

    return comined_json_list


def combine_innings_data(innings_1_json_data_list, innings_2_json_data_list):
    combined_data = []

    # Append the JSON objects from the first list
    for innings_1_json_data in innings_1_json_data_list:
        combined_data.append(innings_1_json_data["Innings1"])

    # Append the JSON objects from the second list
    for innings_2_json_data in innings_2_json_data_list:
        combined_data.append(innings_2_json_data["Innings2"])

    return combined_data


def convert_json_list_to_df(json_data_list, key_name):
    dataframes = []
    for json_data in json_data_list:
        df = pd.DataFrame(json_data[key_name])
        dataframes.append(df)
    final_dataframe = pd.concat(dataframes, ignore_index=True)

    return final_dataframe

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

data_path = os.getenv("data_path")

data_feeds_path = f"{data_path}/Data_Feeds"
squad_feeds_path = f"{data_path}/Squad_Feeds"
other_tournament_data_path = f"{data_path}/other-tournament-data"

# file name of collected data
match_schedule_file_name = "matchSchedule"
match_squad_file_name = "squad"
match_summary_file_name = "matchsummary"
match_inning1_file_name = "Innings1"
match_inning2_file_name = "Innings2"

# key value of relevant data in json objects of collected data
match_result_key = "Result"
match_player_key = "squad"
match_summary_key = "MatchSummary"
match_batting_card_key = "BattingCard"
match_extras_key = "Extras"
match_fall_of_wickets_key = "FallOfWickets"
match_wagon_wheel_key = "WagonWheel"
match_partnership_scores_key = "PartnershipScores"
match_partnership_break_key = "PartnershipBreak"
match_bowling_card_key = "BowlingCard"
match_manhattan_graph_key = "ManhattanGraph"
match_manhattan_wickets_key = "ManhattanWickets"
match_over_history_key = "OverHistory"
match_wagon_wheel_summary_key = "WagonWheelSummary"
match_batting_head_to_head_key = "battingheadtohead"
match_bowling_head_to_head_key = "battingheadtohead"

In [None]:
def load_match_result_data():
    match_schedule_json_data_list = read_json_files(
        data_feeds_path, match_schedule_file_name, 14, 2
    )
    return convert_json_list_to_df(match_schedule_json_data_list, match_result_key)


def load_match_player_data():
    match_squad_json_data_list = read_json_files(
        squad_feeds_path, match_squad_file_name, 8, 2
    )
    match_player_json_data_list = combine_squad_data(match_squad_json_data_list)
    return convert_json_list_to_df(match_player_json_data_list, match_player_key)


def load_match_summary_data():
    match_summary_json_data_list = read_json_files(
        data_feeds_path, match_summary_file_name, 22, 2
    )
    return convert_json_list_to_df(match_summary_json_data_list, match_summary_key)


def load_match_innings_data():
    innings_1_json_data_list = read_json_files(
        data_feeds_path, match_inning1_file_name, 10, 2
    )
    innings_2_json_data_list = read_json_files(
        data_feeds_path, match_inning2_file_name, 10, 2
    )
    innings_json_data_list = combine_innings_data(
        innings_1_json_data_list, innings_2_json_data_list
    )
    match_batting_card = convert_json_list_to_df(
        innings_json_data_list, match_batting_card_key
    )
    match_extras = convert_json_list_to_df(innings_json_data_list, match_extras_key)
    match_fall_of_wickets = convert_json_list_to_df(
        innings_json_data_list, match_fall_of_wickets_key
    )
    match_wagon_wheel = convert_json_list_to_df(
        innings_json_data_list, match_wagon_wheel_key
    )
    match_partnership_scores = convert_json_list_to_df(
        innings_json_data_list, match_partnership_scores_key
    )
    match_partnership_break = convert_json_list_to_df(
        innings_json_data_list, match_partnership_break_key
    )
    match_bowling_card = convert_json_list_to_df(
        innings_json_data_list, match_bowling_card_key
    )
    match_manhattan_graph = convert_json_list_to_df(
        innings_json_data_list, match_manhattan_graph_key
    )
    match_manhattan_wickets = convert_json_list_to_df(
        innings_json_data_list, match_manhattan_wickets_key
    )
    match_over_history = convert_json_list_to_df(
        innings_json_data_list, match_over_history_key
    )
    match_wagon_wheel_summary = convert_json_list_to_df(
        innings_json_data_list, match_wagon_wheel_summary_key
    )
    match_batting_head_to_head = convert_json_list_to_df(
        innings_json_data_list, match_batting_head_to_head_key
    )
    match_bowling_head_to_head = convert_json_list_to_df(
        innings_json_data_list, match_bowling_head_to_head_key
    )

    dataframes = {
        "match_batting_card": match_batting_card,
        "match_extras": match_extras,
        "match_fall_of_wickets": match_fall_of_wickets,
        "match_wagon_wheel": match_wagon_wheel,
        "match_partnership_scores": match_partnership_scores,
        "match_partnership_break": match_partnership_break,
        "match_bowling_card": match_bowling_card,
        "match_manhattan_graph": match_manhattan_graph,
        "match_manhattan_wickets": match_manhattan_wickets,
        "match_over_history": match_over_history,
        "match_wagon_wheel_summary": match_wagon_wheel_summary,
        "match_batting_head_to_head": match_batting_head_to_head,
        "match_bowling_head_to_head": match_bowling_head_to_head,
    }

    return dataframes


def load_match_ball_data():
    match_balls = read_csv_files(other_tournament_data_path)
    return match_balls

In [35]:
match_result_data = load_match_result_data()
print(f"match results: {match_result_data.shape}")
print("Match Results Schema:")
print(match_result_data.info())

match_summary_data = load_match_summary_data()
print(f"match summaries: {match_summary_data.shape}")
print("Match Summaries Schema:")
print(match_summary_data.info())

match_player_data = load_match_player_data()
print(f"match players: {match_player_data.shape}")
print("Match Players Schema:")
print(match_player_data.info())

match_ball_data = load_match_ball_data()
print(f"match balls: {match_ball_data.shape}")
print("Match Balls Schema:")
print(match_ball_data.info())

innings_dataframes = load_match_innings_data()

for name, dataframe in innings_dataframes.items():
    print(f"\n{name} DataFrame:")
    print("Shape:", dataframe.shape)
    print("Columns:", dataframe.info())


match_batting_card DataFrame:
Shape: (1320, 31)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1320 entries, 0 to 1319
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   MatchID             1320 non-null   object 
 1   InningsNo           1320 non-null   object 
 2   TeamID              1320 non-null   object 
 3   PlayerID            1320 non-null   object 
 4   BatSpec             1320 non-null   object 
 5   BowlSpec            1320 non-null   object 
 6   PlayerName          1320 non-null   object 
 7   PlayerImage         1320 non-null   object 
 8   PlayingOrder        1320 non-null   int64  
 9   BowlerName          1320 non-null   object 
 10  OutDesc             1320 non-null   object 
 11  Runs                1320 non-null   object 
 12  Balls               1320 non-null   object 
 13  DotBalls            1320 non-null   object 
 14  DotBallPercentage   1320 non-null   object 
 15  DotBal

Table Specific Code


In [36]:
# utils

import inflection
import pandas as pd


def clean_match_summary_data(match_summary):
    match_summary.loc[
        match_summary.CompetitionName == "BIG BASH LEAGUE 2018-19", "CompetitionName"
    ] = "BBL 2018-19"


def create_winning_team_id_column(match_df):
    match_df["winning_team_id"] = ""

    for index, row in match_df.iterrows():
        comments = row["Comments"]
        first_batting_team_name = row["FirstBattingTeamName"]
        first_batting_team_id = row["FirstBattingTeamID"]
        second_batting_team_name = row["SecondBattingTeamName"]
        second_batting_team_id = row["SecondBattingTeamID"]

        if first_batting_team_name in comments:
            match_df.at[index, "winning_team_id"] = first_batting_team_id
        elif second_batting_team_name in comments:
            match_df.at[index, "winning_team_id"] = second_batting_team_id

    return match_df


def preapare_data(match_result, match_summary, match_player):
    match_result = create_winning_team_id_column(match_result)
    match_result["is_title"] = (match_result["MatchDateOrder"] == 1).astype(int)
    match_result["is_playoff"] = (
        match_result["MatchDateOrder"].isin([1, 2, 3, 4])
    ).astype(int)

    match_result.columns = [inflection.underscore(col) for col in match_result.columns]
    match_summary.columns = [
        inflection.underscore(col) for col in match_summary.columns
    ]
    match_player.columns = [inflection.underscore(col) for col in match_player.columns]

    return match_result, match_summary, match_player


def generate_match_df(match_summary, match_result):
    match_df = pd.merge(
        match_summary[
            [
                "match_id",
                "competition_name",
                "first_batting_team_id",
                "second_batting_team_id",
            ]
        ],
        match_result[["match_id", "is_title", "is_playoff", "winning_team_id"]],
        on="match_id",
    )

    match_df["season"] = match_df.competition_name.apply(lambda x: x.split(" ")[1][:4])
    match_df.competition_name = match_df.competition_name.apply(
        lambda x: x.split(" ")[0]
    )

    return match_df


def generate_teams_df(match_df, match_player):
    teams_data = []

    for team_id in set(match_df["first_batting_team_id"]).union(
        set(match_df["second_batting_team_id"])
    ):
        team_matches = match_df[
            (match_df["first_batting_team_id"] == team_id)
            | (match_df["second_batting_team_id"] == team_id)
        ]
        competitions = team_matches["competition_name"].unique().tolist()

        for competition in competitions:
            competition_matches = team_matches[
                team_matches["competition_name"] == competition
            ]
            season_year_list = competition_matches["season"].unique().tolist()
            titles_count = len(
                competition_matches[
                    competition_matches["is_title"]
                    & (competition_matches["winning_team_id"] == team_id)
                ]
            )
            playoffs_count = len(
                competition_matches[competition_matches["is_playoff"] == 1]
            )

            team_data = {
                "team_id": team_id,
                "competition_name": competition,
                "seasons_played": season_year_list,
                "titles": titles_count,
                "playoffs": playoffs_count,
            }

            teams_data.append(team_data)

    teams_df = pd.DataFrame(teams_data)
    return teams_df

In [37]:
clean_match_summary_data(match_summary_data)

In [38]:
match_result, match_summary, match_player = preapare_data(
    match_result_data, match_summary_data, match_player_data
)

In [39]:
matches_df = generate_match_df(match_summary, match_result)
print(matches_df)

             match_id competition_name first_batting_team_id  \
0    3cc3a887a4a94483              BBL      e0d0177847684261   
1    3cc3a887a4a94483              BBL      e0d0177847684261   
2    5a664ab42159420d              BBL      2bb857b91be84183   
3    5a664ab42159420d              BBL      2bb857b91be84183   
4    295adc31d09c479c              BBL      cd2b5040a5614e96   
..                ...              ...                   ...   
115  b1469da9a8974bd1              BBL      cd2b5040a5614e96   
116  1b4aff6e09684b7a              BBL      cd2b5040a5614e96   
117  1b4aff6e09684b7a              BBL      cd2b5040a5614e96   
118  59403fd66de4416c              BBL      276819a42e064cf4   
119  59403fd66de4416c              BBL      276819a42e064cf4   

    second_batting_team_id  is_title  is_playoff   winning_team_id season  
0         276819a42e064cf4         0           0  e0d0177847684261   2018  
1         276819a42e064cf4         0           0  e0d0177847684261   2018  
2  

In [40]:
teams_df = generate_teams_df(matches_df, match_player)
print(teams_df)

            team_id competition_name seasons_played  titles  playoffs
0  a4d43ed3eb074b4c              BBL         [2018]       0         0
1  e0d0177847684261              BBL         [2018]       0         0
2  ed501d93fa364841              BBL         [2018]       0         0
3  47833ee7613144ec              BBL         [2018]       0         4
4  2bb857b91be84183              BBL         [2018]       0         6
5  cd2b5040a5614e96              BBL         [2018]       0         2
6  276819a42e064cf4              BBL         [2018]       0         0
7  2a8fb27ae8f542db              BBL         [2018]       2         4


In [41]:
match_player = match_player_data.loc[
    :, ["team_id", "team_name", "team_code", "team_image"]
]
# Rename columns as specified
match_player = match_player.rename(
    columns={
        "team_code": "team_short_name",
        "team_image": "team_image_url",
    }
)
print(match_player)

               team_id        team_name team_short_name team_image_url
0     2bb857b91be84183  Melbourne Stars           STARS               
1     2bb857b91be84183  Melbourne Stars           STARS               
2     2bb857b91be84183  Melbourne Stars           STARS               
3     2bb857b91be84183  Melbourne Stars           STARS               
4     2bb857b91be84183  Melbourne Stars           STARS               
...                ...              ...             ...            ...
1330  ed501d93fa364841  Perth Scorchers            SCOR               
1331  ed501d93fa364841  Perth Scorchers            SCOR               
1332  ed501d93fa364841  Perth Scorchers            SCOR               
1333  ed501d93fa364841  Perth Scorchers            SCOR               
1334  ed501d93fa364841  Perth Scorchers            SCOR               

[1335 rows x 4 columns]


In [42]:
# Merge the DataFrames based on 'team_id'
teams_df = teams_df.merge(
    match_player[["team_id", "team_name", "team_short_name", "team_image_url"]],
    on="team_id",
    how="inner",
)

In [47]:
teams_df.drop_duplicates(subset="team_id", inplace=True)

# Print the updated DataFrame
print(teams_df)

               team_id competition_name seasons_played  titles  playoffs  \
0     a4d43ed3eb074b4c              BBL         [2018]       0         0   
168   e0d0177847684261              BBL         [2018]       0         0   
333   ed501d93fa364841              BBL         [2018]       0         0   
489   47833ee7613144ec              BBL         [2018]       0         4   
657   2bb857b91be84183              BBL         [2018]       0         6   
835   cd2b5040a5614e96              BBL         [2018]       0         2   
1001  276819a42e064cf4              BBL         [2018]       0         0   
1157  2a8fb27ae8f542db              BBL         [2018]       2         4   

                team_name team_short_name team_image_url  
0           Brisbane Heat            HEAT                 
168     Adelaide Strikers             STR                 
333       Perth Scorchers            SCOR                 
489         Sydney Sixers             SIX                 
657       Melbourne 

In [48]:
teams_df = teams_df.rename(columns={"team_id": "src_team_id"})

In [49]:
print(teams_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, 0 to 1157
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   src_team_id       8 non-null      object
 1   competition_name  8 non-null      object
 2   seasons_played    8 non-null      object
 3   titles            8 non-null      int64 
 4   playoffs          8 non-null      int64 
 5   team_name         8 non-null      object
 6   team_short_name   8 non-null      object
 7   team_image_url    8 non-null      object
dtypes: int64(2), object(6)
memory usage: 576.0+ bytes
None


In [50]:
import datetime

# Get the current timestamp
load_timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Define the existing teams count and the current session's team count
existing_teams_count = 50
current_session_teams_count = len(teams_df)

# Generate team IDs based on the existing and current session's team count
team_ids = range(
    existing_teams_count, existing_teams_count + current_session_teams_count
)

# Update the teams_df DataFrame with the new columns
teams_df["team_id"] = team_ids
teams_df["load_timestamp"] = load_timestamp

# Print the updated teams_df DataFrame
print(teams_df)

           src_team_id competition_name seasons_played  titles  playoffs  \
0     a4d43ed3eb074b4c              BBL         [2018]       0         0   
168   e0d0177847684261              BBL         [2018]       0         0   
333   ed501d93fa364841              BBL         [2018]       0         0   
489   47833ee7613144ec              BBL         [2018]       0         4   
657   2bb857b91be84183              BBL         [2018]       0         6   
835   cd2b5040a5614e96              BBL         [2018]       0         2   
1001  276819a42e064cf4              BBL         [2018]       0         0   
1157  2a8fb27ae8f542db              BBL         [2018]       2         4   

                team_name team_short_name team_image_url  team_id  \
0           Brisbane Heat            HEAT                      50   
168     Adelaide Strikers             STR                      51   
333       Perth Scorchers            SCOR                      52   
489         Sydney Sixers             S