In [43]:
# packages
from datetime import datetime
import os
import pandas as pd
from pathlib import Path
import concurrent.futures
import json
import warnings
import inflection


def read_and_parse_file(file_path, start_index, end_index):
    with open(file_path, "r") as file:
        js_object = file.read()
    data = js_object.strip()[start_index:-end_index]
    json_data = json.loads(data)
    return json_data


def read_json_files(folder_path, data_type, start_index, end_index):
    json_data_list = []
    file_paths = list(Path(folder_path).rglob(f"*{data_type}.js"))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(read_and_parse_file, path, start_index, end_index)
            for path in file_paths
        ]

        for future in concurrent.futures.as_completed(futures):
            json_data = future.result()
            json_data_list.append(json_data)

    return json_data_list


def read_csv_files(folder_path):
    dataframes = []
    file_paths = list(Path(folder_path).rglob("*.csv"))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(pd.read_csv, path, dtype={"column20": str, "column21": str})
            for path in file_paths
        ]

        for future in concurrent.futures.as_completed(futures):
            df = future.result()
            dataframes.append(df)

    final_dataframe = pd.concat(dataframes, ignore_index=True)
    return final_dataframe


def combine_squad_data(json_list):
    comined_json_list = []

    for json_obj in json_list:
        squad_a = json_obj.get("squadA", [])
        squad_b = json_obj.get("squadB", [])

        divided_json_1 = {"squad": squad_a}
        divided_json_2 = {"squad": squad_b}

        comined_json_list.append(divided_json_1)
        comined_json_list.append(divided_json_2)

    return comined_json_list


def combine_innings_data(innings_1_json_data_list, innings_2_json_data_list):
    combined_data = []

    # Append the JSON objects from the first list
    for innings_1_json_data in innings_1_json_data_list:
        combined_data.append(innings_1_json_data["Innings1"])

    # Append the JSON objects from the second list
    for innings_2_json_data in innings_2_json_data_list:
        combined_data.append(innings_2_json_data["Innings2"])

    return combined_data


def convert_json_list_to_df(json_data_list, key_name):
    dataframes = []
    for json_data in json_data_list:
        df = pd.DataFrame(json_data[key_name])
        dataframes.append(df)
    final_dataframe = pd.concat(dataframes, ignore_index=True)

    return final_dataframe

In [44]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

data_path = os.getenv("data_path")

data_feeds_path = f"{data_path}/Data_Feeds"
squad_feeds_path = f"{data_path}/Squad_Feeds"
other_tournament_data_path = f"{data_path}/other-tournament-data"

# file name of collected data
match_schedule_file_name = "matchSchedule"
match_squad_file_name = "squad"
match_summary_file_name = "matchsummary"
match_inning1_file_name = "Innings1"
match_inning2_file_name = "Innings2"

# key value of relevant data in json objects of collected data
match_result_key = "Result"
match_player_key = "squad"
match_summary_key = "MatchSummary"
match_batting_card_key = "BattingCard"
match_extras_key = "Extras"
match_fall_of_wickets_key = "FallOfWickets"
match_wagon_wheel_key = "WagonWheel"
match_partnership_scores_key = "PartnershipScores"
match_partnership_break_key = "PartnershipBreak"
match_bowling_card_key = "BowlingCard"
match_manhattan_graph_key = "ManhattanGraph"
match_manhattan_wickets_key = "ManhattanWickets"
match_over_history_key = "OverHistory"
match_wagon_wheel_summary_key = "WagonWheelSummary"
match_batting_head_to_head_key = "battingheadtohead"
match_bowling_head_to_head_key = "battingheadtohead"

In [45]:
def load_match_result_data():
    match_schedule_json_data_list = read_json_files(
        data_feeds_path, match_schedule_file_name, 14, 2
    )
    return convert_json_list_to_df(match_schedule_json_data_list, match_result_key)


def load_match_player_data():
    match_squad_json_data_list = read_json_files(
        squad_feeds_path, match_squad_file_name, 8, 2
    )
    match_player_json_data_list = combine_squad_data(match_squad_json_data_list)
    return convert_json_list_to_df(match_player_json_data_list, match_player_key)


def load_match_summary_data():
    match_summary_json_data_list = read_json_files(
        data_feeds_path, match_summary_file_name, 22, 2
    )
    return convert_json_list_to_df(match_summary_json_data_list, match_summary_key)


def load_match_innings_data():
    innings_1_json_data_list = read_json_files(
        data_feeds_path, match_inning1_file_name, 10, 2
    )
    innings_2_json_data_list = read_json_files(
        data_feeds_path, match_inning2_file_name, 10, 2
    )
    innings_json_data_list = combine_innings_data(
        innings_1_json_data_list, innings_2_json_data_list
    )
    match_batting_card = convert_json_list_to_df(
        innings_json_data_list, match_batting_card_key
    )
    match_extras = convert_json_list_to_df(innings_json_data_list, match_extras_key)
    match_fall_of_wickets = convert_json_list_to_df(
        innings_json_data_list, match_fall_of_wickets_key
    )
    match_wagon_wheel = convert_json_list_to_df(
        innings_json_data_list, match_wagon_wheel_key
    )
    match_partnership_scores = convert_json_list_to_df(
        innings_json_data_list, match_partnership_scores_key
    )
    match_partnership_break = convert_json_list_to_df(
        innings_json_data_list, match_partnership_break_key
    )
    match_bowling_card = convert_json_list_to_df(
        innings_json_data_list, match_bowling_card_key
    )
    match_manhattan_graph = convert_json_list_to_df(
        innings_json_data_list, match_manhattan_graph_key
    )
    match_manhattan_wickets = convert_json_list_to_df(
        innings_json_data_list, match_manhattan_wickets_key
    )
    match_over_history = convert_json_list_to_df(
        innings_json_data_list, match_over_history_key
    )
    match_wagon_wheel_summary = convert_json_list_to_df(
        innings_json_data_list, match_wagon_wheel_summary_key
    )
    match_batting_head_to_head = convert_json_list_to_df(
        innings_json_data_list, match_batting_head_to_head_key
    )
    match_bowling_head_to_head = convert_json_list_to_df(
        innings_json_data_list, match_bowling_head_to_head_key
    )

    dataframes = {
        "match_batting_card": match_batting_card,
        "match_extras": match_extras,
        "match_fall_of_wickets": match_fall_of_wickets,
        "match_wagon_wheel": match_wagon_wheel,
        "match_partnership_scores": match_partnership_scores,
        "match_partnership_break": match_partnership_break,
        "match_bowling_card": match_bowling_card,
        "match_manhattan_graph": match_manhattan_graph,
        "match_manhattan_wickets": match_manhattan_wickets,
        "match_over_history": match_over_history,
        "match_wagon_wheel_summary": match_wagon_wheel_summary,
        "match_batting_head_to_head": match_batting_head_to_head,
        "match_bowling_head_to_head": match_bowling_head_to_head,
    }

    return dataframes


def load_match_ball_data():
    match_balls = read_csv_files(other_tournament_data_path)
    return match_balls

In [46]:
match_result_data = load_match_result_data()
print(f"match results: {match_result_data.shape}")
print("Match Results Schema:")
print(match_result_data.info())

match_summary_data = load_match_summary_data()
print(f"match summaries: {match_summary_data.shape}")
print("Match Summaries Schema:")
print(match_summary_data.info())

match_player_data = load_match_player_data()
print(f"match players: {match_player_data.shape}")
print("Match Players Schema:")
print(match_player_data.info())

match_ball_data = load_match_ball_data()
print(f"match balls: {match_ball_data.shape}")
print("Match Balls Schema:")
print(match_ball_data.info())

innings_dataframes = load_match_innings_data()

for name, dataframe in innings_dataframes.items():
    print(f"\n{name} DataFrame:")
    print("Shape:", dataframe.shape)
    print("Columns:", dataframe.info())

match results: (118, 23)
Match Results Schema:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   CompetitionID          118 non-null    object
 1   MatchID                118 non-null    object
 2   MatchTypeID            118 non-null    object
 3   MatchType              118 non-null    object
 4   MatchStatus            118 non-null    object
 5   MatchName              118 non-null    object
 6   MatchDate              118 non-null    object
 7   MatchTime              118 non-null    object
 8   FirstBattingTeamID     118 non-null    object
 9   FirstBattingTeamName   118 non-null    object
 10  HomeTeamLogo           118 non-null    object
 11  SecondBattingTeamID    118 non-null    object
 12  SecondBattingTeamName  118 non-null    object
 13  AwayTeamLogo           118 non-null    object
 14  GroundID               118 

  result = self.fn(*self.args, **self.kwargs)


match balls: (70582, 22)
Match Balls Schema:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70582 entries, 0 to 70581
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   match_id                70582 non-null  int64  
 1   season                  70582 non-null  object 
 2   start_date              70582 non-null  object 
 3   venue                   70582 non-null  object 
 4   innings                 70582 non-null  int64  
 5   ball                    70582 non-null  float64
 6   batting_team            70582 non-null  object 
 7   bowling_team            70582 non-null  object 
 8   striker                 70582 non-null  object 
 9   non_striker             70582 non-null  object 
 10  bowler                  70582 non-null  object 
 11  runs_off_bat            70582 non-null  int64  
 12  extras                  70582 non-null  int64  
 13  wides                   2108 non-null   float6

Table Specific Code


In [47]:
# utils

import inflection
import pandas as pd


def clean_match_summary_data(match_summary):
    match_summary.loc[
        match_summary.CompetitionName == "BIG BASH LEAGUE 2018-19", "CompetitionName"
    ] = "BBL 2018-19"


def create_winning_team_id_column(match_df):
    match_df["winning_team_id"] = ""

    for index, row in match_df.iterrows():
        comments = row["Comments"]
        first_batting_team_name = row["FirstBattingTeamName"]
        first_batting_team_id = row["FirstBattingTeamID"]
        second_batting_team_name = row["SecondBattingTeamName"]
        second_batting_team_id = row["SecondBattingTeamID"]

        if first_batting_team_name in comments:
            match_df.at[index, "winning_team_id"] = first_batting_team_id
        elif second_batting_team_name in comments:
            match_df.at[index, "winning_team_id"] = second_batting_team_id

    return match_df


def preapare_data(match_result, match_summary, match_player):
    match_result = create_winning_team_id_column(match_result)
    match_result["is_title"] = (match_result["MatchDateOrder"] == 1).astype(int)
    match_result["is_playoff"] = (
        match_result["MatchDateOrder"].isin([1, 2, 3, 4])
    ).astype(int)

    match_result.columns = [inflection.underscore(col) for col in match_result.columns]
    match_summary.columns = [
        inflection.underscore(col) for col in match_summary.columns
    ]
    match_player.columns = [inflection.underscore(col) for col in match_player.columns]

    return match_result, match_summary, match_player


def generate_match_df(match_summary, match_result):
    match_df = pd.merge(
        match_summary[
            [
                "match_id",
                "competition_name",
                "first_batting_team_id",
                "second_batting_team_id",
            ]
        ],
        match_result[["match_id", "is_title", "is_playoff", "winning_team_id"]],
        on="match_id",
    )

    match_df["season"] = match_df.competition_name.apply(lambda x: x.split(" ")[1][:4])
    match_df.competition_name = match_df.competition_name.apply(
        lambda x: x.split(" ")[0]
    )

    return match_df


def generate_teams_df(match_df, match_player):
    teams_data = []

    for team_id in set(match_df["first_batting_team_id"]).union(
        set(match_df["second_batting_team_id"])
    ):
        team_matches = match_df[
            (match_df["first_batting_team_id"] == team_id)
            | (match_df["second_batting_team_id"] == team_id)
        ]
        competitions = team_matches["competition_name"].unique().tolist()

        for competition in competitions:
            competition_matches = team_matches[
                team_matches["competition_name"] == competition
            ]
            season_year_list = competition_matches["season"].unique().tolist()
            titles_count = len(
                competition_matches[
                    competition_matches["is_title"]
                    & (competition_matches["winning_team_id"] == team_id)
                ]
            )
            playoffs_count = len(
                competition_matches[competition_matches["is_playoff"] == 1]
            )

            team_data = {
                "team_id": team_id,
                "competition_name": competition,
                "seasons_played": season_year_list,
                "titles": titles_count,
                "playoffs": playoffs_count,
            }

            teams_data.append(team_data)

    teams_df = pd.DataFrame(teams_data)
    return teams_df

In [48]:
clean_match_summary_data(match_summary_data)

In [49]:
match_result, match_summary, match_player = preapare_data(
    match_result_data, match_summary_data, match_player_data
)

In [50]:
matches_df = generate_match_df(match_summary, match_result)
print(matches_df)

             match_id competition_name first_batting_team_id  \
0    eb0eaa5e6d3e4d89              BBL      ed501d93fa364841   
1    eb0eaa5e6d3e4d89              BBL      ed501d93fa364841   
2    0a9042558fef493e              BBL      276819a42e064cf4   
3    0a9042558fef493e              BBL      276819a42e064cf4   
4    0a9042558fef493e              BBL      276819a42e064cf4   
..                ...              ...                   ...   
115  cce62275aaea4320              BBL      ed501d93fa364841   
116  16f269b357464ba4              BBL      ed501d93fa364841   
117  16f269b357464ba4              BBL      ed501d93fa364841   
118  5838857566ff423a              BBL      a4d43ed3eb074b4c   
119  5838857566ff423a              BBL      a4d43ed3eb074b4c   

    second_batting_team_id  is_title  is_playoff   winning_team_id season  
0         276819a42e064cf4         0           0  276819a42e064cf4   2018  
1         276819a42e064cf4         0           0  276819a42e064cf4   2018  
2  

In [51]:
teams_df = generate_teams_df(matches_df, match_player)
print(teams_df)

            team_id competition_name seasons_played  titles  playoffs
0  2a8fb27ae8f542db              BBL         [2018]       2         4
1  a4d43ed3eb074b4c              BBL         [2018]       0         0
2  276819a42e064cf4              BBL         [2018]       0         0
3  e0d0177847684261              BBL         [2018]       0         0
4  47833ee7613144ec              BBL         [2018]       0         4
5  ed501d93fa364841              BBL         [2018]       0         0
6  2bb857b91be84183              BBL         [2018]       0         6
7  cd2b5040a5614e96              BBL         [2018]       0         2


In [52]:
match_player = match_player_data.loc[
    :, ["team_id", "team_name", "team_code", "team_image"]
]
# Rename columns as specified
match_player = match_player.rename(
    columns={
        "team_code": "team_short_name",
        "team_image": "team_image_url",
    }
)
print(match_player)

               team_id        team_name team_short_name team_image_url
0     47833ee7613144ec    Sydney Sixers             SIX               
1     47833ee7613144ec    Sydney Sixers             SIX               
2     47833ee7613144ec    Sydney Sixers             SIX               
3     47833ee7613144ec    Sydney Sixers             SIX               
4     47833ee7613144ec    Sydney Sixers             SIX               
...                ...              ...             ...            ...
1330  2bb857b91be84183  Melbourne Stars           STARS               
1331  2bb857b91be84183  Melbourne Stars           STARS               
1332  2bb857b91be84183  Melbourne Stars           STARS               
1333  2bb857b91be84183  Melbourne Stars           STARS               
1334  2bb857b91be84183  Melbourne Stars           STARS               

[1335 rows x 4 columns]


In [53]:
# Merge the DataFrames based on 'team_id'
teams_df = teams_df.merge(
    match_player[["team_id", "team_name", "team_short_name", "team_image_url"]],
    on="team_id",
    how="inner",
)

In [54]:
teams_df.drop_duplicates(subset="team_id", inplace=True)

# Print the updated DataFrame
print(teams_df)

               team_id competition_name seasons_played  titles  playoffs  \
0     2a8fb27ae8f542db              BBL         [2018]       2         4   
178   a4d43ed3eb074b4c              BBL         [2018]       0         0   
346   276819a42e064cf4              BBL         [2018]       0         0   
502   e0d0177847684261              BBL         [2018]       0         0   
667   47833ee7613144ec              BBL         [2018]       0         4   
835   ed501d93fa364841              BBL         [2018]       0         0   
991   2bb857b91be84183              BBL         [2018]       0         6   
1169  cd2b5040a5614e96              BBL         [2018]       0         2   

                team_name team_short_name team_image_url  
0     Melbourne Renegades             REN                 
178         Brisbane Heat            HEAT                 
346        Sydney Thunder            THUN                 
502     Adelaide Strikers             STR                 
667         Sydney S

In [55]:
teams_df = teams_df.rename(columns={"team_id": "src_team_id"})

In [56]:
print(teams_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, 0 to 1169
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   src_team_id       8 non-null      object
 1   competition_name  8 non-null      object
 2   seasons_played    8 non-null      object
 3   titles            8 non-null      int64 
 4   playoffs          8 non-null      int64 
 5   team_name         8 non-null      object
 6   team_short_name   8 non-null      object
 7   team_image_url    8 non-null      object
dtypes: int64(2), object(6)
memory usage: 576.0+ bytes
None


In [57]:
import datetime

# Get the current timestamp
load_timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Define the existing teams count and the current session's team count
existing_teams_count = 50
current_session_teams_count = len(teams_df)

# Generate team IDs based on the existing and current session's team count
team_ids = range(
    existing_teams_count, existing_teams_count + current_session_teams_count
)

# Update the teams_df DataFrame with the new columns
teams_df["team_id"] = team_ids
teams_df["load_timestamp"] = load_timestamp

In [58]:
teams_df.set_index("team_id", inplace=True)

# Print the updated teams_df DataFrame
print(teams_df)

              src_team_id competition_name seasons_played  titles  playoffs  \
team_id                                                                       
50       2a8fb27ae8f542db              BBL         [2018]       2         4   
51       a4d43ed3eb074b4c              BBL         [2018]       0         0   
52       276819a42e064cf4              BBL         [2018]       0         0   
53       e0d0177847684261              BBL         [2018]       0         0   
54       47833ee7613144ec              BBL         [2018]       0         4   
55       ed501d93fa364841              BBL         [2018]       0         0   
56       2bb857b91be84183              BBL         [2018]       0         6   
57       cd2b5040a5614e96              BBL         [2018]       0         2   

                   team_name team_short_name team_image_url  \
team_id                                                       
50       Melbourne Renegades             REN                  
51             Brisb

In [59]:
import duckdb
from deltalake import DeltaTable
from deltalake.writer import write_deltalake
import pickle

import os
from dotenv import load_dotenv

local_delta_lake_path = os.getenv("local_delta_lake_path")


def update_existing_data(serialized_df, table_name):
    df = pickle.loads(serialized_df)
    table_path = f"{local_delta_lake_path}/{table_name}"
    write_deltalake(table_path, df)

In [60]:
serialized_df = pickle.dumps(teams_df)

In [61]:
print(teams_df)
# update_existing_data(serialized_df, "teams")

              src_team_id competition_name seasons_played  titles  playoffs  \
team_id                                                                       
50       2a8fb27ae8f542db              BBL         [2018]       2         4   
51       a4d43ed3eb074b4c              BBL         [2018]       0         0   
52       276819a42e064cf4              BBL         [2018]       0         0   
53       e0d0177847684261              BBL         [2018]       0         0   
54       47833ee7613144ec              BBL         [2018]       0         4   
55       ed501d93fa364841              BBL         [2018]       0         0   
56       2bb857b91be84183              BBL         [2018]       0         6   
57       cd2b5040a5614e96              BBL         [2018]       0         2   

                   team_name team_short_name team_image_url  \
team_id                                                       
50       Melbourne Renegades             REN                  
51             Brisb