In [None]:
# packages
from datetime import datetime
import os
import pandas as pd
from pathlib import Path
import concurrent.futures
import json
import warnings
import inflection


def read_and_parse_file(file_path, start_index, end_index):
    """Read a JS-wrapped JSON feed file and return the decoded JSON payload.

    The file content is stripped of surrounding whitespace, then
    ``start_index`` leading characters and ``end_index`` trailing characters
    are removed before the remainder is parsed as JSON.

    Args:
        file_path: Path of the file to read.
        start_index: Number of leading characters to drop.
        end_index: Number of trailing characters to drop; ``0`` keeps the
            whole tail.

    Returns:
        The object produced by ``json.loads`` for the remaining text.

    Raises:
        json.JSONDecodeError: If the remaining text is not valid JSON.
        OSError: If the file cannot be opened.
    """
    # JSON text is UTF-8 by specification; do not depend on the platform
    # default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read().strip()

    # ``text[start:-0]`` would be an empty string, so a zero end_index needs
    # an open-ended slice instead.
    data = text[start_index:-end_index] if end_index else text[start_index:]
    return json.loads(data)


def read_json_files(folder_path, data_type, start_index, end_index):
    """Parse every ``*{data_type}.js`` file found recursively under a folder.

    Files are parsed concurrently in a thread pool, but the results are
    returned in sorted file-path order so repeated runs over the same folder
    give the same list. Anything derived from row order downstream therefore
    stays stable between runs.

    Args:
        folder_path: Root folder that is searched recursively.
        data_type: File-name suffix (before ``.js``) selecting the files.
        start_index: Leading characters to strip, passed to
            ``read_and_parse_file``.
        end_index: Trailing characters to strip, passed to
            ``read_and_parse_file``.

    Returns:
        A list with one parsed JSON object per matching file; empty when no
        file matches.

    Raises:
        Whatever ``read_and_parse_file`` raises for a file; the first failure
        in path order is re-raised.
    """
    # rglob order is filesystem dependent; sort for reproducibility.
    file_paths = sorted(Path(folder_path).rglob(f"*{data_type}.js"))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(read_and_parse_file, path, start_index, end_index)
            for path in file_paths
        ]
        # Collect in submission order rather than completion order.
        return [future.result() for future in futures]


def read_csv_files(folder_path):
    """Read every ``*.csv`` under a folder (recursively) into one DataFrame.

    Files are read concurrently in a thread pool and concatenated in sorted
    file-path order, so the row order of the result is reproducible.

    Args:
        folder_path: Root folder that is searched recursively.

    Returns:
        A single DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: If no CSV file is found under ``folder_path``.
    """
    # rglob order is filesystem dependent; sort for reproducibility.
    file_paths = sorted(Path(folder_path).rglob("*.csv"))
    if not file_paths:
        # pd.concat([]) would raise a ValueError that does not name the folder.
        raise ValueError(f"No CSV files found under {str(folder_path)!r}")

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            # NOTE(review): this dtype mapping only has an effect if the CSVs
            # really contain columns named "column20"/"column21" - confirm
            # against the actual headers.
            executor.submit(pd.read_csv, path, dtype={"column20": str, "column21": str})
            for path in file_paths
        ]
        # Collect in submission order rather than completion order.
        dataframes = [future.result() for future in futures]

    return pd.concat(dataframes, ignore_index=True)


def combine_squad_data(json_list):
    """Flatten per-match squad objects into one ``{"squad": ...}`` entry per side.

    For every object in ``json_list`` two entries are emitted, first the value
    stored under ``"squadA"`` and then the one under ``"squadB"``. A missing
    key contributes an empty list.

    Args:
        json_list: Iterable of dict-like squad objects.

    Returns:
        A list of ``{"squad": <value>}`` dicts, two per input object.
    """
    return [
        {"squad": match_squads.get(side_key, [])}
        for match_squads in json_list
        for side_key in ("squadA", "squadB")
    ]


def combine_innings_data(innings_1_json_data_list, innings_2_json_data_list):
    """Unwrap first- and second-innings feeds into a single flat list.

    Args:
        innings_1_json_data_list: Objects that each hold their payload under
            the ``"Innings1"`` key.
        innings_2_json_data_list: Objects that each hold their payload under
            the ``"Innings2"`` key.

    Returns:
        All ``"Innings1"`` payloads followed by all ``"Innings2"`` payloads.

    Raises:
        KeyError: If an object lacks its expected innings key.
    """
    first_innings = [feed["Innings1"] for feed in innings_1_json_data_list]
    second_innings = [feed["Innings2"] for feed in innings_2_json_data_list]
    return first_innings + second_innings


def convert_json_list_to_df(json_data_list, key_name):
    """Build one DataFrame from the ``key_name`` entry of every JSON object.

    Args:
        json_data_list: Iterable of mappings; each must contain ``key_name``.
        key_name: Key whose value is turned into a DataFrame per object.

    Returns:
        The per-object frames concatenated with a fresh 0..n-1 index.

    Raises:
        KeyError: If an object has no ``key_name`` entry.
        ValueError: From ``pd.concat`` when ``json_data_list`` is empty.
    """
    frames = [pd.DataFrame(feed[key_name]) for feed in json_data_list]
    return pd.concat(frames, ignore_index=True)

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Root folder of the collected data, supplied through the environment/.env.
data_path = os.getenv("data_path")
if not data_path:
    # Without this guard the f-strings below silently become "None/...",
    # and the loaders fail much later with an unrelated-looking error.
    raise RuntimeError(
        "Environment variable 'data_path' is not set; "
        "define it in the environment or in the .env file."
    )

# Sub-folders searched by the loader functions.
data_feeds_path = f"{data_path}/Data_Feeds"
squad_feeds_path = f"{data_path}/Squad_Feeds"
other_tournament_data_path = f"{data_path}/other-tournament-data"

# File-name suffixes (before ".js") of the collected feed files.
match_schedule_file_name = "matchSchedule"
match_squad_file_name = "squad"
match_summary_file_name = "matchsummary"
match_inning1_file_name = "Innings1"
match_inning2_file_name = "Innings2"

# Keys of the relevant sections inside the JSON objects of the collected data.
match_result_key = "Result"
match_player_key = "squad"
match_summary_key = "MatchSummary"
match_batting_card_key = "BattingCard"
match_extras_key = "Extras"
match_fall_of_wickets_key = "FallOfWickets"
match_wagon_wheel_key = "WagonWheel"
match_partnership_scores_key = "PartnershipScores"
match_partnership_break_key = "PartnershipBreak"
match_bowling_card_key = "BowlingCard"
match_manhattan_graph_key = "ManhattanGraph"
match_manhattan_wickets_key = "ManhattanWickets"
match_over_history_key = "OverHistory"
match_wagon_wheel_summary_key = "WagonWheelSummary"
match_batting_head_to_head_key = "battingheadtohead"
# Previously a copy of the batting key, which made the "bowling" head-to-head
# frame a duplicate of the batting one.
# NOTE(review): confirm the exact key spelling against an Innings feed file.
match_bowling_head_to_head_key = "bowlingheadtohead"

In [None]:
def load_match_result_data():
    """Load the match-schedule feeds and return their result records.

    Reads every ``*{match_schedule_file_name}.js`` file under
    ``data_feeds_path``, dropping 14 leading and 2 trailing characters from
    each file before JSON parsing (presumably the JS callback wrapper around
    the payload - TODO confirm), and concatenates the section stored under
    ``match_result_key``.

    Returns:
        One DataFrame with the result rows of all schedule files.
    """
    schedule_feeds = read_json_files(
        folder_path=data_feeds_path,
        data_type=match_schedule_file_name,
        start_index=14,
        end_index=2,
    )
    return convert_json_list_to_df(schedule_feeds, key_name=match_result_key)


def load_match_player_data():
    """Load the squad feeds and return every listed player as one DataFrame.

    Reads every ``*{match_squad_file_name}.js`` file under
    ``squad_feeds_path`` (8 leading and 2 trailing characters are dropped
    before JSON parsing), splits each match into its two squads with
    ``combine_squad_data`` and concatenates the entries stored under
    ``match_player_key``.

    Returns:
        One DataFrame with the squad rows of all matches.
    """
    squad_feeds = read_json_files(
        folder_path=squad_feeds_path,
        data_type=match_squad_file_name,
        start_index=8,
        end_index=2,
    )
    per_side_squads = combine_squad_data(squad_feeds)
    return convert_json_list_to_df(per_side_squads, key_name=match_player_key)


def load_match_summary_data():
    """Load the match-summary feeds and return their summary records.

    Reads every ``*{match_summary_file_name}.js`` file under
    ``data_feeds_path`` (22 leading and 2 trailing characters are dropped
    before JSON parsing) and concatenates the section stored under
    ``match_summary_key``.

    Returns:
        One DataFrame with the summary rows of all matches.
    """
    summary_feeds = read_json_files(
        folder_path=data_feeds_path,
        data_type=match_summary_file_name,
        start_index=22,
        end_index=2,
    )
    return convert_json_list_to_df(summary_feeds, key_name=match_summary_key)


def load_match_innings_data():
    """Load both innings feeds and split them into one DataFrame per section.

    Every ``*{match_inning1_file_name}.js`` and ``*{match_inning2_file_name}.js``
    file under ``data_feeds_path`` is parsed (10 leading and 2 trailing
    characters are dropped before JSON parsing), the innings payloads are
    merged with ``combine_innings_data`` and each section is concatenated
    across all innings.

    Returns:
        A dict mapping the frame name (``"match_batting_card"``,
        ``"match_extras"``, ...) to the DataFrame built from the matching
        ``*_key`` section. Insertion order is the order listed below.

    Raises:
        KeyError: If an innings object lacks one of the section keys
            (raised by ``convert_json_list_to_df``).
    """
    innings_json_data_list = combine_innings_data(
        read_json_files(data_feeds_path, match_inning1_file_name, 10, 2),
        read_json_files(data_feeds_path, match_inning2_file_name, 10, 2),
    )

    # Output frame name -> key of that section inside each innings object.
    # One table replaces thirteen copy-pasted conversion calls.
    section_keys = {
        "match_batting_card": match_batting_card_key,
        "match_extras": match_extras_key,
        "match_fall_of_wickets": match_fall_of_wickets_key,
        "match_wagon_wheel": match_wagon_wheel_key,
        "match_partnership_scores": match_partnership_scores_key,
        "match_partnership_break": match_partnership_break_key,
        "match_bowling_card": match_bowling_card_key,
        "match_manhattan_graph": match_manhattan_graph_key,
        "match_manhattan_wickets": match_manhattan_wickets_key,
        "match_over_history": match_over_history_key,
        "match_wagon_wheel_summary": match_wagon_wheel_summary_key,
        "match_batting_head_to_head": match_batting_head_to_head_key,
        "match_bowling_head_to_head": match_bowling_head_to_head_key,
    }

    return {
        frame_name: convert_json_list_to_df(innings_json_data_list, section_key)
        for frame_name, section_key in section_keys.items()
    }


def load_match_ball_data():
    """Return all CSV files under ``other_tournament_data_path`` as one DataFrame."""
    return read_csv_files(other_tournament_data_path)

In [None]:
# Load every data set and show its shape and schema.
# DataFrame.info() writes the schema itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line.
match_result_data = load_match_result_data()
print(f"match results: {match_result_data.shape}")
print("Match Results Schema:")
match_result_data.info()

match_summary_data = load_match_summary_data()
print(f"match summaries: {match_summary_data.shape}")
print("Match Summaries Schema:")
match_summary_data.info()

match_player_data = load_match_player_data()
print(f"match players: {match_player_data.shape}")
print("Match Players Schema:")
match_player_data.info()

match_ball_data = load_match_ball_data()
print(f"match balls: {match_ball_data.shape}")
print("Match Balls Schema:")
match_ball_data.info()

innings_dataframes = load_match_innings_data()

for name, dataframe in innings_dataframes.items():
    print(f"\n{name} DataFrame:")
    print("Shape:", dataframe.shape)
    print("Columns:")
    dataframe.info()

In [5]:
# info() prints the schema itself and returns None; print() would add "None".
match_result_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   CompetitionID          118 non-null    object
 1   MatchID                118 non-null    object
 2   MatchTypeID            118 non-null    object
 3   MatchType              118 non-null    object
 4   MatchStatus            118 non-null    object
 5   MatchName              118 non-null    object
 6   MatchDate              118 non-null    object
 7   MatchTime              118 non-null    object
 8   FirstBattingTeamID     118 non-null    object
 9   FirstBattingTeamName   118 non-null    object
 10  HomeTeamLogo           118 non-null    object
 11  SecondBattingTeamID    118 non-null    object
 12  SecondBattingTeamName  118 non-null    object
 13  AwayTeamLogo           118 non-null    object
 14  GroundID               118 non-null    object
 15  GroundName             

In [8]:
# Source column -> venue table column.
venue_column_names = {"GroundID": "src_venue_id", "GroundName": "stadium_name"}

# One row per distinct (ground id, ground name) pair seen in the match results.
venue_df = (
    match_result_data[list(venue_column_names)]
    .drop_duplicates()
    .rename(columns=venue_column_names)
)

In [9]:
# Rich notebook display instead of a plain-text dump of the frame.
venue_df

        src_venue_id                              stadium_name
0   2d76e13f3def482f                         DOCKLANDS STADIUM
2   462dad2f1cf347ce                            BELLERIVE OVAL
3   fd4b3ccd1f524288                  MELBOURNE CRICKET GROUND
4   b94d68f812c1498a                             ADELAIDE OVAL
5   3908affef270416e                               MANUKA OVAL
6   3b32cf9b18174328                   BRISBANE CRICKET GROUND
8   f93b47c36dce4767                             PERTH STADIUM
10  09b368d6cd4f417b                     SYDNEY CRICKET GROUND
12  f79ae8f54fa143cf                            AURORA STADIUM
18  b8224b1ac3ab4974                    TED SUMMERTON  RESERVE
20  abfe4c602ff846b5                 SYDNEY SHOWGROUND STADIUM
30  d223189b34c54f27                    GEELONG CRICKET GROUND
39  9e8d95386515453d                              CARRARA OVAL
41  a537f5d0b2934847  SIMONDS STADIUM, SOUTH GEELONG, VICTORIA
55  d4443da63aaf4426                          METRICON 

In [10]:
import datetime

# Local wall-clock time of this load, stored as text on every venue row.
# `datetime` is the module here (imported just above in this cell).
load_timestamp = f"{datetime.datetime.now():%Y-%m-%d %H:%M:%S}"
venue_df["load_timestamp"] = load_timestamp

In [11]:
# Upper-case stadium names with the vectorised string accessor; unlike
# .apply(lambda x: x.upper()) it leaves missing values as missing instead of
# raising AttributeError on them.
venue_df["stadium_name"] = venue_df["stadium_name"].str.upper()

In [12]:
# Keep one row per stadium name; when the same name occurs on several rows
# (e.g. under more than one src_venue_id) the first occurrence is kept.
venue_df = venue_df.drop_duplicates(subset="stadium_name", keep="first")

In [14]:
# Number of venues that already exist (hard-coded) and number found in this session.
# NOTE(review): the 50 is a fixed value, not read from the stored table - confirm it is current.
existing_venues_count = 50
current_session_venues_count = venue_df.shape[0]

# New ids continue right after the existing ones: 50, 51, ...
next_free_venue_id = existing_venues_count + current_session_venues_count
venue_ids = range(existing_venues_count, next_free_venue_id)

# Attach the generated ids to the venue rows, in their current row order.
venue_df["venue_id"] = venue_ids

In [15]:
# Rich notebook display instead of a plain-text dump of the frame.
venue_df

        src_venue_id                              stadium_name  \
0   2d76e13f3def482f                         DOCKLANDS STADIUM   
2   462dad2f1cf347ce                            BELLERIVE OVAL   
3   fd4b3ccd1f524288                  MELBOURNE CRICKET GROUND   
4   b94d68f812c1498a                             ADELAIDE OVAL   
5   3908affef270416e                               MANUKA OVAL   
6   3b32cf9b18174328                   BRISBANE CRICKET GROUND   
8   f93b47c36dce4767                             PERTH STADIUM   
10  09b368d6cd4f417b                     SYDNEY CRICKET GROUND   
12  f79ae8f54fa143cf                            AURORA STADIUM   
18  b8224b1ac3ab4974                    TED SUMMERTON  RESERVE   
20  abfe4c602ff846b5                 SYDNEY SHOWGROUND STADIUM   
30  d223189b34c54f27                    GEELONG CRICKET GROUND   
39  9e8d95386515453d                              CARRARA OVAL   
41  a537f5d0b2934847  SIMONDS STADIUM, SOUTH GEELONG, VICTORIA   
55  d4443d

In [16]:
# Promote venue_id to the index. Non-inplace and guarded so the cell can be
# re-run: once venue_id is the index it is no longer a column, and calling
# set_index("venue_id") again would raise KeyError.
if venue_df.index.name != "venue_id":
    venue_df = venue_df.set_index("venue_id")

# Show the updated venue table (rich notebook display).
venue_df

              src_venue_id                              stadium_name  \
venue_id                                                               
50        2d76e13f3def482f                         DOCKLANDS STADIUM   
51        462dad2f1cf347ce                            BELLERIVE OVAL   
52        fd4b3ccd1f524288                  MELBOURNE CRICKET GROUND   
53        b94d68f812c1498a                             ADELAIDE OVAL   
54        3908affef270416e                               MANUKA OVAL   
55        3b32cf9b18174328                   BRISBANE CRICKET GROUND   
56        f93b47c36dce4767                             PERTH STADIUM   
57        09b368d6cd4f417b                     SYDNEY CRICKET GROUND   
58        f79ae8f54fa143cf                            AURORA STADIUM   
59        b8224b1ac3ab4974                    TED SUMMERTON  RESERVE   
60        abfe4c602ff846b5                 SYDNEY SHOWGROUND STADIUM   
61        d223189b34c54f27                    GEELONG CRICKET GR

In [17]:
import duckdb
from deltalake import DeltaTable
from deltalake.writer import write_deltalake
import pickle

import os
from dotenv import load_dotenv

# Root directory of the local Delta Lake, supplied through the environment/.env.
local_delta_lake_path = os.getenv("local_delta_lake_path")
if not local_delta_lake_path:
    # Otherwise table paths silently become "None/<table_name>" and data is
    # written into a directory literally named "None".
    raise RuntimeError(
        "Environment variable 'local_delta_lake_path' is not set; "
        "define it in the environment or in the .env file."
    )


def update_existing_data(serialized_df, table_name, mode="append"):
    """Write a DataFrame to the Delta table ``table_name`` in the local lake.

    Args:
        serialized_df: Either the pickled bytes of a DataFrame (the original
            calling convention) or the DataFrame itself.
        table_name: Name of the table directory below
            ``local_delta_lake_path``.
        mode: Write mode forwarded to ``write_deltalake``. Defaults to
            ``"append"`` so that writing to a table that already exists adds
            rows; the library default (``"error"``) refuses to touch an
            existing table.

    Returns:
        None.
    """
    if isinstance(serialized_df, (bytes, bytearray, memoryview)):
        # SECURITY: pickle.loads can execute arbitrary code. Only pass bytes
        # that were produced by this notebook, never data from outside.
        df = pickle.loads(serialized_df)
    else:
        # Already a DataFrame; no pickle round-trip needed.
        df = serialized_df

    table_path = f"{local_delta_lake_path}/{table_name}"
    write_deltalake(table_path, df, mode=mode)

In [18]:
# Persist the venue table: update_existing_data takes the frame in pickled form.
serialized_df = pickle.dumps(venue_df)
update_existing_data(serialized_df=serialized_df, table_name="venues")