In [1]:
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from multielo import MultiElo, Tracker
from mktools.get_data import load_data_pd
from mktools.validate_data import validate_bad_uids
import plotly.express as px
from alive_progress import alive_it
from bs4 import BeautifulSoup

# Load Variables from .env file
load_dotenv()

True

In [2]:
form_df = load_data_pd(sheet_name="form_data", sheet_id=os.environ["SHEET_ID"])

form_df = form_df.drop(
    columns=[
        x
        for x in form_df.columns
        if x.__contains__("Unnamed") or x.__contains__("Score")
    ]
)

form_df["Timestamp"] = pd.to_datetime(form_df["Timestamp"])

form_df.head()

Unnamed: 0,Timestamp,NEW_SESSION,MAP,PLAYERS,PLAYERS_2 [1ST],PLAYERS_2 [2ND],CHARACTERS_2 [1ST],CHARACTERS_2 [2ND],PLAYERS_3 [1ST],PLAYERS_3 [2ND],...,CHARACTERS_3 [2ND],CHARACTERS_3 [3RD],PLAYERS_4 [1ST],PLAYERS_4 [2ND],PLAYERS_4 [3RD],PLAYERS_4 [4TH],CHARACTERS_4 [1ST],CHARACTERS_4 [2ND],CHARACTERS_4 [3RD],CHARACTERS_4 [4TH]
0,2024-07-17 19:15:48,NO,Koopa Troopa Beach,3,,,,,Cooper,Regan,...,Yoshi,Toad,,,,,,,,
1,2024-07-17 20:01:53,NO,Choco Mountain,4,,,,,,,...,,,Cooper,Connor,Blake,Triston,Luigi,Toad,Peach,Yoshi
2,2024-07-17 20:15:00,NO,D.K.'s Jungle,4,,,,,,,...,,,Cole,Regan,Cooper,Triston,Toad,Yoshi,Mario,Peach
3,2024-07-17 20:42:27,NO,Kalimari Desert,4,,,,,,,...,,,Regan,Cole,Cooper,Blake,Yoshi,Toad,Mario,Peach
4,2024-07-17 21:13:04,NO,Frappe Snowland,3,,,,,Cooper,Regan,...,Toad,Peach,,,,,,,,


In [3]:
form_df.columns = [
    x.replace(" ", "_").replace("[", "").replace("]", "") if x.__contains__("[") else x
    for x in form_df.columns
]

form_df

Unnamed: 0,Timestamp,NEW_SESSION,MAP,PLAYERS,PLAYERS_2_1ST,PLAYERS_2_2ND,CHARACTERS_2_1ST,CHARACTERS_2_2ND,PLAYERS_3_1ST,PLAYERS_3_2ND,...,CHARACTERS_3_2ND,CHARACTERS_3_3RD,PLAYERS_4_1ST,PLAYERS_4_2ND,PLAYERS_4_3RD,PLAYERS_4_4TH,CHARACTERS_4_1ST,CHARACTERS_4_2ND,CHARACTERS_4_3RD,CHARACTERS_4_4TH
0,2024-07-17 19:15:48,NO,Koopa Troopa Beach,3,,,,,Cooper,Regan,...,Yoshi,Toad,,,,,,,,
1,2024-07-17 20:01:53,NO,Choco Mountain,4,,,,,,,...,,,Cooper,Connor,Blake,Triston,Luigi,Toad,Peach,Yoshi
2,2024-07-17 20:15:00,NO,D.K.'s Jungle,4,,,,,,,...,,,Cole,Regan,Cooper,Triston,Toad,Yoshi,Mario,Peach
3,2024-07-17 20:42:27,NO,Kalimari Desert,4,,,,,,,...,,,Regan,Cole,Cooper,Blake,Yoshi,Toad,Mario,Peach
4,2024-07-17 21:13:04,NO,Frappe Snowland,3,,,,,Cooper,Regan,...,Toad,Peach,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2105,2025-01-07 20:52:41,NO,Frappe Snowland,4,,,,,,,...,,,Regan,Cole,Cooper,Blake,Toad,Peach,Mario,Yoshi
2106,2025-01-07 21:44:34,NO,Royal Raceway,4,,,,,,,...,,,Blake,Regan,Antonio,Connor,Yoshi,Toad,Bowser,Peach
2107,2025-01-07 22:04:47,NO,Toad's Turnpike,4,,,,,,,...,,,Regan,Blake,Antonio,Triston,Toad,Yoshi,Bowser,Peach
2108,2025-01-07 22:18:27,NO,Yoshi Valley,4,,,,,,,...,,,Antonio,Cole,Cooper,Blake,Peach,Toad,Yoshi,Bowser


In [4]:
fdf = form_df.copy()

In [5]:
dfs = []

for game_type in [2, 3, 4]:
    tdf = fdf[fdf["PLAYERS"] == game_type].copy().reset_index(drop=True)

    dfs.append(tdf)

In [6]:
two_p = dfs[0]
three_p = dfs[1]
four_p = dfs[2]

two_p.head()

Unnamed: 0,Timestamp,NEW_SESSION,MAP,PLAYERS,PLAYERS_2_1ST,PLAYERS_2_2ND,CHARACTERS_2_1ST,CHARACTERS_2_2ND,PLAYERS_3_1ST,PLAYERS_3_2ND,...,CHARACTERS_3_2ND,CHARACTERS_3_3RD,PLAYERS_4_1ST,PLAYERS_4_2ND,PLAYERS_4_3RD,PLAYERS_4_4TH,CHARACTERS_4_1ST,CHARACTERS_4_2ND,CHARACTERS_4_3RD,CHARACTERS_4_4TH
0,2024-07-18 21:05:00,NO,Toad's Turnpike,2,Cooper,Regan,Peach,Toad,,,...,,,,,,,,,,
1,2024-07-21 16:32:47,YES,Bowser's Castle,2,Matt,Cooper,Toad,Peach,,,...,,,,,,,,,,
2,2024-07-21 16:56:38,NO,Yoshi Valley,2,Matt,Cooper,Peach,Yoshi,,,...,,,,,,,,,,
3,2024-07-24 13:46:27,YES,Wario Stadium,2,Regan,Martin,Toad,Yoshi,,,...,,,,,,,,,,
4,2024-07-24 13:47:06,NO,Toad's Turnpike,2,Regan,Martin,D.K.,Yoshi,,,...,,,,,,,,,,


In [7]:
from typing import Literal


def transform_form_data(
    df: pd.DataFrame, game_type_replace_string: Literal["_2_", "_3_", "_4_"]
) -> pd.DataFrame:

    idf = df.copy()

    idf_out = (
        idf.dropna(axis=1, how="all")
        .reset_index(drop=True)
        .reset_index()
        .rename(columns={"index": "ID"})
    )

    idf_out.columns = [
        (
            x.replace(game_type_replace_string, "_")
            if x.__contains__(game_type_replace_string)
            else x.upper()
        )
        for x in idf_out.columns
    ]

    idf_out["TIMESTAMP"] = (
        pd.to_datetime(idf_out["TIMESTAMP"])
        .dt.tz_localize("US/Eastern")
        .dt.tz_convert("UTC")
    )

    idf_out["ID"] = idf_out["ID"] + 1

    return idf_out

In [8]:
two_p_out = transform_form_data(df=two_p, game_type_replace_string="_2_")
three_p_out = transform_form_data(df=three_p, game_type_replace_string="_3_")
four_p_out = transform_form_data(df=four_p, game_type_replace_string="_4_")

two_p_out

Unnamed: 0,ID,TIMESTAMP,NEW_SESSION,MAP,PLAYERS,PLAYERS_1ST,PLAYERS_2ND,CHARACTERS_1ST,CHARACTERS_2ND
0,1,2024-07-19 01:05:00+00:00,NO,Toad's Turnpike,2,Cooper,Regan,Peach,Toad
1,2,2024-07-21 20:32:47+00:00,YES,Bowser's Castle,2,Matt,Cooper,Toad,Peach
2,3,2024-07-21 20:56:38+00:00,NO,Yoshi Valley,2,Matt,Cooper,Peach,Yoshi
3,4,2024-07-24 17:46:27+00:00,YES,Wario Stadium,2,Regan,Martin,Toad,Yoshi
4,5,2024-07-24 17:47:06+00:00,NO,Toad's Turnpike,2,Regan,Martin,D.K.,Yoshi
...,...,...,...,...,...,...,...,...,...
133,134,2024-12-29 02:07:05+00:00,NO,Moo Moo Farm,2,Konnor,Garrett,Yoshi,Toad
134,135,2024-12-29 02:43:29+00:00,NO,Toad's Turnpike,2,Konnor,Regan,Yoshi,Toad
135,136,2024-12-30 01:39:28+00:00,NO,Banshee Boardwalk,2,Regan,Cole,Yoshi,Toad
136,137,2025-01-06 00:38:45+00:00,NO,Bowser's Castle,2,Cooper,Blake,Peach,Yoshi


In [9]:
three_p_out

Unnamed: 0,ID,TIMESTAMP,NEW_SESSION,MAP,PLAYERS,PLAYERS_1ST,PLAYERS_2ND,PLAYERS_3RD,CHARACTERS_1ST,CHARACTERS_2ND,CHARACTERS_3RD
0,1,2024-07-17 23:15:48+00:00,NO,Koopa Troopa Beach,3,Cooper,Regan,Garrett,Peach,Yoshi,Toad
1,2,2024-07-18 01:13:04+00:00,NO,Frappe Snowland,3,Cooper,Regan,Blake,Mario,Toad,Peach
2,3,2024-07-18 23:42:21+00:00,NO,Sherbet Land,3,Regan,Blake,Cooper,Yoshi,Peach,Toad
3,4,2024-07-19 02:38:14+00:00,NO,Koopa Troopa Beach,3,Regan,Cooper,Konnor,Peach,Toad,Yoshi
4,5,2024-07-20 22:46:58+00:00,NO,Kalimari Desert,3,Regan,Triston,Cooper,Peach,Yoshi,Mario
...,...,...,...,...,...,...,...,...,...,...,...
189,190,2025-01-05 22:41:17+00:00,NO,Koopa Troopa Beach,3,Regan,Antonio,Garrett,Yoshi,Toad,Peach
190,191,2025-01-06 03:27:59+00:00,NO,Yoshi Valley,3,Regan,Connor,Garrett,Yoshi,Peach,Toad
191,192,2025-01-06 04:09:11+00:00,NO,D.K.'s Jungle,3,Regan,Blake,Garrett,Yoshi,Peach,Toad
192,193,2025-01-07 04:30:26+00:00,NO,Moo Moo Farm,3,Konnor,Connor,Cole,Toad,Peach,Yoshi


In [10]:
for idx, odf in enumerate([two_p_out, three_p_out, four_p_out]):
    v = idx + 2
    odf.to_csv(rf"C:\Users\Cooper\sandbox\mkstream\form_data_migration\data_{v}P.csv")

In [11]:
[len(x) for x in two_p_out["MAP"].unique()]

[15, 15, 12, 13, 13, 14, 15, 18, 12, 13, 13, 17, 12, 15, 13]

In [12]:
[len(x) for x in four_p_out["MAP"].unique()]

[14, 13, 15, 18, 13, 13, 12, 15, 12, 17, 15, 15, 13, 12, 13]

In [13]:
cat_df = (
    pd.concat([two_p_out, three_p_out, four_p_out])
    .sort_values(by="TIMESTAMP")
    .drop(columns=["ID"])
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={"index": "ID"})
)[
    [
        "ID",
        "TIMESTAMP",
        "NEW_SESSION",
        "MAP",
        "PLAYERS",
        "PLAYERS_1ST",
        "PLAYERS_2ND",
        "PLAYERS_3RD",
        "PLAYERS_4TH",
        "CHARACTERS_1ST",
        "CHARACTERS_2ND",
        "CHARACTERS_3RD",
        "CHARACTERS_4TH",
    ]
]

cat_df["ID"] = cat_df["ID"] + 1

cat_df.head()

Unnamed: 0,ID,TIMESTAMP,NEW_SESSION,MAP,PLAYERS,PLAYERS_1ST,PLAYERS_2ND,PLAYERS_3RD,PLAYERS_4TH,CHARACTERS_1ST,CHARACTERS_2ND,CHARACTERS_3RD,CHARACTERS_4TH
0,1,2024-07-17 23:15:48+00:00,NO,Koopa Troopa Beach,3,Cooper,Regan,Garrett,,Peach,Yoshi,Toad,
1,2,2024-07-18 00:01:53+00:00,NO,Choco Mountain,4,Cooper,Connor,Blake,Triston,Luigi,Toad,Peach,Yoshi
2,3,2024-07-18 00:15:00+00:00,NO,D.K.'s Jungle,4,Cole,Regan,Cooper,Triston,Toad,Yoshi,Mario,Peach
3,4,2024-07-18 00:42:27+00:00,NO,Kalimari Desert,4,Regan,Cole,Cooper,Blake,Yoshi,Toad,Mario,Peach
4,5,2024-07-18 01:13:04+00:00,NO,Frappe Snowland,3,Cooper,Regan,Blake,,Mario,Toad,Peach,


In [14]:
tdf = cat_df.copy()

In [15]:
def fill_new_session(
    df: pd.DataFrame, drop_window_start_column: bool = True
) -> pd.DataFrame:

    tdf = df.copy()

    # Ensure 'TIMESTAMP' column is in datetime format (UTC)
    tdf["TIMESTAMP"] = pd.to_datetime(tdf["TIMESTAMP"])

    # Create a new column for NEW_SESSION and initially mark all as 'NO'
    tdf["NEW_SESSION"] = "NO"

    # Generate the window start (07:00 UTC) for each game
    tdf["window_start"] = tdf["TIMESTAMP"].dt.floor("D") + pd.Timedelta(hours=7)

    # Iterate through each row to identify the first game in each 24-hour window
    for idx, row in tdf.iterrows():

        window_start = row["window_start"]

        window_end = window_start + pd.Timedelta(days=1) - pd.Timedelta(nanoseconds=1)

        # Get all games within the 24-hour window
        games_in_window = tdf[
            (tdf["TIMESTAMP"] >= window_start) & (tdf["TIMESTAMP"] <= window_end)
        ]

        # Mark the first game in the window as 'YES'
        if row["TIMESTAMP"] == games_in_window["TIMESTAMP"].min():
            tdf.at[idx, "NEW_SESSION"] = "YES"

    if drop_window_start_column:
        tdf = tdf.drop(columns=["window_start"])

    return tdf

In [16]:
new_sesh = fill_new_session(df=tdf, drop_window_start_column=False)

In [17]:
new_sesh.to_csv(rf"C:\Users\Cooper\sandbox\mkstream\form_data_migration\temp_hmmm.csv")

In [18]:
new_sesh.to_clipboard(index=False)

In [19]:
new_sesh[["ID", "TIMESTAMP", "NEW_SESSION"]]

Unnamed: 0,ID,TIMESTAMP,NEW_SESSION
0,1,2024-07-17 23:15:48+00:00,YES
1,2,2024-07-18 00:01:53+00:00,NO
2,3,2024-07-18 00:15:00+00:00,NO
3,4,2024-07-18 00:42:27+00:00,NO
4,5,2024-07-18 01:13:04+00:00,NO
...,...,...,...
2105,2106,2025-01-08 01:52:41+00:00,NO
2106,2107,2025-01-08 02:44:34+00:00,NO
2107,2108,2025-01-08 03:04:47+00:00,NO
2108,2109,2025-01-08 03:18:27+00:00,NO


In [20]:
new_sesh[new_sesh["NEW_SESSION"] != cat_df["NEW_SESSION"]]

Unnamed: 0,ID,TIMESTAMP,NEW_SESSION,MAP,PLAYERS,PLAYERS_1ST,PLAYERS_2ND,PLAYERS_3RD,PLAYERS_4TH,CHARACTERS_1ST,CHARACTERS_2ND,CHARACTERS_3RD,CHARACTERS_4TH,window_start
0,1,2024-07-17 23:15:48+00:00,YES,Koopa Troopa Beach,3,Cooper,Regan,Garrett,,Peach,Yoshi,Toad,,2024-07-17 07:00:00+00:00
329,330,2024-08-12 01:03:28+00:00,NO,Royal Raceway,4,Regan,Connor,Domingo,Triston,Toad,Bowser,Peach,Yoshi,2024-08-12 07:00:00+00:00
404,405,2024-08-18 07:13:37+00:00,YES,Wario Stadium,3,Regan,Blake,Anthony,,Peach,Toad,Yoshi,,2024-08-18 07:00:00+00:00
406,407,2024-08-18 18:57:00+00:00,NO,Toad's Turnpike,4,Cooper,Connor,Blake,Domingo,Toad,Peach,Yoshi,Bowser,2024-08-18 07:00:00+00:00
467,468,2024-08-22 07:20:45+00:00,YES,D.K.'s Jungle,2,Domingo,Konnor,,,Yoshi,Peach,,,2024-08-22 07:00:00+00:00
468,469,2024-08-22 21:39:03+00:00,NO,Toad's Turnpike,4,Blake,Cooper,Regan,Garrett,Toad,Peach,Luigi,Yoshi,2024-08-22 07:00:00+00:00
1010,1011,2024-09-28 04:14:46+00:00,NO,Wario Stadium,4,Cole,Colton,Konnor,Triston,Luigi,Toad,Yoshi,Peach,2024-09-28 07:00:00+00:00
1011,1012,2024-09-28 17:36:28+00:00,YES,Toad's Turnpike,4,Blake,Regan,Garrett,Randy,Yoshi,Peach,Toad,Wario,2024-09-28 07:00:00+00:00
1192,1193,2024-10-15 00:45:57+00:00,NO,Yoshi Valley,4,Colton,Cole,Triston,Garrett,Yoshi,Toad,Peach,Luigi,2024-10-15 07:00:00+00:00
1307,1308,2024-10-25 00:00:34+00:00,NO,D.K.'s Jungle,4,Cole,Connor,Cooper,Luke,Yoshi,Toad,Peach,Mario,2024-10-25 07:00:00+00:00


In [21]:
id = 330

new_sesh[new_sesh["ID"].isin(range(id - 5, id + 10, 1))]

Unnamed: 0,ID,TIMESTAMP,NEW_SESSION,MAP,PLAYERS,PLAYERS_1ST,PLAYERS_2ND,PLAYERS_3RD,PLAYERS_4TH,CHARACTERS_1ST,CHARACTERS_2ND,CHARACTERS_3RD,CHARACTERS_4TH,window_start
324,325,2024-08-11 00:59:46+00:00,NO,Wario Stadium,4,Blake,Cole,Colton,Regan,Yoshi,Peach,Bowser,Toad,2024-08-11 07:00:00+00:00
325,326,2024-08-11 01:47:17+00:00,NO,Kalimari Desert,4,Cole,Regan,Blake,Randy,Peach,Toad,Yoshi,D.K.,2024-08-11 07:00:00+00:00
326,327,2024-08-11 02:08:06+00:00,NO,Frappe Snowland,4,Cole,Blake,Regan,Colton,Yoshi,Peach,Toad,Bowser,2024-08-11 07:00:00+00:00
327,328,2024-08-11 03:27:31+00:00,NO,Yoshi Valley,4,Cole,Colton,Blake,Randy,Yoshi,Peach,Toad,D.K.,2024-08-11 07:00:00+00:00
328,329,2024-08-11 20:57:54+00:00,YES,Wario Stadium,3,Blake,Connor,Domingo,,Yoshi,Peach,Toad,,2024-08-11 07:00:00+00:00
329,330,2024-08-12 01:03:28+00:00,NO,Royal Raceway,4,Regan,Connor,Domingo,Triston,Toad,Bowser,Peach,Yoshi,2024-08-12 07:00:00+00:00
330,331,2024-08-12 02:27:09+00:00,NO,Kalimari Desert,3,Konnor,Regan,Domingo,,Yoshi,Peach,Toad,,2024-08-12 07:00:00+00:00
331,332,2024-08-12 03:00:55+00:00,NO,Choco Mountain,3,Regan,Konnor,Domingo,,Yoshi,Peach,Toad,,2024-08-12 07:00:00+00:00
332,333,2024-08-12 22:30:40+00:00,YES,Wario Stadium,4,Blake,Regan,Connor,Cole,Yoshi,Toad,Bowser,Peach,2024-08-12 07:00:00+00:00
333,334,2024-08-12 22:42:12+00:00,NO,Yoshi Valley,3,Cole,Regan,Connor,,Peach,Toad,Yoshi,,2024-08-12 07:00:00+00:00


In [22]:
id = 330

cat_df[cat_df["ID"].isin(range(id-5, id+10, 1))]

Unnamed: 0,ID,TIMESTAMP,NEW_SESSION,MAP,PLAYERS,PLAYERS_1ST,PLAYERS_2ND,PLAYERS_3RD,PLAYERS_4TH,CHARACTERS_1ST,CHARACTERS_2ND,CHARACTERS_3RD,CHARACTERS_4TH
324,325,2024-08-11 00:59:46+00:00,NO,Wario Stadium,4,Blake,Cole,Colton,Regan,Yoshi,Peach,Bowser,Toad
325,326,2024-08-11 01:47:17+00:00,NO,Kalimari Desert,4,Cole,Regan,Blake,Randy,Peach,Toad,Yoshi,D.K.
326,327,2024-08-11 02:08:06+00:00,NO,Frappe Snowland,4,Cole,Blake,Regan,Colton,Yoshi,Peach,Toad,Bowser
327,328,2024-08-11 03:27:31+00:00,NO,Yoshi Valley,4,Cole,Colton,Blake,Randy,Yoshi,Peach,Toad,D.K.
328,329,2024-08-11 20:57:54+00:00,YES,Wario Stadium,3,Blake,Connor,Domingo,,Yoshi,Peach,Toad,
329,330,2024-08-12 01:03:28+00:00,YES,Royal Raceway,4,Regan,Connor,Domingo,Triston,Toad,Bowser,Peach,Yoshi
330,331,2024-08-12 02:27:09+00:00,NO,Kalimari Desert,3,Konnor,Regan,Domingo,,Yoshi,Peach,Toad,
331,332,2024-08-12 03:00:55+00:00,NO,Choco Mountain,3,Regan,Konnor,Domingo,,Yoshi,Peach,Toad,
332,333,2024-08-12 22:30:40+00:00,YES,Wario Stadium,4,Blake,Regan,Connor,Cole,Yoshi,Toad,Bowser,Peach
333,334,2024-08-12 22:42:12+00:00,NO,Yoshi Valley,3,Cole,Regan,Connor,,Peach,Toad,Yoshi,


In [23]:
cat_df[["ID", "TIMESTAMP", "NEW_SESSION"]]

Unnamed: 0,ID,TIMESTAMP,NEW_SESSION
0,1,2024-07-17 23:15:48+00:00,NO
1,2,2024-07-18 00:01:53+00:00,NO
2,3,2024-07-18 00:15:00+00:00,NO
3,4,2024-07-18 00:42:27+00:00,NO
4,5,2024-07-18 01:13:04+00:00,NO
...,...,...,...
2105,2106,2025-01-08 01:52:41+00:00,NO
2106,2107,2025-01-08 02:44:34+00:00,NO
2107,2108,2025-01-08 03:04:47+00:00,NO
2108,2109,2025-01-08 03:18:27+00:00,NO


In [24]:
cat_df.columns

Index(['ID', 'TIMESTAMP', 'NEW_SESSION', 'MAP', 'PLAYERS', 'PLAYERS_1ST',
       'PLAYERS_2ND', 'PLAYERS_3RD', 'PLAYERS_4TH', 'CHARACTERS_1ST',
       'CHARACTERS_2ND', 'CHARACTERS_3RD', 'CHARACTERS_4TH'],
      dtype='object')

In [25]:
cat_df

Unnamed: 0,ID,TIMESTAMP,NEW_SESSION,MAP,PLAYERS,PLAYERS_1ST,PLAYERS_2ND,PLAYERS_3RD,PLAYERS_4TH,CHARACTERS_1ST,CHARACTERS_2ND,CHARACTERS_3RD,CHARACTERS_4TH
0,1,2024-07-17 23:15:48+00:00,NO,Koopa Troopa Beach,3,Cooper,Regan,Garrett,,Peach,Yoshi,Toad,
1,2,2024-07-18 00:01:53+00:00,NO,Choco Mountain,4,Cooper,Connor,Blake,Triston,Luigi,Toad,Peach,Yoshi
2,3,2024-07-18 00:15:00+00:00,NO,D.K.'s Jungle,4,Cole,Regan,Cooper,Triston,Toad,Yoshi,Mario,Peach
3,4,2024-07-18 00:42:27+00:00,NO,Kalimari Desert,4,Regan,Cole,Cooper,Blake,Yoshi,Toad,Mario,Peach
4,5,2024-07-18 01:13:04+00:00,NO,Frappe Snowland,3,Cooper,Regan,Blake,,Mario,Toad,Peach,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2105,2106,2025-01-08 01:52:41+00:00,NO,Frappe Snowland,4,Regan,Cole,Cooper,Blake,Toad,Peach,Mario,Yoshi
2106,2107,2025-01-08 02:44:34+00:00,NO,Royal Raceway,4,Blake,Regan,Antonio,Connor,Yoshi,Toad,Bowser,Peach
2107,2108,2025-01-08 03:04:47+00:00,NO,Toad's Turnpike,4,Regan,Blake,Antonio,Triston,Toad,Yoshi,Bowser,Peach
2108,2109,2025-01-08 03:18:27+00:00,NO,Yoshi Valley,4,Antonio,Cole,Cooper,Blake,Peach,Toad,Yoshi,Bowser


In [26]:
cat_df_out = cat_df[
    [
        "ID",
        "TIMESTAMP",
        "NEW_SESSION",
        "MAP",
        "PLAYERS",
        "PLAYERS_1ST",
        "PLAYERS_2ND",
        "PLAYERS_3RD",
        "PLAYERS_4TH",
        "CHARACTERS_1ST",
        "CHARACTERS_2ND",
        "CHARACTERS_3RD",
        "CHARACTERS_4TH",
    ]
].copy()

In [27]:
cat_df_out

Unnamed: 0,ID,TIMESTAMP,NEW_SESSION,MAP,PLAYERS,PLAYERS_1ST,PLAYERS_2ND,PLAYERS_3RD,PLAYERS_4TH,CHARACTERS_1ST,CHARACTERS_2ND,CHARACTERS_3RD,CHARACTERS_4TH
0,1,2024-07-17 23:15:48+00:00,NO,Koopa Troopa Beach,3,Cooper,Regan,Garrett,,Peach,Yoshi,Toad,
1,2,2024-07-18 00:01:53+00:00,NO,Choco Mountain,4,Cooper,Connor,Blake,Triston,Luigi,Toad,Peach,Yoshi
2,3,2024-07-18 00:15:00+00:00,NO,D.K.'s Jungle,4,Cole,Regan,Cooper,Triston,Toad,Yoshi,Mario,Peach
3,4,2024-07-18 00:42:27+00:00,NO,Kalimari Desert,4,Regan,Cole,Cooper,Blake,Yoshi,Toad,Mario,Peach
4,5,2024-07-18 01:13:04+00:00,NO,Frappe Snowland,3,Cooper,Regan,Blake,,Mario,Toad,Peach,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2105,2106,2025-01-08 01:52:41+00:00,NO,Frappe Snowland,4,Regan,Cole,Cooper,Blake,Toad,Peach,Mario,Yoshi
2106,2107,2025-01-08 02:44:34+00:00,NO,Royal Raceway,4,Blake,Regan,Antonio,Connor,Yoshi,Toad,Bowser,Peach
2107,2108,2025-01-08 03:04:47+00:00,NO,Toad's Turnpike,4,Regan,Blake,Antonio,Triston,Toad,Yoshi,Bowser,Peach
2108,2109,2025-01-08 03:18:27+00:00,NO,Yoshi Valley,4,Antonio,Cole,Cooper,Blake,Peach,Toad,Yoshi,Bowser


In [28]:
cat_df_out.to_csv(rf"C:\Users\Cooper\sandbox\mkstream\form_data_migration\form_data.csv", index=False)

In [29]:
cat_df_out.drop(columns=["ID"]).to_csv(rf"C:\Users\Cooper\sandbox\mkstream\form_data_migration\form_data_no_id_col.csv", index=False)

In [30]:
pd.Series([len(x) for x in four_p_out["CHARACTERS_1ST"].unique()]).max()

6

In [31]:
four_p_out["CHARACTERS_1ST"].unique()

array(['Luigi', 'Toad', 'Yoshi', 'Wario', 'Peach', 'Mario', 'Bowser',
       'D.K.'], dtype=object)

In [32]:
pd.Series([len(x) for x in four_p_out["PLAYERS_1ST"].unique()]).max()

8

## Octets Check

In [33]:
octets_df = load_data_pd(sheet_name="octets", sheet_id=os.environ["SHEET_ID"])

octets_df = octets_df.drop(
    columns=[
        x
        for x in octets_df.columns
        if x.__contains__("Unnamed")
    ]
)

octets_df

Unnamed: 0,NAME,COUNT,SEASON,SUID
0,Cooper,8,0,1
1,Cooper,8,0,4
2,Cooper,11,0,10
3,Cooper,9,0,13
4,Cooper,9,0,19
...,...,...,...,...
82,Regan,8,14,848
83,Cole,9,14,849
84,Luke,8,14,850
85,Cooper,9,14,858


In [34]:
octets_gb = (
    octets_df.groupby(by=["NAME"])
    .agg(TOTAL_OCTETS=pd.NamedAgg("COUNT", "count"))
    .reset_index()
    .sort_values(by=["TOTAL_OCTETS"], ascending=False)
    .reset_index(drop=True)
)

octets_gb

Unnamed: 0,NAME,TOTAL_OCTETS
0,Cooper,63
1,Cole,9
2,Regan,5
3,Blake,3
4,Connor,3
5,Matt,3
6,Luke,1


In [35]:
octets_per_season_gb = (
    octets_df.groupby(by=["NAME", "SEASON"])
    .agg(
        TOTAL_OCTETS=pd.NamedAgg("COUNT", "count"),
        SUID_LIST=pd.NamedAgg("SUID", list),
        AVERAGE_WINS_PER_OCTET=pd.NamedAgg("COUNT", "mean"),
        MEDIAN_WINS_PER_OCTET=pd.NamedAgg("COUNT", "median"),
        MAX_WINS=pd.NamedAgg("COUNT", "max"),
    )
    .reset_index()
    .sort_values(by=["SEASON", "TOTAL_OCTETS"], ascending=[True, False])
    .reset_index(drop=True)
)

octets_per_season_gb["SUID_LIST"] = (
    octets_per_season_gb["SUID_LIST"]
    .astype(str)
    .str.replace("[", "")
    .str.replace("]", "")
)

octets_per_season_gb

Unnamed: 0,NAME,SEASON,TOTAL_OCTETS,SUID_LIST,AVERAGE_WINS_PER_OCTET,MEDIAN_WINS_PER_OCTET,MAX_WINS
0,Cooper,0,9,"1, 4, 10, 13, 19, 20, 23, 26, 39",9.111111,9.0,11
1,Blake,0,1,40,9.0,9.0,9
2,Cole,0,1,38,8.0,8.0,8
3,Connor,0,1,22,8.0,8.0,8
4,Regan,1,1,62,8.0,8.0,8
5,Connor,2,1,210,8.0,8.0,8
6,Cooper,2,1,195,8.0,8.0,8
7,Regan,2,1,177,9.0,9.0,9
8,Blake,3,2,"250, 263",9.0,9.0,9
9,Cole,3,1,270,8.0,8.0,8


### Save

In [36]:
before_or_after = "after"

octets_gb.to_csv(
    rf"C:\Users\Cooper\sandbox\mkstream\form_data_migration\octets_total_groupby_{before_or_after}.csv",
    index=False,
)

octets_per_season_gb.to_csv(
    rf"C:\Users\Cooper\sandbox\mkstream\form_data_migration\octets_per_season_groupby_{before_or_after}.csv",
    index=False,
)

### Compare

In [37]:
octets_gb_before = pd.read_csv(
    r"C:\Users\Cooper\sandbox\mkstream\form_data_migration\octets_total_groupby_before.csv"
)

octets_per_season_gb_before = pd.read_csv(
    r"C:\Users\Cooper\sandbox\mkstream\form_data_migration\octets_per_season_groupby_before.csv"
)

octets_gb_after = pd.read_csv(
    r"C:\Users\Cooper\sandbox\mkstream\form_data_migration\octets_total_groupby_after.csv"
)

octets_per_season_gb_after = pd.read_csv(
    r"C:\Users\Cooper\sandbox\mkstream\form_data_migration\octets_per_season_groupby_after.csv"
)

In [38]:
octets_gb_after

Unnamed: 0,NAME,TOTAL_OCTETS
0,Cooper,63
1,Cole,9
2,Regan,5
3,Blake,3
4,Connor,3
5,Matt,3
6,Luke,1


In [39]:
octets_gb_before

Unnamed: 0,NAME,TOTAL_OCTETS
0,Cooper,63
1,Cole,9
2,Regan,5
3,Blake,3
4,Connor,3
5,Matt,3
6,Luke,1


In [40]:
octets_gb_after == octets_gb_before

Unnamed: 0,NAME,TOTAL_OCTETS
0,True,True
1,True,True
2,True,True
3,True,True
4,True,True
5,True,True
6,True,True


In [41]:
octets_per_season_gb_after == octets_per_season_gb_before

Unnamed: 0,NAME,SEASON,TOTAL_OCTETS,SUID_LIST,AVERAGE_WINS_PER_OCTET,MEDIAN_WINS_PER_OCTET,MAX_WINS
0,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True
6,True,True,True,True,True,True,True
7,True,True,True,True,True,True,True
8,True,True,True,True,True,True,True
9,True,True,True,True,True,True,True
