In [None]:
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from multielo import MultiElo, Tracker
from mktools.get_data import load_data_pd
from mktools.validate_data import validate_bad_uids
from mktools.form_data import fill_new_session
import plotly.express as px
from alive_progress import alive_it
from bs4 import BeautifulSoup
from typing import Literal

# Load variables from the local .env file into the process environment.
# NOTE(review): SHEET_ID, read through os.environ further down, is presumably
# supplied this way - confirm.
load_dotenv()

# LAST_FORM_DATA_DATE = pd.Timestamp('1/27/2025 23:40:20').tz_localize("US/Eastern").tz_convert("UTC")

# LAST_FORM_DATA_DATE

In [None]:
# Previously validated form data. The two session-window columns are removed
# on load; they are rebuilt later in the notebook before the file is rewritten.
# TIMESTAMP is parsed as UTC and then expressed in US/Eastern.
migrated_data = (
    pd.read_csv(
        r"C:\Users\Cooper\sandbox\mkstream\form_data_migration\form_data_valid.csv"
    )
    .drop(columns=["SUID_WINDOW_START", "SUID_WINDOW_END"])
    .assign(
        TIMESTAMP=lambda frame: pd.to_datetime(
            frame["TIMESTAMP"], utc=True
        ).dt.tz_convert("US/Eastern")
    )
)

In [None]:
# Scratch check: how a US/Eastern wall-clock time is rendered with its offset
pd.Timestamp("2024-07-09 08:40:00", tz="US/Eastern")

In [None]:
# Scratch check: how a US/Eastern wall-clock time is rendered with its offset
pd.Timestamp("2025-01-29 03:58:15", tz="US/Eastern")

In [None]:
# # Temporarily remove bad data; this filter can be removed later
# migrated_data = migrated_data[
#     ~migrated_data["TIMESTAMP"].isin(
#         [
#             pd.Timestamp("2024-07-09 08:40:00").tz_localize("US/Eastern"),
#             pd.Timestamp("2024-07-09 08:35:00").tz_localize("US/Eastern"),
#         ]
#     )
# ]

In [None]:
# Newest timestamp already present in the migrated data; later cells keep only
# sheet rows that are strictly after this cut-off.
LAST_FORM_DATA_DATE = migrated_data["TIMESTAMP"].max()

LAST_FORM_DATA_DATE

In [None]:
form_df = load_data_pd(sheet_name="form_data", sheet_id=os.environ["SHEET_ID"])

# Discard every column whose header mentions "Unnamed" or "Score"
unwanted_columns = [
    name for name in form_df.columns if "Unnamed" in name or "Score" in name
]
form_df = form_df.drop(columns=unwanted_columns)

# Parse the timestamps and label them as US/Eastern wall-clock times
parsed_timestamps = pd.to_datetime(form_df["Timestamp"])
form_df["Timestamp"] = parsed_timestamps.dt.tz_localize("US/Eastern")

# Only bracketed headers are rewritten: spaces become underscores and the
# brackets themselves are removed; all other headers are left untouched.
form_df.columns = [
    name.replace(" ", "_").replace("[", "").replace("]", "") if "[" in name else name
    for name in form_df.columns
]

In [None]:
# Keep only the submissions that are newer than the migrated data's cut-off
is_unprocessed = form_df["Timestamp"] > LAST_FORM_DATA_DATE
new_df = form_df.loc[is_unprocessed].reset_index(drop=True)

new_df

In [None]:
# Stop the run here when there are no new rows to process
if len(new_df.index) == 0:
    raise IndexError("No new records to update")

In [None]:
def transform_form_data(
    df: pd.DataFrame, game_type_replace_string: Literal["_2_", "_3_", "_4_"]
) -> pd.DataFrame:
    """Normalise the form rows of one game type to a shared column layout.

    Columns that contain no values at all are dropped, the game-type token
    (for example ``"_2_"``) inside a column name is collapsed to a single
    underscore, every column name is upper-cased, and ``TIMESTAMP`` is parsed
    as a datetime.

    Args:
        df: Form rows belonging to a single game type. The input is not
            modified.
        game_type_replace_string: Token embedded in the column names that
            belong to this game type.

    Returns:
        A new frame with the normalised columns, or an empty ``DataFrame``
        (no rows, no columns) when ``df`` has no rows.

    Raises:
        KeyError: If the frame has rows but no ``TIMESTAMP`` column after the
            column names have been normalised.
    """
    if df.shape[0] == 0:
        return pd.DataFrame()

    # dropna/reset_index both return new objects, so the caller's frame is safe
    out = df.dropna(axis=1, how="all").reset_index(drop=True)

    # Upper-case every name, including the ones that carried the game-type
    # token, so the result does not depend on the case used in the form headers.
    out.columns = [
        name.replace(game_type_replace_string, "_").upper() for name in out.columns
    ]

    out["TIMESTAMP"] = pd.to_datetime(out["TIMESTAMP"])

    return out

In [None]:
# Column layout of the combined frame, in output order.
CAT_COLUMNS = [
    "TIMESTAMP",
    "NEW_SESSION",
    "MAP",
    "PLAYERS",
    "PLAYERS_1ST",
    "PLAYERS_2ND",
    "PLAYERS_3RD",
    "PLAYERS_4TH",
    "CHARACTERS_1ST",
    "CHARACTERS_2ND",
    "CHARACTERS_3RD",
    "CHARACTERS_4TH",
]

# Split the new form rows by the number of players in the game
two_p, three_p, four_p = (
    new_df[new_df["PLAYERS"] == game_type].reset_index(drop=True)
    for game_type in (2, 3, 4)
)
dfs = [two_p, three_p, four_p]

two_p_out = transform_form_data(df=two_p, game_type_replace_string="_2_")
three_p_out = transform_form_data(df=three_p, game_type_replace_string="_3_")
four_p_out = transform_form_data(df=four_p, game_type_replace_string="_4_")

# transform_form_data drops columns that are entirely empty, so a batch with
# no 3- or 4-player games lacks the *_3RD / *_4TH columns altogether. reindex
# adds any missing column as all-NaN instead of raising KeyError, and yields
# the same frame as a plain selection when every column is present.
cat_df = (
    pd.concat([two_p_out, three_p_out, four_p_out])
    .sort_values(by="TIMESTAMP")
    .reset_index(drop=True)
    .reindex(columns=CAT_COLUMNS)
)

cat_df

In [None]:
season_initial = load_data_pd(
    sheet_name="data_main",
    sheet_id=os.environ["SHEET_ID"],
    usecols=["DATE", "SEASON"],
)

# Parse DATE and label it as US/Eastern so it is comparable with the cut-off
parsed_dates = pd.to_datetime(season_initial["DATE"])
season_initial["DATE"] = parsed_dates.dt.tz_localize("US/Eastern")

# One SEASON value per DATE, restricted to dates after the migrated cut-off;
# DATE is renamed so the result can be joined on TIMESTAMP.
is_after_cutoff = season_initial["DATE"] > LAST_FORM_DATA_DATE
season_by_date = season_initial.loc[is_after_cutoff].groupby(["DATE"])[["SEASON"]].first()
season_gb = season_by_date.reset_index().rename(columns={"DATE": "TIMESTAMP"})

season_gb.tail()

In [None]:
# Display the full season lookup
season_gb

In [None]:
# Attach SEASON to each form row by exact timestamp; validate="1:1" makes the
# merge fail if TIMESTAMP is duplicated on either side.
new_data_ready = cat_df.merge(season_gb, on="TIMESTAMP", how="inner", validate="1:1")

# An inner join discards unmatched rows, so confirm that none were lost
assert len(new_data_ready) == len(cat_df)

In [None]:
# Display the combined form rows (input side of the season merge)
cat_df

In [None]:
# Display the form rows after SEASON has been merged in
new_data_ready

In [None]:
# Last rows of the previously migrated data
migrated_data.tail()

In [None]:
# First rows of the new data that will be appended
new_data_ready.head()

In [None]:
# Append the new rows to the migrated history and order everything by time;
# ignore_index renumbers the rows 0..n-1 after sorting.
data_concat = pd.concat([migrated_data, new_data_ready]).sort_values(
    by="TIMESTAMP", ignore_index=True
)

data_concat

In [None]:
# Each game must have its own timestamp and the rows must be in time order
combined_timestamps = data_concat["TIMESTAMP"]
assert combined_timestamps.is_unique
assert combined_timestamps.is_monotonic_increasing

In [None]:
# fill_new_session is a project helper from mktools.form_data.
# NOTE(review): presumably it fills in the NEW_SESSION flag and produces the
# `window_start` column that the session-window cell reads later, which would
# be why drop_window_start_column=False is passed - confirm against the helper.
data_concat_filled = fill_new_session(
    df=data_concat, timestamp_column_name="TIMESTAMP", drop_window_start_column=False
)

# # Convert back to EST for Postgres
# data_concat_filled["TIMESTAMP"] = data_concat_filled["TIMESTAMP"].dt.tz_convert(
#     "US/Eastern"
# )

data_concat_filled

In [None]:
# Flag the rows that open a session (1/0) and take the running total of that
# flag as the session id; assign() works on a copy, so data_concat_filled is
# left unchanged.
new_session_df = data_concat_filled.assign(
    temp_session=lambda frame: np.where(frame["NEW_SESSION"] == "YES", 1, 0),
    SUID=lambda frame: frame["temp_session"].cumsum(),
)

In [None]:
# Display the frame with the temp_session flag and SUID columns added
new_session_df

In [None]:
temp_df = new_session_df.copy()

holder = []

# Walk the sessions in order of first appearance; every row of a session gets
# the window of that session's opening (NEW_SESSION == "YES") game.
for _, session_rows in temp_df.groupby("SUID", sort=False):

    temp = session_rows.copy()

    opening_game = temp.loc[temp["NEW_SESSION"] == "YES"].reset_index(drop=True)
    session_start = opening_game.loc[0, "window_start"]

    temp["window_start"] = session_start
    # The window ends one nanosecond before a full day has elapsed
    temp["window_end"] = (
        session_start + pd.Timedelta(days=1) - pd.Timedelta(nanoseconds=1)
    )

    holder.append(temp)

temp_out = pd.concat(holder).sort_values(by=["TIMESTAMP"]).reset_index(drop=True)


temp_out.tail(20)

In [None]:
# Columns written to the CSV, in output order (window columns still carry
# their working names at this point).
output_columns = [
    "TIMESTAMP",
    "NEW_SESSION",
    "SUID",
    "MAP",
    "PLAYERS",
    "PLAYERS_1ST",
    "PLAYERS_2ND",
    "PLAYERS_3RD",
    "PLAYERS_4TH",
    "CHARACTERS_1ST",
    "CHARACTERS_2ND",
    "CHARACTERS_3RD",
    "CHARACTERS_4TH",
    "SEASON",
    "window_start",
    "window_end",
]

# Select first, then give the window columns their stored names
window_column_names = {
    "window_start": "SUID_WINDOW_START",
    "window_end": "SUID_WINDOW_END",
}
out_df = temp_out.loc[:, output_columns].rename(columns=window_column_names)

out_df

In [None]:
# Last 20 rows of the output frame
out_df.tail(20)

In [None]:
# Rows of the output whose timestamp belongs to one of the new submissions
is_new_record = out_df["TIMESTAMP"].isin(new_df["Timestamp"])
new_records = out_df.loc[is_new_record].reset_index(drop=True)

new_records

In [None]:
# Write the full history first (this overwrites the file that was read at the
# start of the notebook), then the subset containing only the new records.
for frame, csv_path in (
    (
        out_df,
        r"C:\Users\Cooper\sandbox\mkstream\form_data_migration\form_data_valid.csv",
    ),
    (
        new_records,
        r"C:\Users\Cooper\sandbox\mkstream\form_data_migration\form_data_valid_new_records.csv",
    ),
):
    frame.to_csv(csv_path, index=False)