In [1]:
# Load the Kedro IPython extension before touching the data catalog.
%load_ext kedro.ipython
import pandas as pd
# NOTE(review): `catalog` is never defined in this notebook; presumably it is
# injected by the kedro.ipython extension loaded above -- confirm.
raw_data = catalog.load("dataset_kobe_dev")
# Preview the first rows; as the last expression of the cell it is rendered as the output.
raw_data.head()


Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,...,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5


In [2]:
# Count the missing values in each column of the raw dataset.
raw_data.isnull().sum()



action_type              0
combined_shot_type       0
game_event_id            0
game_id                  0
lat                      0
loc_x                    0
loc_y                    0
lon                      0
minutes_remaining        0
period                   0
playoffs                 0
season                   0
seconds_remaining        0
shot_distance            0
shot_made_flag        3986
shot_type                0
shot_zone_area           0
shot_zone_basic          0
shot_zone_range          0
team_id                  0
team_name                0
game_date                0
matchup                  0
opponent                 0
shot_id                  0
dtype: int64

In [3]:
# Keep only the rows whose target column (`shot_made_flag`) is present.
# NOTE(review): the following cell rebuilds `df` from `raw_data`, so this
# result is overwritten there before it is used.
df = raw_data.dropna(subset=["shot_made_flag"])

In [4]:
# Columns kept for modelling; the last entry (`shot_made_flag`) is the target.
colunas = [
    "lat",
    "lon",
    "minutes_remaining",
    "period",
    "playoffs",
    "shot_distance",
    "shot_made_flag",
]

# Project onto the selected columns and drop every row that has a missing
# value in any of them, in one chain built from the untouched `raw_data`.
df = raw_data[colunas].dropna(subset=colunas)

# A bare `df.head()` used to sit here. Only the last expression of a cell is
# rendered, so that mid-cell call was evaluated and discarded; it was removed.
print("Dimensão final (linhas, colunas):", df.shape)
print(df.isnull().sum())


Dimensão final (linhas, colunas): (20285, 7)
lat                  0
lon                  0
minutes_remaining    0
period               0
playoffs             0
shot_distance        0
shot_made_flag       0
dtype: int64


In [5]:
# Report the (rows, columns) shape of the cleaned frame `df`.
print("Dimensão final:", df.shape)

Dimensão final: (20285, 7)


In [6]:
import mlflow
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

def pipeline_preparacao_dados(
    input_path_dev: str = "../data/01_raw/dataset_kobe_dev.parquet",
    input_path_prod: str = "../data/01_raw/dataset_kobe_prod.parquet",
    data_filtered_path: str = "../data/02_intermediate/data_filtered.parquet",
    train_path: str = "../data/processed/base_train.parquet",
    test_path: str = "../data/processed/base_test.parquet",
    test_size: float = 0.2,
    random_state: int = 42
) -> None:
    """Filter the dev dataset, split it into train/test and track the run in MLflow.

    Steps, all executed inside one MLflow run named ``PreparacaoDados``:

    1. Read the parquet file at ``input_path_dev``.
    2. Keep a fixed list of feature columns plus the ``shot_made_flag`` target
       and drop every row with a missing value in any of them.
    3. Write the filtered frame to ``data_filtered_path``.
    4. Split the filtered frame with a split stratified on the target and
       write both parts to ``train_path`` and ``test_path``.
    5. Log parameters and row/column/proportion metrics to MLflow and print a
       short summary.

    Args:
        input_path_dev: Parquet file holding the development dataset.
        input_path_prod: Accepted for interface compatibility; this function
            never reads it.
        data_filtered_path: Destination of the filtered parquet file.
        train_path: Destination of the training parquet file.
        test_path: Destination of the test parquet file.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        random_state: Passed to ``train_test_split`` as ``random_state``.

    Returns:
        None. Results are written to disk, logged to MLflow and printed.

    Raises:
        KeyError: If the input frame lacks one of the selected columns.
    """
    target = "shot_made_flag"
    colunas = [
        "lat",
        "lon",
        "minutes_remaining",
        "period",
        "playoffs",
        "shot_distance",
        target,
    ]

    with mlflow.start_run(run_name="PreparacaoDados"):
        mlflow.set_tag("pipeline", "preparacaodados")

        # Log everything that determines the split before any work is done, so
        # the run stays reproducible even if a later step raises.
        mlflow.log_param("test_size", test_size)
        mlflow.log_param("random_state", random_state)

        # 1) Load the dev dataset.
        df_raw_dev = pd.read_parquet(input_path_dev)

        # 2) Keep the selected columns and drop rows with nulls in any of them.
        df_filtered = df_raw_dev[colunas].dropna(subset=colunas)

        # 3) Persist the filtered dataset, creating the folder when needed.
        Path(data_filtered_path).parent.mkdir(parents=True, exist_ok=True)
        df_filtered.to_parquet(data_filtered_path, index=False)

        mlflow.log_metric("linhas_filtradas", df_filtered.shape[0])
        mlflow.log_metric("colunas_filtradas", df_filtered.shape[1])

        # 4) Stratified split. Splitting the whole frame selects the same rows
        #    as splitting X and y separately and re-attaching the target, and
        #    the target is already the last column of `colunas`.
        df_train, df_test = train_test_split(
            df_filtered,
            test_size=test_size,
            random_state=random_state,
            stratify=df_filtered[target],
        )

        # 5) Persist both parts.
        Path(train_path).parent.mkdir(parents=True, exist_ok=True)
        Path(test_path).parent.mkdir(parents=True, exist_ok=True)

        df_train.to_parquet(train_path, index=False)
        df_test.to_parquet(test_path, index=False)

        # 6) Log split sizes and the class balance of the training part.
        mlflow.log_metric("train_rows", df_train.shape[0])
        mlflow.log_metric("test_rows", df_test.shape[0])

        proporcao = df_train[target].value_counts(normalize=True).to_dict()
        mlflow.log_metric("prop_0_train", proporcao.get(0.0, 0))
        mlflow.log_metric("prop_1_train", proporcao.get(1.0, 0))

        # 7) Human-readable summary.
        print("[INFO] Pipeline PreparacaoDados finalizado.")
        print(" - data_filtered:", df_filtered.shape)
        print(" - base_train:", df_train.shape, "->", train_path)
        print(" - base_test :", df_test.shape,  "->", test_path)
        print("✔️  Dados preparados com sucesso e registrados no MLflow.")


# Run manually from the notebook (paths adjusted to the location of the .ipynb).
# NOTE(review): every argument below equals the default declared in the
# function signature.
pipeline_preparacao_dados(
    input_path_dev="../data/01_raw/dataset_kobe_dev.parquet",
    input_path_prod="../data/01_raw/dataset_kobe_prod.parquet",
    data_filtered_path="../data/02_intermediate/data_filtered.parquet",
    train_path="../data/processed/base_train.parquet",
    test_path="../data/processed/base_test.parquet",
    test_size=0.2
)


[INFO] Pipeline PreparacaoDados finalizado.
 - data_filtered: (20285, 7)
 - base_train: (16228, 7) -> ../data/processed/base_train.parquet
 - base_test : (4057, 7) -> ../data/processed/base_test.parquet
✔️  Dados preparados com sucesso e registrados no MLflow.
