# Feast + MLflow

In [26]:
import pandas as pd
from feast import FeatureStore
import mlflow
import dagshub

import random
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


from sklearn.metrics import accuracy_score, f1_score

# Initialise the DagsHub client for the course repository. mlflow=True asks
# DagsHub to set up its MLflow integration as well (presumably this points
# MLflow tracking at the repository -- confirm against the dagshub docs).
dagshub.init(
    repo_owner="abdala9512",
    repo_name="dsrp-machine-learning-engineering-2",
    mlflow=True,
)

In [2]:
# Open the Feast feature store from its repo directory (path is relative to
# the notebook's working directory).
feature_repo_path = "../feast_service/fs_dsrp_mle2_jul9/feature_repo/"
fs = FeatureStore(feature_repo_path)

In [24]:
# Build the entity dataframe passed to Feast: one row per booking_id plus two
# synthetic KPI columns and the timestamp at which features are requested.
MAX_ENTITIES = 1000   # upper bound on how many bookings are sampled
ENTITY_SEED = 100     # fixed seed so the synthetic KPIs are reproducible
BOOKINGS_PARQUET = "../feast_service/fs_dsrp_mle2_jul9/feature_repo/data/bookings_feature_table.parquet"

# Only the join key is needed, so read just that column.
booking_ids = (
    pd.read_parquet(BOOKINGS_PARQUET, columns=["booking_id"])["booking_id"]
    .head(MAX_ENTITIES)
    .tolist()
)

# Size every column from the ids actually loaded: the source table may hold
# fewer than MAX_ENTITIES rows, and mismatched column lengths raise ValueError.
n_entities = len(booking_ids)
kpi_rng = np.random.default_rng(ENTITY_SEED)

entity_df = pd.DataFrame(
    {
        # entity's join key -> entity values
        "booking_id": booking_ids,
        # synthetic standard-normal values (mean 0, std 1)
        "kpi1": kpi_rng.normal(loc=0.0, size=n_entities),
        "kpi2": kpi_rng.normal(loc=0.0, size=n_entities),
    }
)
# Every row shares the same UTC "now" timestamp.
entity_df["event_timestamp"] = pd.to_datetime("now", utc=True)
entity_df


Unnamed: 0,booking_id,kpi1,kpi2,event_timestamp
0,00bb4fea-525b-4757-9f1d-540f131dd5c3,0.769260,-0.283067,2025-07-31 02:22:41.792477+00:00
1,1236fc59-d92c-4227-8fe8-41f464c5565c,0.278091,-0.195238,2025-07-31 02:22:41.792477+00:00
2,95f636b8-cbcd-411c-b1fb-4eecebb0e725,1.231688,0.234945,2025-07-31 02:22:41.792477+00:00
3,eaed376f-bc93-45aa-b173-3aaa028347c8,0.540526,1.898306,2025-07-31 02:22:41.792477+00:00
4,1629e178-d66f-4fef-a67b-327387aa2e1a,-0.722427,-2.942865,2025-07-31 02:22:41.792477+00:00
...,...,...,...,...
995,9bb8ed19-d187-4fcd-bdc1-b9a74a31ad95,-0.117430,-1.994649,2025-07-31 02:22:41.792477+00:00
996,2de7189e-3638-4961-b5e7-decabf9691a7,-0.319549,-0.754413,2025-07-31 02:22:41.792477+00:00
997,901bb81b-18af-4709-b39e-4157b0c44c0d,-1.359143,0.872779,2025-07-31 02:22:41.792477+00:00
998,0efa9a60-0a6f-4d5f-b6c3-2e006584c18c,0.331983,-1.046922,2025-07-31 02:22:41.792477+00:00


In [29]:
# Retrieve the requested features from the feature store for every row of
# entity_df and materialise the result as a pandas DataFrame.
feature_table = fs.get_historical_features(
    entity_df=entity_df,
    features=[
        "pc_booking_view:great_feature1",
        "pc_booking_view:great_feature2",
        "great_feature_view:great_feature1_kpi1",
        "great_feature_view:great_feature2_kpi2",
    ],
).to_df()

# Synthetic binary label for the demo. It is sized from the rows actually
# returned (a hard-coded count raises ValueError on any mismatch) and drawn
# from a seeded local generator so re-runs yield the same labels without
# touching the global `random` state.
target_rng = random.Random(100)
feature_table["target"] = [
    target_rng.choice([0, 1]) for _ in range(len(feature_table))
]
feature_table

Unnamed: 0,booking_id,kpi1,kpi2,event_timestamp,great_feature1,great_feature2,great_feature1_kpi1,great_feature2_kpi2,target
0,00bb4fea-525b-4757-9f1d-540f131dd5c3,0.769260,-0.283067,2025-07-31 02:22:41.792477+00:00,0.510776,0.310417,0.392920,-0.087869,1
1,1236fc59-d92c-4227-8fe8-41f464c5565c,0.278091,-0.195238,2025-07-31 02:22:41.792477+00:00,0.454727,-0.159782,0.126456,0.031196,0
2,95f636b8-cbcd-411c-b1fb-4eecebb0e725,1.231688,0.234945,2025-07-31 02:22:41.792477+00:00,-0.900479,-0.999280,-1.109109,-0.234776,0
3,eaed376f-bc93-45aa-b173-3aaa028347c8,0.540526,1.898306,2025-07-31 02:22:41.792477+00:00,-0.431781,0.680112,-0.233389,1.291061,1
4,1629e178-d66f-4fef-a67b-327387aa2e1a,-0.722427,-2.942865,2025-07-31 02:22:41.792477+00:00,-0.131863,-0.149217,0.095261,0.439125,0
...,...,...,...,...,...,...,...,...,...
995,9b685c47-ac50-4e85-acd2-6492395e6b59,0.571517,-1.061881,2025-07-31 02:22:41.792477+00:00,0.100255,-0.217351,0.057297,0.230801,0
996,901bb81b-18af-4709-b39e-4157b0c44c0d,-1.359143,0.872779,2025-07-31 02:22:41.792477+00:00,0.405005,-0.239488,-0.550460,-0.209020,1
997,0efa9a60-0a6f-4d5f-b6c3-2e006584c18c,0.331983,-1.046922,2025-07-31 02:22:41.792477+00:00,-0.118608,0.099774,-0.039376,-0.104456,1
998,2de7189e-3638-4961-b5e7-decabf9691a7,-0.319549,-0.754413,2025-07-31 02:22:41.792477+00:00,-0.087687,0.155210,0.028020,-0.117092,1


## Preprocesamiento

In [30]:
# Split the feature table into model inputs (X) and label (y).
# booking_id (the join key) and event_timestamp are lookup metadata, not
# predictors; leaving the string ids in X would make any numeric estimator
# (e.g. DecisionTreeClassifier) fail at fit time.
non_feature_columns = ["booking_id", "event_timestamp", "target"]
X = feature_table.drop(columns=non_feature_columns)
y = feature_table["target"]

# Fixed random_state keeps the train/test partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

## MLflow

In [14]:
# Send MLflow runs to the tracking endpoint of the DagsHub repository.
tracking_uri = "https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow"
mlflow.set_tracking_uri(tracking_uri)

In [31]:
# Create the experiment only if it does not exist yet: create_experiment
# raises when the name is already taken, which would break a re-run of the
# notebook. Either way the cell displays the experiment id.
experiment_name = "DSRP - Booking Clase 30 Julio - Feast + MLFLOW  1000"
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = existing_experiment.experiment_id
experiment_id

'5'

In [34]:
# Make this experiment the active one for subsequent runs; the returned
# Experiment object is the cell's displayed output.
active_experiment_name = "DSRP - Booking Clase 30 Julio - Feast + MLFLOW  1000"
mlflow.set_experiment(active_experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/cdf0a99c5c4a452a8b6c4dcbb936f8af', creation_time=1753928635369, experiment_id='5', last_update_time=1753928635369, lifecycle_stage='active', name='DSRP - Booking Clase 30 Julio - Feast + MLFLOW  1000', tags={}>

In [36]:
# Baseline run: fit a DummyClassifier and log its test metrics to MLflow.
# Autologging is enabled (with model logging) before the run starts.
mlflow.autolog(log_models=True)
with mlflow.start_run(run_name="Baseline - Dummy Classifier - Con MAS métricas") as run:

    algorithm = DummyClassifier()
    algorithm.fit(X_train, y_train)

    predictions = algorithm.predict(X_test)

    _accuracy_score = accuracy_score(y_test, predictions)
    # A constant-prediction baseline may never predict the positive class,
    # leaving F1 undefined. zero_division=0 makes the 0.0 fallback explicit
    # and avoids the UndefinedMetricWarning from this call.
    _f1_score = f1_score(y_test, predictions, zero_division=0)

    # Metrics logged explicitly, in addition to whatever autolog records.
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score,
            # Fixed value -- presumably a demo of logging a custom metric.
            "metrica_dsrp": 100,
        }
    )

2025/07/30 21:29:07 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


🏃 View run Baseline - Dummy Classifier - Con MAS métricas at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/5/runs/be68aa8d06764033ade5c4383c002e28
🧪 View experiment at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/5
