In [1]:
import pandas as pd 
import numpy as np 
import xgboost as xgb
import mlflow
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split



In [2]:
# Load the training table and prepare the train/test split for the baseline.
DATA_PATH = "../data/training_data/training_data.csv"
TEST_FRACTION = 0.33
SPLIT_SEED = 4284

# Every column is cast to float32; this assumes the CSV is entirely numeric
# (astype raises otherwise). To preview the frame, put `df.head()` as the last
# line of its own cell -- a bare expression mid-cell is never rendered.
df = pd.read_csv(DATA_PATH).astype(np.float32)

# The last column is the target; all preceding columns are features.
X = df.iloc[:, :-1]
Y = df.iloc[:, -1]

# Stratify on the target so both partitions keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_FRACTION, random_state=SPLIT_SEED, stratify=Y
)

# Route runs to the named experiment and enable MLflow autologging for XGBoost.
mlflow.set_experiment("Baseline_Predictions")
mlflow.xgboost.autolog()

# The test matrix is built without labels; y_test is kept separately for scoring.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

In [3]:
with mlflow.start_run(run_name='xgboost_model_baseline') as run:
    # Train with a logistic objective so predict() returns probabilities in
    # [0, 1]. XGBoost's default objective (reg:squarederror) yields unbounded
    # regression scores, for which the 0.5 cut-off below is not a probability
    # threshold. Assumes the label column is binary 0/1 -- f1_score's default
    # binary averaging requires that as well.
    model = xgb.train(params={"objective": "binary:logistic"}, dtrain=dtrain)
    preds = model.predict(dtest)
    # Convert predicted probabilities into hard 0/1 class labels.
    y_bin = np.where(preds > 0.5, 1, 0)
    f1 = f1_score(y_test, y_bin)
    # Logged explicitly under a custom key, inside the active run.
    mlflow.log_metric(key="f1_experiment_score", value=f1)