In [2]:
import pandas as pd 
import numpy as np 
import mlflow
from sklearn.metrics import f1_score, confusion_matrix 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb

In [3]:
# Pima Indians diabetes CSV. The file carries no header row, so header=None
# makes pandas label the columns with positional integers (0..8).
DATA_URL = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'

df = pd.read_csv(DATA_URL, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Every column but the last is a feature; the last column is the label.
X, Y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 33% of the rows for testing. The split is stratified on the label so
# both parts keep the same class balance, and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=0,
    stratify=Y,
)

# Learn the min/max scaling from the training rows only, then apply that same
# scaling to the test rows so no test information leaks into the fit.
mx = MinMaxScaler()
X_train, X_test = mx.fit_transform(X_train), mx.transform(X_test)

In [7]:
# Turn on XGBoost autologging and select the experiment that runs are filed under.
# NOTE(review): autolog is enabled *before* the DMatrix objects are built, as in
# the original cell; MLflow's XGBoost autologging may hook DMatrix construction,
# so keep this ordering -- confirm against the MLflow docs before moving it.
mlflow.xgboost.autolog()
mlflow.set_experiment("Baseline predictions")

# XGBoost's native data containers. The test matrix is built without labels
# because it is only used for prediction; y_test is compared separately.
dtest = xgb.DMatrix(X_test)
dtrain = xgb.DMatrix(X_train, label=y_train)

In [8]:
# Train the baseline booster inside a named MLflow run, score it on the held-out
# split, and log the F1 score plus the notebook itself to that run.
with mlflow.start_run(run_name="xgboost_baseline") as run:
    # The target column holds 0/1 labels, so train with the logistic objective.
    # With empty params XGBoost falls back to its default squared-error
    # regression objective, whose raw outputs are not probabilities and make
    # the 0.5 cut-off below arbitrary.
    model = xgb.train(params={"objective": "binary:logistic"}, dtrain=dtrain)

    # With binary:logistic, predict() returns the positive-class probability.
    preds = model.predict(dtest)
    y_pred = np.where(preds > 0.5, 1, 0)

    f1 = f1_score(y_test, y_pred)
    # NOTE(review): the metric key is misspelled ("experiement"); it is left
    # unchanged here so this run stays comparable with runs already logged
    # under the same key -- rename deliberately if that is not a concern.
    mlflow.log_metric(key="f1_experiement_score", value=f1)
    # Relative path: resolved against the kernel's working directory.
    mlflow.log_artifact("xgboost.ipynb")
    