The data comes from Kaggle:
- https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists?select=sample_submission.csv

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Set IPython's Completer.use_jedi option to False.
%config Completer.use_jedi=False

In [2]:
import os
from urllib.parse import urlparse

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, classification_report)
import mlflow
import mlflow.sklearn

In [4]:
# Where MLflow stores runs. A non-empty MLFLOW_TRACKING_URI environment
# variable takes precedence so the notebook can run on another machine;
# otherwise the original local file store is used as the fallback.
DEFAULT_TRACKING_URI = "file:///Users/ahmed.besbes/projects/mlflow/mlruns"
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI") or DEFAULT_TRACKING_URI)

In [5]:
# Read back the tracking URI that MLflow currently reports.
tracking_uri = mlflow.get_tracking_uri()

In [6]:
# Bare expression: shown as the cell's output.
tracking_uri

'file:///Users/ahmed.besbes/projects/mlflow/mlruns'

In [7]:
# Load the training CSV, set the label column aside as its own one-column
# frame, then remove the identifier and the label from the feature frame.
data = pd.read_csv("./../data/aug_train.csv")
targets = data.loc[:, ["target"]]
data = data.drop(columns=["enrollee_id", "target"])

In [8]:
# Partition the columns by dtype name: float64/int64 columns are numerical,
# every other column is treated as categorical. Column order is preserved.
numeric_dtype_names = ["float64", "int64"]

numerical_features = [
    column for column in data.columns
    if str(data[column].dtype) in numeric_dtype_names
]
categorical_features = [
    column for column in data.columns
    if str(data[column].dtype) not in numeric_dtype_names
]

In [9]:
# Replace missing values in every categorical column with the literal
# 'missing' so it becomes its own category.
# The result is assigned back to the frame rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form
# operates on an intermediate object and does not update `data` when pandas
# Copy-on-Write is active.
for categorical_feature in categorical_features:
    data[categorical_feature] = data[categorical_feature].fillna('missing')

In [10]:
# Turn each categorical column into integer codes, fitting a fresh
# LabelEncoder per column. Only the most recent encoder stays bound to `le`.
for categorical_feature in categorical_features:
    le = LabelEncoder()
    encoded_column = le.fit_transform(data[categorical_feature])
    data[categorical_feature] = encoded_column

In [11]:
# Hold out 30% of the rows for evaluation. The split is seeded and stratified
# on the target so both parts keep the same class proportions.
split_options = dict(test_size=0.3, random_state=2021, stratify=targets.values)
x_train, x_test, y_train, y_test = train_test_split(
    data.values,
    targets.values.ravel(),
    **split_options,
)

In [12]:
# Sanity check: shapes of the train and test feature arrays.
print(x_train.shape, x_test.shape)

(13410, 12) (5748, 12)


In [13]:
# Sanity check: shapes of the train and test label arrays.
print(y_train.shape, y_test.shape)

(13410,) (5748,)


In [18]:
# Train a logistic regression inside a single MLflow run and log its
# hyperparameters, evaluation metrics and the fitted model.
with mlflow.start_run():
    # Hyperparameters of this run (also logged below).
    class_weight = "balanced"
    max_iter = 1000

    logistic_regression = LogisticRegression(class_weight=class_weight, max_iter=max_iter)
    logistic_regression.fit(x_train, y_train)

    # Hard class labels feed the threshold-dependent metrics.
    y_pred = logistic_regression.predict(x_test)
    # ROC AUC measures ranking quality and needs continuous scores; with hard
    # labels the curve collapses to a single operating point. Column 1 of
    # predict_proba is the probability of classes_[1], i.e. the larger label,
    # which roc_auc_score treats as the positive class.
    y_score = logistic_regression.predict_proba(x_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_score)

    mlflow.log_param("class_weight", class_weight)
    mlflow.log_param("max_iter", max_iter)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1", f1)
    mlflow.log_metric("auc", auc)

    # Store the fitted estimator as a run artifact under "model".
    mlflow.sklearn.log_model(logistic_regression, "model")