In [1]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import mlflow

In [2]:
# Filesystem locations used by the notebook, all resolved under the current user's
# home directory.
work_dir = Path.home().joinpath(
    'Programming/Python/machine-learning-exercises/higher-education-students-performance-evaluation'
)
# CSV that is read into a DataFrame further down.
data_file = work_dir.joinpath('data/students-performance.csv')
# JSON file that is parsed further down.
attribute_names_json_file = work_dir.joinpath('attribute_names.json')

In [3]:
# Parse the JSON file at attribute_names_json_file into attribute_names_json.
# The encoding is pinned to UTF-8 (the standard encoding for JSON text) so the file is
# decoded the same way on every platform instead of relying on the locale default.
with open(attribute_names_json_file, 'rt', encoding='utf-8') as f_in:
    attribute_names_json = json.load(f_in)

In [4]:
# Read the raw CSV into a DataFrame.
sp_df = pd.read_csv(data_file)

# Columns handed to the encoder: every column except the identifier and the target.
# They are selected by name rather than by the positional slice columns[1:-1], so the
# list stays in step with the name-based drop that builds X regardless of column order.
# Index.drop keeps the remaining columns in their original order.
cat_attribs = sp_df.columns.drop(['STUDENT ID', 'GRADE'])

In [5]:
# Configure MLflow before any run is started: record to the SQLite database mlflow.db
# and select the experiment that runs are logged under.
tracking_uri = 'sqlite:///mlflow.db'
experiment_name = 'higher-education-students-performance-evaluation'
mlflow.set_tracking_uri(tracking_uri)
# The trailing ';' keeps the call's return value out of the cell output.
mlflow.set_experiment(experiment_name);

In [6]:
# Separate the frame into the prediction target and the model inputs.
y = sp_df['GRADE'].copy()  # independent copy of the target column
X = sp_df.drop(columns=['STUDENT ID', 'GRADE'])  # everything except the id and the target

In [7]:
# Hold out 30% of the rows for testing. The split is stratified on y, and the fixed
# random_state makes it repeatable from run to run.
split_options = dict(test_size=0.3, stratify=y, random_state=33)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
# Give each of the four pieces a fresh default integer index; drop=True discards the
# old row labels instead of keeping them as a column.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in (X_train, X_test, y_train, y_test)
)

In [9]:
# Preprocessing: one-hot encode the columns named in cat_attribs.
# handle_unknown="ignore" means categories not seen while fitting do not raise an
# error when data is transformed later.
encoder = OneHotEncoder(handle_unknown="ignore")
cat_pipeline = make_pipeline(encoder)
preprocessing = ColumnTransformer(transformers=[("cat", cat_pipeline, cat_attribs)])

In [10]:
# Fit the preprocessing on the training features only and transform them in one step,
# then apply the already-fitted transformer to the test features so both sets are
# encoded the same way.
transformed_X_train = preprocessing.fit_transform(X_train)
transformed_X_test = preprocessing.transform(X_test)

In [11]:
# Other imported classifiers, currently disabled (commented out):
# dtc = DecisionTreeClassifier()
# rfc = RandomForestClassifier()
# sgc = SGDClassifier()
# svc = SVC()
# xgbc = XGBClassifier()

In [12]:
# Train and evaluate one logistic-regression model inside a single MLflow run, so the
# tag, the hyperparameter and the metric below are all recorded against that run.
with mlflow.start_run():

    mlflow.set_tag('developer', 'angelo')

    # Value passed to LogisticRegression's C parameter; logged alongside the metric.
    C = 0.01
    mlflow.log_param('C', C)

    log_reg = LogisticRegression(C=C)
    log_reg.fit(transformed_X_train, y_train)
    y_pred = log_reg.predict(transformed_X_test)

    # scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is the same
    # either way, but the documented order keeps this call consistent with the API.
    acc = accuracy_score(y_test, y_pred)
    mlflow.log_metric('accuracy', acc)