In [1]:
%load_ext autoreload
%autoreload 2

import itertools
import json

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

# Config

In [3]:
# Step up one directory so the relative paths used below ("params.yaml", "src/...")
# resolve — presumably this lands in the project root; confirm against the printed path.
# NOTE(review): not idempotent — re-running this cell climbs one more level each time.
%cd ..

/Users/zulikahlatief/Desktop/personal/Iterative-DVC-course/course-ds-base


In [4]:
# Parse the pipeline parameters from params.yaml into `config`
with open("params.yaml") as params_stream:
    config = yaml.safe_load(params_stream)

# Echo the parsed parameters so the run records what it used
print(config)

{'base': {'random_state': 42}, 'data': {'dataset_csv': 'data/raw/iris.csv', 'features_path': 'data/processed/featured_iris.csv'}, 'train': {'trainset_path': 'data/processed/train_iris.csv'}, 'test': {'test_size': 0.2, 'testset_path': 'data/processed/test_iris.csv'}, 'model': {'clf_params': {'C': 0.01, 'solver': 'lbfgs', 'multi_class': 'multinomial', 'max_iter': 10}, 'model_path': 'models/model.joblib'}, 'reports': {'metrics_file': 'reports/metrics.json', 'confusion_matrix_image': 'reports/confusion_matrix.png'}}


# Load dataset

In [6]:
# Imported here rather than in the top cell: `src` is looked up relative to the
# working directory, which is only changed by the `%cd ..` cell above.
from src.stages.data_load import data_load

# Run the project's data-load stage with the same parameter file
data_load(config_path="params.yaml")

Data load complete!


In [7]:
# Run the same data-load stage through its command-line entry point, from inside the notebook
!python src/stages/data_load.py --config=params.yaml

Data load complete!


# Feature engineering

In [None]:
# Read the raw table back from disk so this cell works on a fresh kernel.
# Assumes the data_load stage above wrote it to config["data"]["dataset_csv"]
# — TODO confirm against src/stages/data_load.py.
raw_dataset = pd.read_csv(config["data"]["dataset_csv"])

# Add the two ratio features on a new frame; the raw frame is left untouched so
# re-running this cell always starts from the same input.
featured_dataset = raw_dataset.assign(
    sepal_length_to_sepal_width=raw_dataset['sepal_length'] / raw_dataset['sepal_width'],
    petal_length_to_petal_width=raw_dataset['petal_length'] / raw_dataset['petal_width'],
)

# Keep the four measurements, the two ratios and the label, in a fixed column order
dataset = featured_dataset[[
    'sepal_length', 'sepal_width', 'petal_length', 'petal_width',
    'sepal_length_to_sepal_width', 'petal_length_to_petal_width',
    'target'
]]

In [None]:
# Preview the first rows of the feature table
dataset.head(5)

In [None]:
# Persist the feature table as CSV, leaving the row index out of the file
features_path = config["data"]["features_path"]
dataset.to_csv(features_path, index=False)

# Split dataset

In [None]:
# Split size and seed are read from params.yaml rather than hard-coded
test_size = config["test"]["test_size"]
random_state = config["base"]["random_state"]

train_dataset, test_dataset = train_test_split(
    dataset,
    test_size=test_size,
    random_state=random_state,
)

# Show the shape of each partition
train_dataset.shape, test_dataset.shape

In [None]:
# Save train and test sets.
# index=False keeps the row index out of the files, the same way the feature
# table is written; without it each CSV gains an extra unnamed column that a
# later reader would treat as a feature.
trainset_path = config["train"]["trainset_path"]
testset_path = config["test"]["testset_path"]

train_dataset.to_csv(trainset_path, index=False)
test_dataset.to_csv(testset_path, index=False)

# Train

In [None]:
# Separate the label column from the remaining columns and cast both to fixed dtypes
y_train = train_dataset['target'].to_numpy().astype('int32')
X_train = train_dataset.drop(columns='target').to_numpy().astype('float32')

In [None]:
# Build a LogisticRegression classifier from the params.yaml hyperparameters and fit it
clf_params = config["model"]["clf_params"]
logreg = LogisticRegression(random_state=config["base"]["random_state"], **clf_params)
logreg.fit(X_train, y_train)

In [None]:
# Serialize the fitted classifier to the path configured in params.yaml
model_path = config["model"]["model_path"]
joblib.dump(value=logreg, filename=model_path)

# Evaluate

In [None]:
# Project-local helper used below to draw the confusion matrix.
# NOTE(review): "src.repot" looks like a typo for "src.report" — confirm the package name on disk.
from src.repot.visualization import plot_confusion_matrix

In [None]:
# Separate the label column from the remaining columns of the held-out set, with the same dtypes as for training
y_test = test_dataset['target'].to_numpy().astype('int32')
X_test = test_dataset.drop(columns='target').to_numpy().astype('float32')

In [None]:
# Predict on the held-out set
prediction = logreg.predict(X_test)

# scikit-learn's confusion_matrix expects (y_true, y_pred); passing them by
# keyword keeps rows = true class and columns = predicted class.
cm = confusion_matrix(y_true=y_test, y_pred=prediction)

# Macro-averaged F1
f1 = f1_score(y_true=y_test, y_pred=prediction, average='macro')

In [None]:
# Display the macro-averaged F1 score computed on the test set
f1

In [None]:
# Write the evaluation metrics to the JSON report file named in params.yaml
metrics_file = config["reports"]["metrics_file"]
metrics = {'f1': f1}

with open(metrics_file, 'w') as report_fp:
    json.dump(metrics, report_fp, indent=4)

In [None]:
# Class labels for the plot. `data` was never defined in this notebook, so the
# names are taken from scikit-learn's iris dataset instead — assumes the pipeline
# data is that dataset (the config points at iris.csv); TODO confirm.
target_names = load_iris().target_names
cm_plot = plot_confusion_matrix(cm, target_names, normalize=False)

In [None]:
# Save the confusion-matrix figure to the image path configured in params.yaml
confusion_matrix_image = config["reports"]["confusion_matrix_image"]
cm_plot.savefig(confusion_matrix_image)