# Model Training

## Reading Data

In [2]:
import pandas as pd

In [6]:
# Location of the raw Iris data file in the UCI repository.
IRIS_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# Column labels to apply, in file order: four measurements, then the class.
col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'iris_class']

# header=None: no line of the file is treated as a header and the labels come
# from `names` (this is also what pandas assumes when only `names` is given).
df = pd.read_csv(IRIS_URL, header=None, names=col_names)

In [7]:
# Preview the first five rows (the default for head) to check the column labels were applied.
df.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,iris_class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# Model Training and Evaluation

* Here are the steps for model training and evaluation
    * Creation of features and label datasets
    * XGBoost is the chosen model: not only does it generally perform well, it also handles _null_ feature values, so I don't need to worry about a missing feature value.
    * 5-fold cross-validation, for a better estimate of performance in a production environment
    * ACCURACY value reported

In [13]:
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [9]:
# Names of the four measurement columns used as model inputs.
features = ['sepal_length','sepal_width','petal_length','petal_width']
# Name of the target column (kept as a one-element list, as before).
label = ['iris_class']

# Feature matrix: one column per entry in `features`.
X = df[features]
# Select the target column by name so that `y` is a 1-D Series. Indexing with
# the list itself (`df[label]`) yields a one-column DataFrame, i.e. a column
# vector, which scikit-learn-style estimators typically accept only with a
# DataConversionWarning because they expect a 1-D target.
y = df[label[0]]

In [12]:
# Same hyper-parameters as the model fitted on the full data later in the notebook.
model = xgboost.XGBClassifier(max_depth=3, n_estimators=500)

# Shuffled 5-fold split; the fixed random_state keeps the fold assignment repeatable.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# One score per fold. No `scoring` argument is given, so the estimator's
# default scorer is what gets reported.
results = cross_val_score(estimator=model, X=X, y=y, cv=kfold)

print(results)
# Mean score across folds, with the standard deviation in parentheses.
print("ACCURACY: %.2f (%.2f)" % (results.mean(), results.std()))

[1.         0.96666667 0.93333333 0.93333333 0.93333333]
ACCURACY: 0.95 (0.03)


# Model training for production

* For production, I used all data to train the final model
* The model is uploaded to a Google Cloud Storage bucket so that it can be fetched in the production environment. This is a simple solution; in the real world we would need a model versioning framework (such as MLflow).
* This model is a toy example built from a toy dataset; creating a model for a real problem would take much more effort and care.

In [15]:
# The final model is fitted on plain NumPy arrays; the prediction example
# later in the notebook also passes a bare array.
X_array = X.to_numpy()
y_array = y.to_numpy()

# Fresh estimator with the same hyper-parameters used for cross-validation,
# this time fitted on every available row.
model = xgboost.XGBClassifier(max_depth=3, n_estimators=500)
model.fit(X_array, y_array)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [19]:
import os
import joblib
import tempfile
from google.cloud import storage

# Google Cloud project and bucket used for the model artifacts.
GCP_PROJECT = 'via-varejo-mlops'
ARTIFACT_BUCKET = 'iris-model-artifacts'

client = storage.Client(project=GCP_PROJECT)
# Handle to the bucket; the upload and download cells below go through it.
bucket = client.get_bucket(ARTIFACT_BUCKET)



In [21]:
# Write both artifacts into a throwaway directory and upload them to the
# bucket; the directory and its files are removed when the `with` block exits.
with tempfile.TemporaryDirectory() as tmpdirname:
    # 1) The fitted model, serialized with joblib.
    model_path = os.path.join(tmpdirname, 'model.joblib')
    joblib.dump(model, model_path)
    bucket.blob('model.joblib').upload_from_filename(model_path)

    # 2) The dataset the model was trained on, stored alongside the model.
    dataset_path = os.path.join(tmpdirname, 'model_dataset.csv')
    # to_csv returns None when given a path, so its result is not kept.
    df.to_csv(dataset_path, index=False)
    bucket.blob('model_dataset.csv').upload_from_filename(dataset_path)

In [22]:
# Round-trip check: fetch the uploaded artifact and load it back.
# The same name is used for the remote object and the local file, which is
# written to the current working directory.
MODEL_FILENAME = 'model.joblib'

blob = bucket.blob(MODEL_FILENAME)
blob.download_to_filename(MODEL_FILENAME)

# NOTE(review): joblib.load is pickle-based, so it should only be pointed at
# artifacts from a trusted bucket.
model = joblib.load(MODEL_FILENAME)
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [23]:
import numpy as np
# One observation as a (1, 4) array: a single row holding the four
# measurements of the first row shown by df.head().
example = np.array([[5.1, 3.5, 1.4, 0.2]])

In [24]:
# Sanity check: ask the model reloaded from the bucket to classify the single example row.
model.predict(example)

array(['Iris-setosa'], dtype=object)