In [1]:
import os 
import mlflow as mf 
import joblib
import torch
import transformers
import numpy as np
from tqdm import tqdm

import ftzard.utils.mlflow as mf_utils


from hydra import initialize, compose
from warnings import filterwarnings

import dagstermill as dgm
filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_path = '../..'
config_path = f'../../config/'
config_name = 'config'

In [3]:
with initialize(version_base=None, config_path=config_path):
    cfg = compose(config_name=config_name)
    tracking_uri, experiment_name = cfg.MLFLOW_TRACKING_URI, cfg.MLFLOW_EXPERIMENT_NAME
    mlflow_model_name = cfg.MLFLOW_MODEL_NAME 
    

In [5]:
os.environ['MLFLOW_TRACKING_URI'] = tracking_uri
run_name = 'sampling'
data_path = f"{base_path}/data/predictions.joblib"
print('Mlflow Experiment Name: ', experiment_name)
print('Mlflow Run Name: ', run_name)
print('Path to Data: ', data_path)

Mlflow Experiment Name:  senetiment_analysis
Mlflow Run Name:  sampling
Path to Data:  ../../data/predictions.joblib


In [6]:
predictions_data = joblib.load(data_path)

In [7]:
print(predictions_data.keys())

dict_keys(['data', 'predicted_labels', 'logits'])


In [13]:
logits = predictions_data["logits"]
logits = np.array(logits)
print(logits.shape, logits[0])

(480, 2) [-2.002 -1.101]


In [21]:
def softmax(array):
    exponents, result  = np.zeros(array.shape), np.zeros(array.shape)
    for index in range(len(array)):
        exponents[index] = np.exp(array[index])
    for index in range(len(array)):
        result[index] = exponents[index]/np.sum(exponents)

    return result

def least_confidence_sampling(array):
    return (1 - np.max(array))/(len(array)/(len(array)-1))
    
        

softmax_logits = np.array([softmax(elem) for elem in logits])
print('Softmax Logits: \n', softmax_logits[:3])


uncertainity_scores = np.array([least_confidence_sampling(item) for item in softmax_logits])
print('Uncertainity Scores: \n', uncertainity_scores[:3])

Softmax Logits: 
 [[0.28880772 0.71119228]
 [0.90685284 0.09314716]
 [0.98351495 0.01648505]]
Uncertainity Scores: 
 [0.14440386 0.04657358 0.00824253]


In [29]:
range_of_uncertainity = [np.min(uncertainity_scores), np.max(uncertainity_scores)]
print('Range of Uncertainity Scores: ', range_of_uncertainity)
print('Maximum Uncertainity Score', 0.5)

Range of Uncertainity Scores:  [0.00014256779551441445, 0.24848677922905382]
Maximum Uncertainity Score 0.5


In [33]:
top_uncertain_indices = np.where(uncertainity_scores>=0.1)

In [40]:
top_uncertain_data = predictions_data["data"].select(top_uncertain_indices[0])
print(top_uncertain_data)

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 86
})


In [41]:
dgm.yield_result(top_uncertain_data, output_name="retraining_data")

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 86
})