In [None]:
import polars as pl

In [None]:
# Load the journal entries; the file uses '|' as its field separator.
journal_data = pl.read_csv(
    './journal_entries.csv',
    separator='|',
)

In [None]:
# Preview the first rows to check that the file was parsed into the expected columns.
journal_data.head()

doc_id,date,year,month,day,hour,sentence,text
str,str,i64,i64,i64,i64,i64,str
"""2024-12-02_16-13-13-235.txt""","""2024-12-02""",2024,12,2,16,1,"""hdidhshjdjejdjbdb"""
"""2024-12-02_19-20-00-554.txt""","""2024-12-02""",2024,12,2,19,1,"""enteyxhdjbdb"""
"""2024-12-02_21-18-56-935.txt""","""2024-12-02""",2024,12,2,21,1,"""This is a part of the process"""
"""2024-12-02_16-05-40-145.txt""","""2024-12-02""",2024,12,2,16,1,"""This is an entry"""
"""2024-12-04_11-23-48-808.txt""","""2024-12-04""",2024,12,4,11,1,"""hello my name is mickey and i …"


In [None]:
# Pull the 'text' column out as a plain Python list, one item per row of journal_data.
text_column = journal_data['text']
sentences = text_column.to_list()

In [None]:
# Display the extracted texts.
sentences

['hdidhshjdjejdjbdb',
 'enteyxhdjbdb',
 'This is a part of the process',
 'This is an entry',
 'hello my name is mickey and i am a junjor at byu idaho.',
 'i like baseball and am studying data science']

In [None]:
from tokenizers import Tokenizer
import onnxruntime as ort

from os import cpu_count
import numpy as np  # only used for the postprocessing sigmoid

# To try a single hard-coded input instead (a batch of 1), set for example:
# sentences = ["hello world"]

# Ordered list of labels from the go_emotions dataset.
labels = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]

tokenizer = Tokenizer.from_pretrained("SamLowe/roberta-base-go_emotions")

# Optional: pad only to the longest item in the batch rather than to a fixed
# length. Without this the model runs slower, especially for short inputs.
params = dict(tokenizer.padding)  # copy the current padding settings
params["length"] = None
tokenizer.enable_padding(**params)

# Encode all sentences together as one batch.
tokens_obj = tokenizer.encode_batch(sentences)

def load_onnx_model(model_filepath):
    """Create a CPU-only ONNX Runtime inference session for a model file.

    Args:
        model_filepath: Location of the ONNX model; forwarded to
            ``ort.InferenceSession`` as ``path_or_bytes``.

    Returns:
        An ``ort.InferenceSession`` that uses only ``CPUExecutionProvider`` and
        whose inter-op and intra-op thread counts are both set to the detected
        CPU count (or 1 when the count cannot be determined).
    """
    # os.cpu_count() is documented to return None when the number of CPUs is
    # undetermined; the thread-count options need an integer, so fall back to 1.
    _num_threads = cpu_count() or 1

    _options = ort.SessionOptions()
    _options.inter_op_num_threads = _num_threads
    _options.intra_op_num_threads = _num_threads
    _providers = ["CPUExecutionProvider"]  # could use ort.get_available_providers()
    return ort.InferenceSession(path_or_bytes=model_filepath, sess_options=_options, providers=_providers)

# Create the inference session from the quantized model file.
model = load_onnx_model("model_quantized.onnx")

# Request only the first output the session declares, e.g. ["logits"].
first_output = model.get_outputs()[0]
output_names = [first_output.name]

# One list of token ids and one attention mask per encoded sentence.
batch_input_ids = [encoding.ids for encoding in tokens_obj]
batch_attention_masks = [encoding.attention_mask for encoding in tokens_obj]
input_feed_dict = {
    "input_ids": batch_input_ids,
    "attention_mask": batch_attention_masks,
}

# Only one output name was requested, so keep the first (and only) result.
# It is a numpy array with one row per input item and one column per label.
session_results = model.run(output_names=output_names, input_feed=input_feed_dict)
logits = session_results[0]

def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``.

    ``x`` is passed straight to ``np.exp``, so the computation is element-wise
    for array input.
    """
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

# Post-processing: apply the sigmoid to every logit so that each label gets its
# own score between 0 and 1.
# Transformers' pipeline does this automatically, but with ORT it must be done manually.
model_outputs = sigmoid(logits)

In [None]:
# Show the top-scoring label for each input item.
for probas in model_outputs:
    top_result_index = np.argmax(probas)
    print(labels[top_result_index], "with score:", probas[top_result_index])

# Build the score table in a single step instead of stacking one-row frames
# inside the loop: each row of model_outputs becomes a DataFrame row and each
# entry of labels names one column. orient="row" makes that interpretation
# explicit rather than relying on inference from the array shape.
df = pl.DataFrame(model_outputs, schema=labels, orient="row")

df

neutral with score: 0.96565753
neutral with score: 0.96914005
neutral with score: 0.9256493
neutral with score: 0.96345854
neutral with score: 0.95053893
love with score: 0.6601694


admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
0.003308,0.00291,0.002896,0.00657,0.00994,0.000931,0.003478,0.001826,0.001181,0.003667,0.00462,0.002436,0.000888,0.003222,0.001872,0.001216,0.000543,0.00296,0.001578,0.000491,0.001664,0.000415,0.00517,0.000402,0.000411,0.002639,0.001372,0.965658
0.002994,0.003008,0.003131,0.006328,0.009553,0.001157,0.003454,0.002184,0.001506,0.003482,0.004399,0.002541,0.000929,0.003374,0.002067,0.001211,0.000606,0.002916,0.001662,0.000509,0.001842,0.00042,0.005113,0.000404,0.000472,0.002871,0.001424,0.96914
0.002502,0.000854,0.00082,0.004011,0.065212,0.000778,0.005131,0.001327,0.000709,0.002242,0.005724,0.000877,0.0003,0.000876,0.000533,0.000786,0.000193,0.000947,0.000993,0.000207,0.003607,0.000248,0.021289,0.000359,0.000252,0.000902,0.000579,0.925649
0.001976,0.002021,0.003018,0.008913,0.011928,0.000828,0.002931,0.001989,0.000873,0.002908,0.003517,0.001853,0.00068,0.002035,0.001442,0.00084,0.000362,0.001517,0.000676,0.000363,0.001671,0.000272,0.00654,0.00029,0.000278,0.001885,0.001193,0.963459
0.002784,0.00279,0.001587,0.004307,0.02293,0.00112,0.002148,0.001433,0.001131,0.002151,0.0019,0.002066,0.001111,0.005526,0.002157,0.002238,0.00059,0.004232,0.001569,0.000555,0.002145,0.000731,0.011486,0.00074,0.000627,0.002163,0.001958,0.950539
0.098563,0.002603,0.001339,0.002768,0.236382,0.001338,0.00351,0.003609,0.001745,0.001577,0.002995,0.001066,0.000318,0.012894,0.000473,0.001049,0.000186,0.018918,0.660169,0.000275,0.002965,0.000777,0.009687,0.000687,0.000365,0.000815,0.001429,0.076964


In [None]:
# Place the score columns next to the journal columns. The horizontal concat
# pairs rows by position, so both frames must be in the same row order.
frames_side_by_side = [journal_data, df]
df_combined = pl.concat(frames_side_by_side, how="horizontal")

df_combined

doc_id,date,year,month,day,hour,sentence,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
str,str,i64,i64,i64,i64,i64,str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""2024-12-02_16-13-13-235.txt""","""2024-12-02""",2024,12,2,16,1,"""hdidhshjdjejdjbdb""",0.003308,0.00291,0.002896,0.00657,0.00994,0.000931,0.003478,0.001826,0.001181,0.003667,0.00462,0.002436,0.000888,0.003222,0.001872,0.001216,0.000543,0.00296,0.001578,0.000491,0.001664,0.000415,0.00517,0.000402,0.000411,0.002639,0.001372,0.965658
"""2024-12-02_19-20-00-554.txt""","""2024-12-02""",2024,12,2,19,1,"""enteyxhdjbdb""",0.002994,0.003008,0.003131,0.006328,0.009553,0.001157,0.003454,0.002184,0.001506,0.003482,0.004399,0.002541,0.000929,0.003374,0.002067,0.001211,0.000606,0.002916,0.001662,0.000509,0.001842,0.00042,0.005113,0.000404,0.000472,0.002871,0.001424,0.96914
"""2024-12-02_21-18-56-935.txt""","""2024-12-02""",2024,12,2,21,1,"""This is a part of the process""",0.002502,0.000854,0.00082,0.004011,0.065212,0.000778,0.005131,0.001327,0.000709,0.002242,0.005724,0.000877,0.0003,0.000876,0.000533,0.000786,0.000193,0.000947,0.000993,0.000207,0.003607,0.000248,0.021289,0.000359,0.000252,0.000902,0.000579,0.925649
"""2024-12-02_16-05-40-145.txt""","""2024-12-02""",2024,12,2,16,1,"""This is an entry""",0.001976,0.002021,0.003018,0.008913,0.011928,0.000828,0.002931,0.001989,0.000873,0.002908,0.003517,0.001853,0.00068,0.002035,0.001442,0.00084,0.000362,0.001517,0.000676,0.000363,0.001671,0.000272,0.00654,0.00029,0.000278,0.001885,0.001193,0.963459
"""2024-12-04_11-23-48-808.txt""","""2024-12-04""",2024,12,4,11,1,"""hello my name is mickey and i …",0.002784,0.00279,0.001587,0.004307,0.02293,0.00112,0.002148,0.001433,0.001131,0.002151,0.0019,0.002066,0.001111,0.005526,0.002157,0.002238,0.00059,0.004232,0.001569,0.000555,0.002145,0.000731,0.011486,0.00074,0.000627,0.002163,0.001958,0.950539
"""2024-12-04_11-23-48-808.txt""","""2024-12-04""",2024,12,4,11,2,"""i like baseball and am studyin…",0.098563,0.002603,0.001339,0.002768,0.236382,0.001338,0.00351,0.003609,0.001745,0.001577,0.002995,0.001066,0.000318,0.012894,0.000473,0.001049,0.000186,0.018918,0.660169,0.000275,0.002965,0.000777,0.009687,0.000687,0.000365,0.000815,0.001429,0.076964


In [None]:
# Save the combined table as a '|'-separated file without a byte-order mark.
df_combined.write_csv(
    'journal_emotions.csv',
    separator='|',
    include_bom=False,
)