In [5]:
import pandas as pd
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer
from utils import get_sentiment
from loguru import logger

In [12]:
# TEST SENTIMENT MODELS OUT
#
# Sanity check: score one Danish sentence with the plain Hugging Face pipeline,
# score it again with our `get_sentiment` helper, and confirm the two agree.

model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
# other possibilities:
# "cardiffnlp/xlm-roberta-base-sentiment-multilingual"
# "MiMe-MeMo/MeMo-BERT-SA"
# "vesteinn/danish_sentiment"

# test getting sentiment (just normally w pipeline)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

text = "det er forfærdeligt"
sent = model(text)

# Index directly instead of using .get(): a missing key should fail here with
# a KeyError, not later as an AttributeError on `None.lower()`.
top_prediction = sent[0]
xlm_label, xlm_score = top_prediction["label"], top_prediction["score"]
print("default way, label:", xlm_label, "; confidence score:", xlm_score)

# see if our GET_SENTIMENT function works
# it should be the same as above, but converting the binary labels to a continuous value based on the score
sent_converted = get_sentiment(text, model, tokenizer, model_path)
print("continuous-converted values (w/ get_sentiment)", sent_converted)

# Compare with a small tolerance: exact float equality can fail spuriously if
# the score passes through any float32/float64 conversion inside get_sentiment.
score_tolerance = 1e-6
magnitude_matches = abs(abs(sent_converted) - xlm_score) <= score_tolerance
neutral_matches = sent_converted == 0 and "neut" in xlm_label.lower()

if magnitude_matches or neutral_matches:
    print("🔥")
else:
    # Make a failed check visible instead of silently printing nothing.
    print(
        "mismatch: pipeline gave", xlm_label, xlm_score,
        "but get_sentiment gave", sent_converted,
    )

Device set to use mps:0


default way, label: negative ; confidence score: 0.9767931699752808
continuous-converted values (w/ get_sentiment) -0.9767931699752808
🔥


In [None]:
# TRY OUT GOOGLE TRANSLATE
# Smoke test: translate the first 10 Danish sentences of the annotated dataset
# to English and print each original next to its translation.
from googletrans import Translator

# load the annotated dataset
ds = load_dataset("chcaa/fiction4sentiment")
df = pd.DataFrame(ds['train'])

# get danish sentences, keeping only the first 10 rows
df_dk = df[df['org_lang'] == 'dk'].head(10)

# make a translator object
translator = Translator()

# Loop variable is deliberately NOT called `text`: that name holds the test
# sentence in the sentiment cell, and overwriting it here would change what
# that cell's later lines see if they are re-run.
for dk_text in df_dk['text']:
    # NOTE(review): newer googletrans releases appear to expose `translate`
    # as a coroutine — confirm the installed version returns a result directly.
    translated = translator.translate(dk_text, src='da', dest='en')
    print(f"Original: {dk_text}")
    print(f"Transl: {translated.text}")
    print("-----")


Original: Langt ude i havet er vandet så blåt, som bladene på den dejligste kornblomst og så klart, som det reneste glas, men det er meget dybt, dybere end noget ankertov når, mange kirketårne måtte stilles oven på hinanden, for at række fra bunden op over vandet.
Transl: Far out in the sea, the water is as blue as the leaves of the most beautiful grain flower and as clearly as the cleanest glass, but it is very deep, deeper than something anchor rope when many church towers had to be placed on top of each other, to reach from the bottom up over the water.
-----
Original: Dernede bor havfolkene.
Transl: Then the sea people live.
-----
Original: Nu må man slet ikke tro, at der kun er den nøgne hvide sandbund; nej, der vokser de forunderligste træer og planter, som er så smidige i stilk og blade, at de ved den mindste bevægelse af vandet rører sig, ligesom om de var levende.
Transl: Now one must not believe that there is only the naked white sandy bottom;No, there are the most marvelous 

In [38]:
import numpy as np


def scale_converted_scores(score, gain=1.0, decimals=2):
    """
    Squash a signed sentiment score with tanh and round the result.

    tanh compresses values far from zero more than values near zero, so very
    confident scores are pulled together while the sign is preserved.
    With the default ``gain`` of 1, inputs in [-1, 1] map to roughly
    [-0.76, 0.76], because tanh(1) is about 0.76.

    Parameters
    ----------
    score : float or array-like
        Signed score(s) to rescale.
    gain : float, optional
        Multiplier applied to ``score`` before tanh. Larger values make the
        curve steeper. Defaults to 1.0 (no pre-scaling).
    decimals : int, optional
        Number of decimal places kept in the result. Defaults to 2.

    Returns
    -------
    numpy float or array
        ``tanh(gain * score)`` rounded to ``decimals`` places.
    """
    # np.multiply (rather than `gain * score`) keeps plain Python lists
    # working as input, since `float * list` is not defined.
    return np.round(np.tanh(np.multiply(gain, score)), decimals)

In [40]:
# Confidence grid from 0.60 to 1.00 in steps of 0.05, built from integer
# hundredths so every value is the same float as its decimal literal.
test = [hundredths / 100 for hundredths in range(60, 101, 5)]

# Show the rescaled value for each confidence, positive first, then negated.
for t in test:
    for signed_score in (t, -t):
        print(f"tanh({signed_score}) = {scale_converted_scores(signed_score)}")
    print("-----")

tanh(0.6) = 0.54
tanh(-0.6) = -0.54
-----
tanh(0.65) = 0.57
tanh(-0.65) = -0.57
-----
tanh(0.7) = 0.6
tanh(-0.7) = -0.6
-----
tanh(0.75) = 0.64
tanh(-0.75) = -0.64
-----
tanh(0.8) = 0.66
tanh(-0.8) = -0.66
-----
tanh(0.85) = 0.69
tanh(-0.85) = -0.69
-----
tanh(0.9) = 0.72
tanh(-0.9) = -0.72
-----
tanh(0.95) = 0.74
tanh(-0.95) = -0.74
-----
tanh(1.0) = 0.76
tanh(-1.0) = -0.76
-----
