In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu
from math import sqrt, pow, exp
import numpy as np

In [2]:
cos_model_name = "distiluse-base-multilingual-cased"
def jaccard_similarity(x,y):
    """ returns the jaccard similarity between two lists """
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality/float(union_cardinality)
def squared_sum(x):
    """ return 3 rounded square rooted value """
    
    return round(sqrt(sum([a*a for a in x])),3)
 
def euclidean_distance(x,y):
    """ return euclidean distance between two lists """
    
    return sqrt(sum(pow(a-b,2) for a, b in zip(x, y)))

def distance_to_similarity(distance):
    return 1/exp(distance)

def cos_similarity(x,y):
    """ return cosine similarity between two lists """    
    numerator = sum(a*b for a,b in zip(x,y))
    denominator = squared_sum(x)*squared_sum(y)
    return round(numerator/float(denominator),3)

def create_embeddings (text, SentenceTransformer_model): 
    embeddings = SentenceTransformer_model.encode(list(text))
    if len(embeddings) !=0:
        return list(embeddings[0])
    else:
        return [0]
# def calculate_bleu_scores(references, hypotheses):
#     """
#     Calculates BLEU 1-4 scores based on NLTK functionality

#     Args:
#         references: List of reference sentences
#         hypotheses: List of generated sentences

#     Returns:
#         bleu_1, bleu_2, bleu_3, bleu_4: BLEU scores

#     """
#     #return len(references), len(hypotheses)
#     bleu_1 = np.round(corpus_bleu(references, hypotheses, weights=(1.0, 0., 0., 0.)), decimals=2)
#     return bleu_1

def calculate_bleu_scores(reference, hypothesis):
    """
    Calculate BLEU-1 score between two sentences (string inputs).
    """
    ref_tokens = reference.split()
    hyp_tokens = hypothesis.split()
    return np.round(
        corpus_bleu([[ref_tokens]], [hyp_tokens], weights=(1.0, 0., 0., 0.)),
        2
    )

In [3]:
# === Load data ===
df = pd.read_csv("data/data_replikasi/data_training_90_16_f_NO_RS.txt", sep="\t", encoding='utf-8', header=None)
df.columns = ["label", "text"]

num_aug = 15  # change as needed
SentenceTransformer_model = SentenceTransformer("distiluse-base-multilingual-cased")

# === Loop ===
newDF = pd.DataFrame()

# Start from index = num_aug so that we can look back for augments
for i in range(num_aug, len(df), num_aug + 1):
    # Original line is here
    original_label = df.iloc[i]["label"]
    original_text = df.iloc[i]["text"]
    embd1 = create_embeddings(original_text, SentenceTransformer_model)

    # Augmented lines are the previous num_aug rows
    for j in range(1, num_aug + 1):
        aug_index = i - j
        if aug_index < 0:
            break

        aug_text = df.iloc[aug_index]["text"]
        embd2 = create_embeddings(aug_text, SentenceTransformer_model)

        # Similarities
        esim = euclidean_distance(embd1, embd2)
        csim = cos_similarity(embd1, embd2)
        jsim = jaccard_similarity(original_text, aug_text)
        bleu = calculate_bleu_scores(original_text, aug_text)

        tmpDF = pd.DataFrame({
            "text": [original_text],
            "label": [original_label],
            "all_text": [aug_text],
            "original_embedding": [",".join(map(str, embd1))],
            "new_embedding": [",".join(map(str, embd2))],
            "ecu_similarity": [esim],
            "cos_similarity": [csim],
            "jacc_similarity": [jsim],
            "bleu_similarity": [bleu]
        })

        newDF = pd.concat([newDF, tmpDF], ignore_index=True)

# === Save ===
newDF.to_csv("data/data_baru/data_training_90_16_f_NO_RS.txt", sep="\t", header=False, index=False)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
