In [116]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu
from sentence_transformers import SentenceTransformer, util
from math import sqrt, pow, exp
import numpy as np

In [117]:

DATA_COLUMN = 1
LABEL_COLUMN = 0
df_ORI = pd.read_csv("data/data_training_90.txt", sep="\t", header=None)
df_ORI.columns = [LABEL_COLUMN, DATA_COLUMN]
df_AUG = pd.read_csv("data/data_training_90_1_d_NO_RS(augonly).txt", sep="\t", header=None)
df_AUG.columns = [LABEL_COLUMN, DATA_COLUMN]

In [118]:
cos_model_name = "distiluse-base-multilingual-cased"

In [119]:
def jaccard_similarity(x,y):
    """ returns the jaccard similarity between two lists """
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality/float(union_cardinality)
def squared_sum(x):
    """ return 3 rounded square rooted value """
    
    return round(sqrt(sum([a*a for a in x])),3)
 
def euclidean_distance(x,y):
    """ return euclidean distance between two lists """
    
    return sqrt(sum(pow(a-b,2) for a, b in zip(x, y)))

def distance_to_similarity(distance):
    return 1/exp(distance)

def cos_similarity(x,y):
    """ return cosine similarity between two lists """    
    numerator = sum(a*b for a,b in zip(x,y))
    denominator = squared_sum(x)*squared_sum(y)
    return round(numerator/float(denominator),3)

def create_embeddings (text, SentenceTransformer_model): 
    embeddings = SentenceTransformer_model.encode(text)
    if len(embeddings) !=0:
        return list(embeddings)
    else:
        return [0]
def calculate_bleu_scores(references, hypotheses):
    """
    Calculates BLEU 1-4 scores based on NLTK functionality

    Args:
        references: List of reference sentences
        hypotheses: List of generated sentences

    Returns:
        bleu_1, bleu_2, bleu_3, bleu_4: BLEU scores

    """
    #return len(references), len(hypotheses)
    bleu_1 = np.round(corpus_bleu(references, hypotheses, weights=(1.0, 0., 0., 0.)), decimals=2)
    bleu_2 = np.round(corpus_bleu(references, hypotheses, weights=(0.50, 0.50, 0., 0.)), decimals=2)
    bleu_3 = np.round(corpus_bleu(references, hypotheses, weights=(0.34, 0.33, 0.33, 0.)), decimals=2)
    bleu_4 = np.round(corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25)), decimals=2)
    return bleu_1, bleu_2, bleu_3, bleu_4 

In [120]:
SentenceTransformer_model = SentenceTransformer(cos_model_name)
df           = pd.DataFrame()
newDF        = pd.DataFrame()
df           = df_ORI
#df.reset_index(inplace=True)
cntr         = 1

for index, row in df.iterrows():
    print('Rows Counter: ' + str(cntr))
    text     = row[DATA_COLUMN]
    all_text = df_AUG.iloc[index][DATA_COLUMN]
    embd1 = create_embeddings(text=text, SentenceTransformer_model=SentenceTransformer_model)
    embd2 = create_embeddings(text=all_text, SentenceTransformer_model=SentenceTransformer_model)
    new_embd1 = ','.join(str(x) for x in embd1)
    new_embd2 = ','.join(str(x) for x in embd2)
    esim = euclidean_distance(embd1, embd2)
    csim = cos_similarity(embd1, embd2)
    jsim = jaccard_similarity(text, all_text)    
    tmp = { 
        'text': [row[DATA_COLUMN]], 
        'label': [row[LABEL_COLUMN]], 
        'all_text': [all_text], 
        'original_embedding': [new_embd1], 
        'new_embedding': [new_embd2], 
        'ecu_similarity': [esim], 
        'cos_similarity': [csim], 
        'jacc_similarity': [jsim] 
    }
    tmpDF = pd.DataFrame(tmp)
#     print(tmpDF)
    newDF = pd.concat([newDF, tmpDF], ignore_index=True)
    
    cntr = cntr + 1
    #if cntr == 10:
#     #    break;
        
#     # GPU Monitor
#     if index%10 == 0:
#         GPUs = GPU.getGPUs()  
#         gpu = GPUs[0]    
#         print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))        
#         torch.cuda.empty_cache()


Rows Counter: 1
Rows Counter: 2
Rows Counter: 3
Rows Counter: 4
Rows Counter: 5
Rows Counter: 6
Rows Counter: 7
Rows Counter: 8
Rows Counter: 9
Rows Counter: 10
Rows Counter: 11
Rows Counter: 12
Rows Counter: 13
Rows Counter: 14
Rows Counter: 15
Rows Counter: 16
Rows Counter: 17
Rows Counter: 18
Rows Counter: 19
Rows Counter: 20
Rows Counter: 21
Rows Counter: 22
Rows Counter: 23
Rows Counter: 24
Rows Counter: 25
Rows Counter: 26
Rows Counter: 27
Rows Counter: 28
Rows Counter: 29
Rows Counter: 30
Rows Counter: 31
Rows Counter: 32
Rows Counter: 33
Rows Counter: 34
Rows Counter: 35
Rows Counter: 36
Rows Counter: 37
Rows Counter: 38
Rows Counter: 39
Rows Counter: 40
Rows Counter: 41
Rows Counter: 42
Rows Counter: 43
Rows Counter: 44
Rows Counter: 45
Rows Counter: 46
Rows Counter: 47
Rows Counter: 48
Rows Counter: 49
Rows Counter: 50
Rows Counter: 51
Rows Counter: 52
Rows Counter: 53
Rows Counter: 54
Rows Counter: 55
Rows Counter: 56
Rows Counter: 57
Rows Counter: 58
Rows Counter: 59
Rows C

In [121]:
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1')
embeddings = model.encode(sentences)
print(embeddings)

[[-0.03885621  0.01854846 -0.04066142 ...  0.01009197 -0.01660532
  -0.00138948]
 [-0.00059496 -0.00924198 -0.05870509 ...  0.0163878   0.0150957
  -0.04368323]]


In [122]:
newDF['text_split'] = [list(x.split()) for x in newDF['text']]
newDF['all_text_split'] = [x.split() for x in newDF['all_text']]
# df['new_text_split'] = [str(x).split() for x in df['new_text']]

In [123]:
newDF[['bleu_sim_1','bleu_sim_2','bleu_sim_3','bleu_sim_4']] = [ calculate_bleu_scores ([[x]],[y]) for x, y in zip(newDF['text_split'], newDF['all_text_split'])]

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [124]:
newDF.head()

Unnamed: 0,text,label,all_text,original_embedding,new_embedding,ecu_similarity,cos_similarity,jacc_similarity,text_split,all_text_split,bleu_sim_1,bleu_sim_2,bleu_sim_3,bleu_sim_4
0,lucu kak abbey cantik kayak orang haru,1,lucu kak abbey cantik adiwarna kayak orang adi...,"-0.008952069,0.040075563,0.009347845,-0.011252...","-0.0136766415,0.041937623,0.0019431965,-0.0123...",0.16442,0.951,0.888889,"[lucu, kak, abbey, cantik, kayak, orang, haru]","[lucu, kak, abbey, cantik, adiwarna, kayak, or...",0.78,0.62,0.48,0.37
1,heran deh ngatain baperan wajar dese baper udi...,0,terpesona deh ngatain baperan wajar dese baper...,"-0.0025755772,0.02613186,0.007338423,-0.015779...","-0.0056755748,0.031803522,6.8956055e-05,-0.015...",0.259165,0.88,0.958333,"[heran, deh, ngatain, baperan, wajar, dese, ba...","[terpesona, deh, ngatain, baperan, wajar, dese...",0.68,0.56,0.48,0.39
2,jujur ni org bagus kaga oplasada tu foto sblm ...,0,jujur ni org elok kaga oplasada tu potret sblm...,"-0.035448533,-0.002518586,0.0033979444,-0.0242...","-0.026780076,0.0033431114,0.036886457,-0.01977...",0.321123,0.762,0.952381,"[jujur, ni, org, bagus, kaga, oplasada, tu, fo...","[jujur, ni, org, elok, kaga, oplasada, tu, pot...",0.73,0.56,0.42,0.0
3,haduuh orang gak akhlak bangeg kerja antem mul...,0,adab haduuh orang adab gak akhlak bangeg kerja...,"0.0004594625,0.0074383444,0.011340768,-0.00337...","-0.00016716315,0.003545129,0.0072791334,-0.003...",0.137265,0.945,1.0,"[haduuh, orang, gak, akhlak, bangeg, kerja, an...","[adab, haduuh, orang, adab, gak, akhlak, bange...",0.82,0.76,0.68,0.63
4,bingung jujur media rebut ngeliput orang kayak...,0,waras jujur media kayak ngeliput orang rebut c...,"-0.030819891,0.03667834,-0.022147859,-0.026122...","-0.028461238,0.024706094,-0.03251241,-0.031904...",0.193797,0.967,1.0,"[bingung, jujur, media, rebut, ngeliput, orang...","[waras, jujur, media, kayak, ngeliput, orang, ...",1.0,0.52,0.0,0.0


In [125]:
newDF.to_csv("data/similarity_data_training_90_1_d_NO_RS.txt", sep="\t", header=0,encoding='utf-16', index=False)

In [126]:
SIM_COFFICIENTS_THRESHOLDS = {'ECU': newDF["ecu_similarity"].mean(), 'COS':newDF["cos_similarity"].mean(), 'JAC':newDF["jacc_similarity"].mean(), 'BLEU':newDF["bleu_sim_1"].mean()}
SIM_COFFICIENTS_THRESHOLDS

{'ECU': 0.2458361729956375,
 'COS': 0.8945,
 'JAC': 0.9385908119551419,
 'BLEU': 0.8154722222222224}