### Imports

In [1]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu
import warnings
warnings.filterwarnings('ignore')


### Utils

In [2]:
def calculate_bleu_scores(references, hypotheses):
    """
    Calculates cumulative corpus-level BLEU 1-4 scores based on NLTK functionality

    Both arguments are forwarded unchanged to NLTK's ``corpus_bleu``.

    Args:
        references: One entry per hypothesis, each entry being a list of
            tokenised reference sentences (e.g. ``[[ref_tokens]]`` for a
            single sentence with a single reference)
        hypotheses: List of tokenised generated sentences, aligned with
            ``references``

    Returns:
        bleu_1, bleu_2, bleu_3, bleu_4: BLEU scores rounded to two decimals

    """
    # Cumulative BLEU-n spreads the weight uniformly over the first n n-gram
    # orders. BLEU-3 therefore uses exactly 1/3 per order instead of the
    # (0.34, 0.33, 0.33) approximation, which slightly over-weighted unigrams.
    third = 1.0 / 3.0
    weight_sets = (
        (1.0, 0., 0., 0.),
        (0.50, 0.50, 0., 0.),
        (third, third, third, 0.),
        (0.25, 0.25, 0.25, 0.25),
    )
    bleu_1, bleu_2, bleu_3, bleu_4 = [
        np.round(corpus_bleu(references, hypotheses, weights=weights), decimals=2)
        for weights in weight_sets
    ]
    return bleu_1, bleu_2, bleu_3, bleu_4

# Functions
def check_label(label, labels=None):
    """
    Case-insensitively test whether ``label`` is one of the labels to augment.

    Args:
        label: Label value to look up. It is converted with ``str()`` before
            comparison, so non-string labels (for example integer class ids)
            no longer raise ``AttributeError`` on ``.upper()``.
        labels: Optional iterable of accepted labels. When omitted, the
            notebook-level ``LABEL_TO_AUGMENT`` list is used.

    Returns:
        True if ``label`` equals one of the accepted labels ignoring case,
        otherwise False.
    """
    if labels is None:
        # Resolved at call time so the constant may be defined in a later cell.
        labels = LABEL_TO_AUGMENT
    wanted = str(label).upper()
    return any(str(candidate).upper() == wanted for candidate in labels)

def check_similarity_cofficient(given_value, label, current_sim_coff, thresholds=None):
    """
    Report whether a similarity value is at or below the threshold of a metric.

    Args:
        given_value: Similarity value of one row; anything ``float()`` accepts.
        label: Label of the row. Currently unused; kept so existing calls
            keep working.
        current_sim_coff: Metric name; it is upper-cased and used as the key
            into the thresholds mapping (e.g. ``'ecu'`` -> ``'ECU'``).
        thresholds: Optional mapping of upper-case metric name to threshold.
            When omitted, the notebook-level ``SIM_COFFICIENTS_THRESHOLDS``
            is used.

    Returns:
        True when ``threshold >= given_value``, otherwise False. False is
        also returned (after printing a diagnostic) when the value cannot be
        converted to a number or the metric has no threshold; previously this
        path returned None implicitly.
    """
    if thresholds is None:
        # Resolved at call time so the constant may be defined in a later cell.
        thresholds = SIM_COFFICIENTS_THRESHOLDS
    try:
        value = float(given_value)
        threshold = float(thresholds[current_sim_coff.upper()])
    except (TypeError, ValueError, KeyError, AttributeError) as err:
        # Stay best-effort (a bad row must not abort the whole run), but only
        # for the expected conversion/lookup failures, and say what went wrong.
        print(f'exception: cannot check {given_value!r} against {current_sim_coff!r} threshold ({err!r})')
        return False
    return threshold >= value

### Dataset Loading

In [3]:
# Tag that is embedded in the name of every exported file.
datasetname = 'Cyberbullying_90_1_a_NO_RS'

# datasetpath = "datasets/xls/ArSarcasm-Unbalanced-Augmented-aragpt2-base.csv"
# Tab-separated training file.
df = pd.read_csv(
    "data/data_baru/data_training_90_1_a_NO_RS.txt",
    sep="\t",
    encoding='utf-8',
)

# df.columns = ['text', 'label', 'new_text', 'all_text', 'original_embbedding', 'new_embbedding', 'ecu_similarity', 'cos_similarity', 'jacc_similarity','text_split', 'all_text_split', 'new_text_split', 'bleu_sim_1','bleu_sim_2', 'bleu_sim_3', 'bleu_sim_4']
# Names are assigned by position, so the file has to hold exactly these
# nine columns in this order.
df.columns = [
    'text',
    'label',
    'new_text',
    'original_embbedding',
    'new_embbedding',
    'ecu_similarity',
    'cos_similarity',
    'jacc_similarity',
    'bleu_sim_1',
]

df.head()

Unnamed: 0,text,label,new_text,original_embbedding,new_embbedding,ecu_similarity,cos_similarity,jacc_similarity,bleu_sim_1
0,heran deh ngatain baperan wajar dese baper udi...,0,heran deh ngatain baperan wajar dese baper udi...,"0.045313366,0.05557102,-0.031257693,-0.0365837...","0.045313366,0.05557102,-0.031257693,-0.0365837...",0.0,1.0,1.0,1.0
1,jujur ni org bagus kaga oplasada tu foto sblm ...,0,jujur ni org bagus kaga oplasada tu foto sblm ...,"0.0308086,0.045962002,-0.04430784,-0.024547419...","0.0308086,0.045962002,-0.04430784,-0.024547419...",0.0,0.999,0.954545,0.94
2,haduuh orang gak akhlak bangeg kerja antem mul...,0,haduuh orang gak budi bahasa bangeg kerja ante...,"0.045313366,0.05557102,-0.031257693,-0.0365837...","0.045313366,0.05557102,-0.031257693,-0.0365837...",0.0,1.0,0.944444,0.8
3,bingung jujur media rebut ngeliput orang kayak...,0,bingung jujur media jaring rebut ngeliput oran...,"0.036079526,0.019426107,-0.017474899,-0.007616...","0.036079526,0.019426107,-0.017474899,-0.007616...",0.0,1.0,1.0,0.92
4,ketemu artika sari disesemall jakartaampunnn a...,1,ketemu artika sari pis jakartaampunnn ayuneema...,"0.024090655,0.02943413,-0.026628973,-0.0180590...","0.024090655,0.02943413,-0.026628973,-0.0180590...",0.0,1.0,1.0,1.0


In [4]:
# df['label'].value_counts()

### Calculating Similarity Measures

In [5]:
# df['text_split'] = [list(x.split()) for x in df['text']]
# df['all_text_split'] = [x.split() for x in df['all_text']]
# df['new_text_split'] = [str(x).split() for x in df['new_text']]

In [6]:
#  df[['bleu_sim_1','bleu_sim_2','bleu_sim_3','bleu_sim_4']] = [ calculate_bleu_scores ([[x]],[y]) for x, y in zip(df['text_split'], df['all_text_split'])]


In [7]:
#  df[['bleu_sim_1','bleu_sim_1','bleu_sim_1','bleu_sim_1']]

In [8]:
# Mean of the `ecu_similarity` column; the same expression supplies the 'ECU'
# entry of SIM_COFFICIENTS_THRESHOLDS in the parameters cell.
df["ecu_similarity"].mean()

0.04750447044043844

In [9]:
# Mean of the `cos_similarity` column; the same expression supplies the 'COS'
# entry of SIM_COFFICIENTS_THRESHOLDS in the parameters cell.
df["cos_similarity"].mean()

0.9859220055710307

In [10]:
# Mean of the `jacc_similarity` column; the same expression supplies the 'JAC'
# entry of SIM_COFFICIENTS_THRESHOLDS in the parameters cell.
df["jacc_similarity"].mean()

0.9696083483833704

In [11]:
# Mean of the `bleu_sim_1` column; the same expression supplies the 'BLEU'
# entry of SIM_COFFICIENTS_THRESHOLDS in the parameters cell.
df["bleu_sim_1"].mean()

0.9328969359331477

In [12]:
# Parameters
all_datasets = []

# Threshold per metric: the mean of the matching similarity column of `df`.
# Keys are upper-case because check_similarity_cofficient upper-cases the
# metric name before the lookup.
SIM_COFFICIENTS_THRESHOLDS = {
    metric: df[column].mean()
    for metric, column in (
        ('ECU', 'ecu_similarity'),
        ('COS', 'cos_similarity'),
        ('JAC', 'jacc_similarity'),
        ('BLEU', 'bleu_sim_1'),
    )
}

LABEL_TO_AUGMENT = ['positive', 'negative']
DATA_COLUMN, LABEL_COLUMN = "text", "label"

In [13]:
# Display the per-metric thresholds for inspection.
SIM_COFFICIENTS_THRESHOLDS

{'ECU': 0.04750447044043844,
 'COS': 0.9859220055710307,
 'JAC': 0.9696083483833704,
 'BLEU': 0.9328969359331477}

### Augmentation (All-Text)

In [14]:
# EcuDF = pd.DataFrame()
# CosDF = pd.DataFrame()
# JacDF = pd.DataFrame()
# BleDF = pd.DataFrame()
# cntr = 1

# print('All text augmentation is started... ')
# for index, row in df.iterrows():
#     tmpDF = pd.DataFrame({'text': [row[DATA_COLUMN]], 'label': [row[LABEL_COLUMN]]})
#     Ecu_value = row['ecu_similarity']
#     Cos_value = row['cos_similarity']
#     Jac_value = row['jacc_similarity']
#     Bleu_value = row['bleu_sim_1']

#     EcuDF = pd.concat([EcuDF, tmpDF], ignore_index=True)
#     CosDF = pd.concat([CosDF, tmpDF], ignore_index=True)
#     JacDF = pd.concat([JacDF, tmpDF], ignore_index=True)
#     BleDF = pd.concat([BleDF, tmpDF], ignore_index=True)

#     tmpDF = pd.DataFrame({'text': [row['all_text']], 'label': [row[LABEL_COLUMN]]})
#     # Check similarity
#     if check_similarity_cofficient(Ecu_value, row[LABEL_COLUMN], 'ecu'):
#         EcuDF = pd.concat([EcuDF, tmpDF], ignore_index=True)

#     if check_similarity_cofficient(Cos_value, row[LABEL_COLUMN], 'cos'):
#         CosDF = pd.concat([CosDF, tmpDF], ignore_index=True)

#     if check_similarity_cofficient(Jac_value, row[LABEL_COLUMN], 'jac'):
#         JacDF = pd.concat([JacDF, tmpDF], ignore_index=True)

#     if check_similarity_cofficient(Bleu_value, row[LABEL_COLUMN], 'bleu'):
#         BleDF = pd.concat([BleDF, tmpDF], ignore_index=True)

# print('All text augmentation is finished ... ')


In [None]:
# One list of {'text', 'label'} records per exported dataset; every list is
# converted to a DataFrame once, after the loop.
AllDF, EcuDF, EcuDFum, CosDF, JacDF, BleDF = [], [], [], [], [], []

# Metrics whose augmented row is kept only when the threshold check passes:
# (similarity column, metric name, destination list).
keep_if_passing = (
    ('cos_similarity', 'cos', CosDF),
    ('jacc_similarity', 'jac', JacDF),
    ('bleu_sim_1', 'bleu', BleDF),
)
previous_text = None

for idx, row in df.iterrows():
    source_text, label = row['text'], row['label']

    # The original sentence goes into every dataset, but only once per run of
    # consecutive rows that share the same `text`.
    if idx == 0 or source_text != previous_text:
        for records in (EcuDF, EcuDFum, CosDF, JacDF, BleDF, AllDF):
            records.append({'text': source_text, 'label': label})

    # AllDF takes every augmented sentence, whatever its similarity.
    AllDF.append({'text': row['new_text'], 'label': label})

    # ECU is the only metric with a second dataset for rows failing the check.
    if check_similarity_cofficient(row['ecu_similarity'], label, 'ecu'):
        ecu_target = EcuDF
    else:
        ecu_target = EcuDFum
    ecu_target.append({'text': row['new_text'], 'label': label})

    for column, metric, records in keep_if_passing:
        if check_similarity_cofficient(row[column], label, metric):
            records.append({'text': row['new_text'], 'label': label})

    previous_text = source_text

# Convert lists to DataFrames
AllDF, EcuDF, EcuDFum, CosDF, JacDF, BleDF = [
    pd.DataFrame(records)
    for records in (AllDF, EcuDF, EcuDFum, CosDF, JacDF, BleDF)
]


In [16]:
# Export dataset: one workbook per augmented frame, all in the same folder and
# named <datasetname>-Augmented-<tag>-ALL-Text-Final.xlsx.
export_dir = "data/data_baru/Augmented-Dataset/RedoAll/"
for frame, tag in (
    (EcuDF, "ECU"),
    (EcuDFum, "ECULOW"),
    (CosDF, "COS"),
    (JacDF, "JAC"),
    (BleDF, "BLE"),
    (AllDF, "All"),
):
    frame.to_excel(export_dir + datasetname + "-Augmented-" + tag + "-ALL-Text-Final.xlsx", index=False)

### Augmentation (New-Text)

In [17]:
# EcuDF = pd.DataFrame()
# CosDF = pd.DataFrame()
# JacDF = pd.DataFrame()
# BleDF = pd.DataFrame()
# cntr = 1

# print('new text augmentation is started... ')
# for index, row in df.iterrows():         
#     tmpDF = { 'text': row[DATA_COLUMN], 'label': row[LABEL_COLUMN]}
#     Ecu_value = row['ecu_similarity']
#     Cos_value = row['cos_similarity']
#     Jac_value = row['jacc_similarity']
#     Bleu_value = row['bleu_sim_1']
    
#     EcuDF = pd.concat([EcuDF, pd.DataFrame(tmpDF, index=[0])], ignore_index=True)
#     CosDF = pd.concat([CosDF, pd.DataFrame(tmpDF, index=[0])], ignore_index=True)
#     JacDF = pd.concat([JacDF, pd.DataFrame(tmpDF, index=[0])], ignore_index=True)
#     BleDF = pd.concat([BleDF, pd.DataFrame(tmpDF, index=[0])], ignore_index=True)
    
#     tmpDF = { 'text': row['all_text'], 'label': row[LABEL_COLUMN]}
#     # Check similarity 
#     if check_similarity_cofficient (Ecu_value, row[LABEL_COLUMN], 'ecu'):
#         EcuDF = pd.concat([EcuDF, pd.DataFrame(tmpDF, index=[0])], ignore_index=True)
    
#     if check_similarity_cofficient (Cos_value, row[LABEL_COLUMN], 'cos'):
#         CosDF = pd.concat([CosDF, pd.DataFrame(tmpDF, index=[0])], ignore_index=True)
    
#     if check_similarity_cofficient (Jac_value, row[LABEL_COLUMN], 'jac'):
#         JacDF = pd.concat([JacDF, pd.DataFrame(tmpDF, index=[0])], ignore_index=True)
    
#     if check_similarity_cofficient (Bleu_value, row[LABEL_COLUMN], 'bleu'):
#         BleDF = pd.concat([BleDF, pd.DataFrame(tmpDF, index=[0])], ignore_index=True)
        
# print('new text augmentation is finished ... ')


In [18]:
# # Export dataset
# EcuDF.to_excel( "data/data_baru/Augmented-Dataset/New/"+datasetname+"-Augmented-ECU-New-Text-Final.xlsx", index=False)
# CosDF.to_excel( "data/data_baru/Augmented-Dataset/New/"+datasetname+"-Augmented-COS-New-Text-Final.xlsx", index=False)
# JacDF.to_excel( "data/data_baru/Augmented-Dataset/New/"+datasetname+"-Augmented-JAC-New-Text-Final.xlsx", index=False)
# BleDF.to_excel( "data/data_baru/Augmented-Dataset/New/"+datasetname+"-Augmented-BLE-New-Text-Final.xlsx", index=False)