### Imports

In [None]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu
import warnings
warnings.filterwarnings('ignore')


### Utils

In [None]:
def calculate_bleu_scores(references, hypotheses):
    """
    Compute corpus-level BLEU-1 through BLEU-4 with NLTK.

    Both arguments are forwarded unchanged to ``corpus_bleu``; per the NLTK
    documentation that function expects tokenized input (a list of reference
    lists per hypothesis, and a list of tokenized hypotheses).

    Args:
        references: References, in the layout ``corpus_bleu`` expects
        hypotheses: Generated sentences, in the layout ``corpus_bleu`` expects

    Returns:
        bleu_1, bleu_2, bleu_3, bleu_4: BLEU scores rounded to two decimals

    """
    # One n-gram weight vector per BLEU order (1-gram only, up to 2-gram, ...).
    ngram_weights = (
        (1.0, 0., 0., 0.),
        (0.50, 0.50, 0., 0.),
        (0.34, 0.33, 0.33, 0.),
        (0.25, 0.25, 0.25, 0.25),
    )
    scores = [
        np.round(corpus_bleu(references, hypotheses, weights=weights), decimals=2)
        for weights in ngram_weights
    ]
    bleu_1, bleu_2, bleu_3, bleu_4 = scores
    return bleu_1, bleu_2, bleu_3, bleu_4

# Functions
def check_label(label):
    """Return True when `label` matches an entry of LABEL_TO_AUGMENT, ignoring case."""
    # The generator is evaluated lazily, so `label.upper()` is only touched
    # when LABEL_TO_AUGMENT has at least one entry.
    return any(
        candidate.upper() == label.upper()
        for candidate in LABEL_TO_AUGMENT
    )

def check_similarity_cofficient(given_value, label, current_sim_coff):
    """
    Decide whether a similarity value passes the threshold of one metric.

    The value passes when it is less than or equal to the threshold stored in
    SIM_COFFICIENTS_THRESHOLDS under the upper-cased metric name.

    Args:
        given_value: Similarity value of the sample; anything ``float()`` accepts
        label: Label of the sample. Currently unused (label filtering is
            disabled) but kept so existing callers keep working
        current_sim_coff: Metric name used to look up the threshold,
            case-insensitive (e.g. 'ecu', 'cos', 'jac', 'bleu')

    Returns:
        True if threshold >= value. False otherwise, including when the value
        cannot be converted to a float or the metric name has no threshold.

    """
    try:
        # Convert the value first so a bad value is reported before any lookup.
        value = float(given_value)
        threshold = float(SIM_COFFICIENTS_THRESHOLDS[current_sim_coff.upper()])
    except (TypeError, ValueError, KeyError):
        # Best effort: an unusable value or unknown metric rejects the sample
        # instead of aborting the run. Only the expected error types are
        # caught, so KeyboardInterrupt and genuine bugs are no longer hidden.
        print('exception')
        return False
    # A NaN value compares as False here, so it is rejected as well.
    return threshold >= value

### Dataset Loading

In [None]:
# Name used as the prefix of every exported workbook.
datasetname = 'Cyberbullying_90_16_f_NO_RS'

# The training file is tab-separated UTF-8 text.
df = pd.read_csv(
    "data/data_baru/data_training_90_16_f_NO_RS.txt",
    sep="\t",
    encoding='utf-8',
)

# Rename the nine columns positionally. An earlier schema (kept commented out
# in the original cell) also carried new_text, the *_split columns and
# bleu_sim_2..4; those are not part of this file.
df.columns = [
    'text',
    'label',
    'all_text',
    'original_embbedding',
    'new_embbedding',
    'ecu_similarity',
    'cos_similarity',
    'jacc_similarity',
    'bleu_sim_1',
]
df.head()

In [None]:
# df['label'].value_counts()

### Calculating Similarity Measures

In [None]:
# df['text_split'] = [list(x.split()) for x in df['text']]
# df['all_text_split'] = [x.split() for x in df['all_text']]
# df['new_text_split'] = [str(x).split() for x in df['new_text']]

In [None]:
#  df[['bleu_sim_1','bleu_sim_2','bleu_sim_3','bleu_sim_4']] = [ calculate_bleu_scores ([[x]],[y]) for x, y in zip(df['text_split'], df['all_text_split'])]


In [None]:
#  df[['bleu_sim_1','bleu_sim_1','bleu_sim_1','bleu_sim_1']]

In [None]:
# Show the mean of the ecu_similarity column; the same expression supplies the 'ECU' threshold.
df["ecu_similarity"].mean()

In [None]:
# Show the mean of the cos_similarity column; the same expression supplies the 'COS' threshold.
df["cos_similarity"].mean()

In [None]:
# Show the mean of the jacc_similarity column; the same expression supplies the 'JAC' threshold.
df["jacc_similarity"].mean()

In [None]:
# Show the mean of the bleu_sim_1 column; the same expression supplies the 'BLEU' threshold.
df["bleu_sim_1"].mean()

In [None]:
# Parameters
all_datasets = []

# Threshold per similarity metric: the mean of that metric's column over the
# whole dataset. Keys are upper-case because the lookup upper-cases the name.
SIM_COFFICIENTS_THRESHOLDS = {
    threshold_key: df[similarity_column].mean()
    for threshold_key, similarity_column in (
        ('ECU', "ecu_similarity"),
        ('COS', "cos_similarity"),
        ('JAC', "jacc_similarity"),
        ('BLEU', "bleu_sim_1"),
    )
}

# Labels accepted by check_label (compared case-insensitively).
LABEL_TO_AUGMENT = ['positive', 'negative']

# Columns of `df` holding the original text and its label.
DATA_COLUMN = "text"
LABEL_COLUMN = "label"

In [None]:
# Display the per-metric thresholds computed from the column means.
SIM_COFFICIENTS_THRESHOLDS

### Augmentation (All-Text)

In [None]:
# Build one augmented dataset per similarity metric from the `all_text` column.
# Every original sample is kept; the generated sample of a row is added to a
# metric's dataset only when check_similarity_cofficient accepts its value.
cntr = 1  # not used in this cell; kept in case another cell reads it

# Threshold name passed to check_similarity_cofficient -> column with that metric.
metric_columns = {
    'ecu': 'ecu_similarity',
    'cos': 'cos_similarity',
    'jac': 'jacc_similarity',
    'bleu': 'bleu_sim_1',
}

# Rows are gathered in plain lists and converted to DataFrames once at the end.
# Calling pd.concat inside the loop copies the whole growing frame on every
# iteration, which makes the run time quadratic in the number of rows.
collected_rows = {metric: [] for metric in metric_columns}

print('All text augmentation is started... ')
for index, row in df.iterrows():
    label = row[LABEL_COLUMN]
    original_row = {'text': row[DATA_COLUMN], 'label': label}
    augmented_row = {'text': row['all_text'], 'label': label}

    for metric, column in metric_columns.items():
        # The original sample always comes first ...
        collected_rows[metric].append(original_row)
        # ... followed by the generated sample when it passes the check.
        if check_similarity_cofficient(row[column], label, metric):
            collected_rows[metric].append(augmented_row)

# Passing the columns explicitly keeps the text/label layout even for empty input.
output_columns = ['text', 'label']
EcuDF = pd.DataFrame(collected_rows['ecu'], columns=output_columns)
CosDF = pd.DataFrame(collected_rows['cos'], columns=output_columns)
JacDF = pd.DataFrame(collected_rows['jac'], columns=output_columns)
BleDF = pd.DataFrame(collected_rows['bleu'], columns=output_columns)

print('All text augmentation is finished ... ')


In [None]:
# Export dataset
# One workbook per similarity metric, written to the "All" output folder.
all_text_output_dir = "data/data_baru/Augmented-Dataset/All/"
for metric_tag, augmented_frame in (
    ("ECU", EcuDF),
    ("COS", CosDF),
    ("JAC", JacDF),
    ("BLE", BleDF),
):
    workbook_path = (
        all_text_output_dir + datasetname
        + "-Augmented-" + metric_tag + "-ALL-Text-Final.xlsx"
    )
    augmented_frame.to_excel(workbook_path, index=False)

In [None]:
# df.to_excel( "data/data_baru/Augmented-Dataset/xls/ArSarcasm-Unbalanced-aragpt2-base.xlsx", index=False)

### Augmentation (New-Text)

In [None]:
# Build one augmented dataset per similarity metric for the "New-Text" export.
# Every original sample is kept; the generated sample of a row is added to a
# metric's dataset only when check_similarity_cofficient accepts its value.
#
# NOTE(review): the generated sample is read from `all_text`, exactly as in the
# All-Text section, so both sections produce the same frames. This presumably
# should read a `new_text` column, but the current column list does not define
# one - confirm the intended source before relying on the New-Text files.
cntr = 1  # not used in this cell; kept in case another cell reads it

# Threshold name passed to check_similarity_cofficient -> column with that metric.
metric_columns = {
    'ecu': 'ecu_similarity',
    'cos': 'cos_similarity',
    'jac': 'jacc_similarity',
    'bleu': 'bleu_sim_1',
}

# Rows are gathered in plain lists and converted to DataFrames once at the end.
# Calling pd.concat inside the loop copies the whole growing frame on every
# iteration, which makes the run time quadratic in the number of rows.
collected_rows = {metric: [] for metric in metric_columns}

print('new text augmentation is started... ')
for index, row in df.iterrows():
    label = row[LABEL_COLUMN]
    original_row = {'text': row[DATA_COLUMN], 'label': label}
    augmented_row = {'text': row['all_text'], 'label': label}

    for metric, column in metric_columns.items():
        # The original sample always comes first ...
        collected_rows[metric].append(original_row)
        # ... followed by the generated sample when it passes the check.
        if check_similarity_cofficient(row[column], label, metric):
            collected_rows[metric].append(augmented_row)

# Passing the columns explicitly keeps the text/label layout even for empty input.
output_columns = ['text', 'label']
EcuDF = pd.DataFrame(collected_rows['ecu'], columns=output_columns)
CosDF = pd.DataFrame(collected_rows['cos'], columns=output_columns)
JacDF = pd.DataFrame(collected_rows['jac'], columns=output_columns)
BleDF = pd.DataFrame(collected_rows['bleu'], columns=output_columns)

print('new text augmentation is finished ... ')


In [None]:
# Export dataset
# One workbook per similarity metric, written to the "New" output folder.
new_text_output_dir = "data/data_baru/Augmented-Dataset/New/"
for metric_tag, augmented_frame in (
    ("ECU", EcuDF),
    ("COS", CosDF),
    ("JAC", JacDF),
    ("BLE", BleDF),
):
    workbook_path = (
        new_text_output_dir + datasetname
        + "-Augmented-" + metric_tag + "-New-Text-Final.xlsx"
    )
    augmented_frame.to_excel(workbook_path, index=False)