In [1]:
import os
os.environ["MODEL_DIR"] = '../model'
import re
import json
from tqdm import tqdm
from itertools import combinations, permutations
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
nlp = spacy.load("en_core_web_md", disable=["ner", "parser"])

In [2]:
train_df = pd.read_csv("xxx/clotho/dev.csv", index_col=0)
train_df.head()

Unnamed: 0_level_0,caption_1,caption_2,caption_3,caption_4,caption_5
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Distorted AM Radio noise.wav,A muddled noise of broken channel of the TV,A television blares the rhythm of a static TV.,Loud television static dips in and out of focus,The loud buzz of static constantly changes pit...,heavy static and the beginnings of a signal on...
Paper_Parchment_Rustling.wav,A person is turning a map over and over.,A person is very carefully rapping a gift for ...,A person is very carefully wrapping a gift for...,"He sighed as he turned the pages of the book, ...","papers are being turned, stopped, then turned ..."
03 Whales Slowing Down.wav,Several barnyard animals mooing in a barn whil...,"The vocalization of several whales, along with...","Underwater, large numbers of shrimp clicking a...",Whales sing to one another over the flowing wa...,wales sing to one another with water flowing i...
Rope tied to boat in port.wav,An office chair is squeaking as someone bends ...,Popping and squeaking gradually tapers off to ...,Someone is opening a creaky door slowly while ...,Squeaking and popping followed by gradual popp...,an office chair is squeaking as someone leans ...
carpenter bee.wav,A flying bee is buzzing loudly around an objec...,An annoying fly is buzzing loudly and consiste...,An insect buzzing in the foreground as birds c...,"An insect trapped in a spider web struggles, b...","Outdoors, insect trapped in a spider web and t..."


In [3]:
val_df = pd.read_csv("xxx/clotho/eval.csv", index_col=0)
val_df.head()

Unnamed: 0_level_0,caption_1,caption_2,caption_3,caption_4,caption_5
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Santa Motor.wav,A machine whines and squeals while rhythmicall...,A person is using electric clippers to trim bu...,Someone is trimming the bushes with electric c...,The whirring of a pump fills a bladder that tu...,"While rhythmically punching or stamping, a mac..."
Radio Garble.wav,A radio dispatcher and an officer are communic...,Communication with a walkie-talkie with a lot ...,A discussion with a walkie-talkie with a consi...,People talking through a walkie-talkie with ba...,The walkie-talkie the people are talking throu...
Radio Fuzz for Old Radio Broadcast FF233.wav,A radio tuner has been positioned in between r...,A transistor radio is being played on a statio...,A transistor radio is on a station that is not...,Radio static makes a constant hum with a high ...,Static coming from a radio that is in between ...
toy rattle 2.wav,A person winding up a device and then jingling...,A socket wrench that is tightening a bolt.,An object is tightened and then metallic objec...,"Before keys are jangled on their chain, someon...",Someone is spinning around a lock with a dial.
Blade Big.wav,A person is pulling silverware out of the dish...,A person removes a knife from its holder then ...,A person taking a knife out of its holder and ...,Metal sliding together such as swords or knives.,The metallic clang of swords and knives striki...


In [4]:
test_df = val_df.copy()

In [5]:
useless_tails = ["in the", "of a", "and", "and a", "and a series of", "follow by", "as", "with", "a", "by a", "on the", "and then"]
conjs = ["and", "follow by", "while", "before", "after", "as", "with"]
conj_pattern = "|".join(conjs+[", "+x for x in conjs])
neg_types = ["add_tail", "repeat_event", "repeat_adv", "remove_conj", "remove_verb"]

def aug_event(text):
    aug_types = ["same", "same"]
    if "a" in text:
        aug_types.append("remove_a")
    else:
        aug_types.append("add a")

    if "be" in text:
        aug_types.append("remove_be")
    else:
        aug_types.append("add_be")

    aug_type0 = np.random.choice(aug_types)
    if aug_type0 == "remove_a":
        return text.replace("a ", "")
    elif aug_type0 == "add_a":
        return "a " + text
    elif aug_type0 == "remove_be":
        return text.replace("be ", "")
    elif aug_type0 == "add_be":
        words = text.split()
        return " ".join(words[:-1] + ["a"] + words[-1:])
    else:
        return text

def make_neg(anchor):
    """
    input: 
    anchor: text to make neg
    
    output：
    neg
    binary indicator of the neg type used
    """
    neg_labels = [0 for _ in neg_types]
    max_neg_nums = 2 if np.random.rand() < 0.9 else 1
    
    words = []
    pos = []
    for wd in nlp(anchor):
        words.append(wd.text)
        pos.append(wd.pos_)

    ret = anchor[:]
    for n in range(max_neg_nums):
        # add useless tails
        tmp = np.random.rand()
        if tmp < 0.2:
            to_add = np.random.choice(useless_tails)
            ret = f"{ret} {to_add}"
            neg_labels[0] = 1
            continue

        # add repetition
        tmp = np.random.rand()
        if tmp < 0.4:
            events = re.split(conj_pattern, anchor)
            if len(events) > 0:
                event0 = np.random.choice(events).strip()
                event0 = aug_event(event0)
                conj0 = np.random.choice(conjs[:-1])
                ret = f"{ret} {conj0} {event0}"
                neg_labels[1] = 1
                continue
        
        tmp = np.random.rand()
        if any(pos0 == "ADV" for pos0 in pos) and tmp < 0.1:
            wd0 = None
            for pos0, x in zip(words, pos):
                if pos0 == "ADV":
                    wd0 = x
            # repeat an adv at the end
            ret = f"{ret} {wd0}"
            words.append(wd0)
            pos.append("ADV")
            neg_labels[2] = 1
            continue

        tmp = np.random.rand()
        if any(pos0.endswith("CONJ") for pos0 in pos) and tmp < 0.1:
            rm_id = None
            for i, pos0 in enumerate(pos):
                if pos0.endswith("CONJ"):
                    rm_id = i
            words = words[:i] + words[i+1:]
            pos = pos[:i] + pos[i+1:]
            ret = " ".join(words)
            neg_labels[3] = 1
            continue

        tmp = np.random.rand()
        if any(pos0 == "VERB" for pos0 in pos) and tmp < 0.15:
            rm_id = None
            for i, pos0 in enumerate(pos):
                if pos0 == "VERB":
                    rm_id = i
            words = words[:i] + words[i+1:]
            pos = pos[:i] + pos[i+1:]
            ret = " ".join(words)
            neg_labels[4] = 1
            continue
    
    return ret, neg_labels


In [6]:
anchor = "a baby vocalize and laugh at a woman speak"
make_neg(anchor)

('a baby vocalize and laugh at a woman', [0, 0, 0, 1, 0])

In [7]:
anchor = "a baby vocalize and laugh at a woman speak"
make_neg(anchor)

('a baby vocalize and laugh at a woman speak before baby vocalize',
 [0, 1, 0, 0, 0])

In [11]:
synthetic_train = []
for rid, row in tqdm(train_df.iterrows()):
    for anchor in row.values:
        neg, neg_labels = make_neg(anchor)
        synthetic_train.append([neg] + neg_labels)
        if sum(neg_labels) != 0:  # also preserve the original one
            synthetic_train.append([anchor] + [0 for _ in neg_types])
synthetic_train = pd.DataFrame(synthetic_train)

2893it [00:32, 89.10it/s]


In [17]:
synthetic_train.columns = ["caption"] + neg_types
synthetic_train["error"] = synthetic_train.iloc[:, 1:].max(1)
synthetic_train.head()

Unnamed: 0,caption,add_tail,repeat_event,repeat_adv,remove_conj,remove_verb,error
0,A muddled noise of broken channel of the TV be...,0,1,0,0,0,1
1,A muddled noise of broken channel of the TV,0,0,0,0,0,0
2,A television blares the rhythm of a static TV....,1,1,0,0,0,1
3,A television blares the rhythm of a static TV.,0,0,0,0,0,0
4,Loud television static dips in and out of focu...,0,1,0,0,0,1


In [20]:
synthetic_train.describe()

Unnamed: 0,add_tail,repeat_event,repeat_adv,remove_conj,remove_verb,error
count,26592.0,26592.0,26592.0,26592.0,26592.0,26592.0
mean,0.186297,0.280724,0.020758,0.030874,0.058965,0.456039
std,0.389353,0.449361,0.142576,0.172979,0.235564,0.498073
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
synthetic_val = []
for rid, row in tqdm(val_df.iterrows()):
    for anchor in row.values:
        neg, neg_labels = make_neg(anchor)
        synthetic_val.append([neg] + neg_labels)
        if sum(neg_labels) != 0:  # also preserve the original one
            synthetic_val.append([anchor] + [0 for _ in neg_types])
synthetic_val = pd.DataFrame(synthetic_val)
synthetic_val.columns = ["caption"] + neg_types
synthetic_val["error"] = synthetic_val.iloc[:, 1:].max(1)
synthetic_val.head()

1045it [00:12, 82.25it/s]


Unnamed: 0,caption,add_tail,repeat_event,repeat_adv,remove_conj,remove_verb,error
0,A machine whines and squeals while rhythmicall...,1,1,0,0,0,1
1,A machine whines and squeals while rhythmicall...,0,0,0,0,0,0
2,A person is using electric clippers to trim bu...,0,1,0,0,0,1
3,A person is using electric clippers to trim bu...,0,0,0,0,0,0
4,Someone is trimming the bushes with electric c...,0,1,0,0,0,1


In [15]:
synthetic_test = synthetic_val.copy()

In [19]:
os.makedirs("./dataset_clotho", exist_ok=True)
synthetic_train.to_csv("./dataset_clotho/synthetic_train.csv", encoding="utf-8")
synthetic_val.to_csv("./dataset_clotho/synthetic_val.csv", encoding="utf-8")
synthetic_test.to_csv("./dataset_clotho/synthetic_test.csv", encoding="utf-8")

## combine 2 dataset

In [22]:
synthetic_train_audiocaps = pd.read_csv("./dataset_audiocaps/synthetic_train.csv", index_col=0)
synthetic_val_audiocaps = pd.read_csv("./dataset_audiocaps/synthetic_val.csv", index_col=0)
synthetic_test_audiocaps = pd.read_csv("./dataset_audiocaps/synthetic_test.csv", index_col=0)
synthetic_train_combine = pd.concat([synthetic_train, synthetic_train_audiocaps])
synthetic_val_combine = pd.concat([synthetic_val, synthetic_val_audiocaps])
synthetic_test_combine = pd.concat([synthetic_test, synthetic_test_audiocaps])
os.makedirs("./dataset", exist_ok=True)
synthetic_train_combine.to_csv("./dataset/synthetic_train.csv", encoding="utf-8")
synthetic_val_combine.to_csv("./dataset/synthetic_val.csv", encoding="utf-8")
synthetic_test_combine.to_csv("./dataset/synthetic_test.csv", encoding="utf-8")