In [1]:
import os
# Set before the remaining imports run. None of the visible cells read
# MODEL_DIR directly — presumably an imported library or a later step does;
# TODO confirm it is still needed.
os.environ["MODEL_DIR"] = '../model'
import re
import json
from tqdm import tqdm
from itertools import combinations, permutations
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
# Shared spaCy pipeline: make_neg reads token text and coarse POS tags from it.
# The "ner" and "parser" components are disabled at load time.
nlp = spacy.load("en_core_web_md", disable=["ner", "parser"])

In [2]:
# Load the training captions (apparently already lemmatised, judging by the
# file name and the preview); the first CSV column is used as the index.
train_df = pd.read_csv("xxx/audio_Caps/train_lemma.csv", index_col=0)
train_df.head()

Unnamed: 0_level_0,youtube_id,start_time,caption
audiocap_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
91139,r1nicOVtvkQ,130,a woman talk nearby as water pour
58146,UDGBjjwyaqE,20,multiple clanging and clank sound
11542,3eJ9RynJzP8,80,"the wind be blow , insect be singe , and rustl..."
11543,3eK62q7SnVU,390,the wind be blow and rustling occur
11540,3eGXNIadwGk,30,person be whistle


In [3]:
# Load the validation captions (apparently already lemmatised, judging by the
# file name and the preview); the first CSV column is used as the index.
val_df = pd.read_csv("xxx/audio_Caps/val_lemma.csv", index_col=0)
val_df.head()

Unnamed: 0_level_0,youtube_id,start_time,caption
audiocap_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
97151,vfY_TJq7n_U,130,"rustling occur , duck quack and water splash ,..."
108945,tdWhHV3X25Q,60,an audience give applause as a man yell and a ...
107898,tw76HGONaKg,570,a man speak over intermittent keyboard tap
107893,y2bVZ7rz-5M,280,motor noise be follow by a horn honk and a sir...
107892,ti66RjZWTp0,20,a male speak as metal click and a gun fire once


In [4]:
# Load the test captions (apparently already lemmatised, judging by the
# file name and the preview); the first CSV column is used as the index.
test_df = pd.read_csv("xxx/audio_Caps/test_lemma.csv", index_col=0)
test_df.head()

Unnamed: 0_level_0,youtube_id,start_time,caption
audiocap_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
103549,7fmOlUlwoNg,20,constant rattle noise and sharp vibration
103548,6BJ455B1aAs,0,a rocket fly by follow by a loud explosion and...
103541,GOD8Bt5LfDE,100,hum and vibrate with a man and child speak and...
103540,YQSuFyFm3Lc,230,a train run on a railroad track follow by a ve...
103542,VjSEIRnLAh8,30,"food be fry , and a woman talk"


In [5]:
# Dangling fragments that make_neg may append to a caption ("add_tail").
useless_tails = ["in the", "of a", "and", "and a", "and a series of", "follow by", "as", "with", "a", "by a", "on the", "and then"]
# Connectives that separate the sound events inside a caption.
conjs = ["and", "follow by", "while", "before", "after", "as", "with"]
# Regex used to split a caption into events. A connective only matches as a
# whole word (so "as" does not split "grass" and "and" does not split "band"),
# and a directly preceding ", " is consumed together with the connective.
conj_pattern = r"(?:, )?\b(?:" + "|".join(re.escape(x) for x in conjs) + r")\b"
# Corruption kinds, in the order of the binary label vector built by make_neg.
neg_types = ["add_tail", "repeat_event", "repeat_adv", "remove_conj", "remove_verb"]

def aug_event(text):
    """Return a lightly perturbed copy of a single caption event.

    One option is drawn uniformly with ``np.random.choice`` from a list of
    four entries: "same" (listed twice, so the text is kept unchanged half of
    the time), an article toggle and a "be" toggle.

    * article toggle: drop every standalone "a" token if the event has one,
      otherwise prefix the event with "a ".
    * "be" toggle: drop every standalone "be" token if the event has one,
      otherwise insert "be" before the last word.

    Presence checks and removals work on whitespace-separated tokens, so
    letters inside other words ("bark", "pizza", "tube") are never touched.

    Args:
        text: the event text, words separated by whitespace.

    Returns:
        The perturbed event as a ``str``. Empty or whitespace-only input is
        returned unchanged, as is any event that a removal would empty.
    """
    tokens = text.split()
    if not tokens:
        # Nothing to perturb; avoids producing a bare "a" / "be".
        return text

    aug_types = ["same", "same"]
    aug_types.append("remove_a" if "a" in tokens else "add_a")
    aug_types.append("remove_be" if "be" in tokens else "add_be")

    aug_type0 = np.random.choice(aug_types)
    if aug_type0 == "remove_a":
        # "or text": never hand back an empty event (e.g. when text == "a").
        return " ".join(tok for tok in tokens if tok != "a") or text
    if aug_type0 == "add_a":
        return "a " + text
    if aug_type0 == "remove_be":
        return " ".join(tok for tok in tokens if tok != "be") or text
    if aug_type0 == "add_be":
        return " ".join(tokens[:-1] + ["be"] + tokens[-1:])
    return text

def make_neg(anchor):
    """Corrupt a caption with randomly chosen error types.

    With probability 0.9 two corruption rounds are attempted, otherwise one.
    Each round walks through the checks below in order and applies the first
    one that fires (a round may also apply nothing):

    1. add_tail     (p = 0.2):  append a fragment from ``useless_tails``.
    2. repeat_event (p = 0.4):  append "<conj> <event>", where the event is a
       randomly picked, ``aug_event``-perturbed piece of the caption and the
       connective comes from ``conjs`` without its last entry.
    3. repeat_adv   (p = 0.1, needs an ADV token): append the last adverb.
    4. remove_conj  (p = 0.1, needs a *CONJ token): delete the last
       conjunction token.
    5. remove_verb  (p = 0.15, needs a VERB token): delete the last verb.

    Appended fragments are kept separately from the caption tokens, so a
    removal in a later round never discards a fragment added earlier and the
    returned labels always match the returned text.

    Args:
        anchor: caption text to corrupt.

    Returns:
        (neg, neg_labels): the corrupted caption and a list of 0/1 flags, one
        per entry of ``neg_types``, marking which corruptions were applied.
        When a token was removed, the caption body is the spaCy tokens joined
        by single spaces; otherwise it is ``anchor`` unchanged.
    """
    neg_labels = [0 for _ in neg_types]
    max_neg_nums = 2 if np.random.rand() < 0.9 else 1

    doc = nlp(anchor)
    words = [tok.text for tok in doc]
    pos = [tok.pos_ for tok in doc]

    appended = []        # fragments to put after the caption body, in order
    body_edited = False  # True once a token has been removed from `words`

    for _ in range(max_neg_nums):
        # add useless tails
        if np.random.rand() < 0.2:
            appended.append(str(np.random.choice(useless_tails)))
            neg_labels[0] = 1
            continue

        # add repetition
        if np.random.rand() < 0.4:
            events = [ev.strip() for ev in re.split(conj_pattern, anchor)]
            events = [ev for ev in events if ev]  # splitting can leave empty pieces
            if events:
                event0 = aug_event(str(np.random.choice(events)))
                conj0 = np.random.choice(conjs[:-1])
                appended.append(f"{conj0} {event0}")
                neg_labels[1] = 1
                continue

        # repeat the last adverb at the end
        tmp = np.random.rand()
        adverbs = [wd for wd, tag in zip(words, pos) if tag == "ADV"]
        if adverbs and tmp < 0.1:
            appended.append(adverbs[-1])
            neg_labels[2] = 1
            continue

        # remove the last conjunction (matches both CCONJ and SCONJ)
        tmp = np.random.rand()
        conj_ids = [i for i, tag in enumerate(pos) if tag.endswith("CONJ")]
        if conj_ids and tmp < 0.1:
            rm_id = conj_ids[-1]
            words = words[:rm_id] + words[rm_id + 1:]
            pos = pos[:rm_id] + pos[rm_id + 1:]
            body_edited = True
            neg_labels[3] = 1
            continue

        # remove the last verb
        tmp = np.random.rand()
        verb_ids = [i for i, tag in enumerate(pos) if tag == "VERB"]
        if verb_ids and tmp < 0.15:
            rm_id = verb_ids[-1]
            words = words[:rm_id] + words[rm_id + 1:]
            pos = pos[:rm_id] + pos[rm_id + 1:]
            body_edited = True
            neg_labels[4] = 1
            continue

    body = " ".join(words) if body_edited else anchor
    ret = " ".join([body] + appended)
    return ret, neg_labels


In [6]:
# Spot-check make_neg on one caption. make_neg draws from np.random, so the
# result differs from run to run.
anchor = "a baby vocalize and laugh at a woman speak"
make_neg(anchor)

('a baby vocalize and laugh at a woman speak before a baby vocalize',
 [0, 1, 0, 0, 0])

In [7]:
# Same caption again: a second call to show that make_neg is random and can
# return a different corruption for the same input.
anchor = "a baby vocalize and laugh at a woman speak"
make_neg(anchor)

('a baby vocalize and laugh at a woman', [0, 0, 0, 0, 1])

In [8]:
# Build the synthetic training rows: one make_neg output per source caption,
# plus the untouched caption (all labels 0) whenever a corruption was applied.
np.random.seed(0)  # fixed seed: re-running this cell regenerates the same rows
train_rows = []
# Only the caption column is needed, so iterate it directly instead of
# materialising every row with iterrows(); total= gives tqdm a real progress bar.
for anchor in tqdm(train_df["caption"], total=len(train_df)):
    neg, neg_labels = make_neg(anchor)
    train_rows.append([neg] + neg_labels)
    if sum(neg_labels) != 0:  # also preserve the original one
        train_rows.append([anchor] + [0 for _ in neg_types])
synthetic_train = pd.DataFrame(train_rows)

49838it [02:02, 405.39it/s]


In [9]:
# Name the columns (caption first, then one column per corruption type) and
# derive "error": the row-wise maximum of every column after the caption,
# i.e. 1 when at least one corruption label is set.
synthetic_train.columns = ["caption", *neg_types]
synthetic_train["error"] = synthetic_train.iloc[:, 1:].max(axis=1)
synthetic_train.head()

Unnamed: 0,caption,add_tail,repeat_event,repeat_adv,remove_conj,remove_verb,error
0,a woman talk nearby as water pour and then aft...,1,1,0,0,0,1
1,a woman talk nearby as water pour,0,0,0,0,0,0
2,multiple clanging and clank sound and clank so...,0,1,0,0,0,1
3,multiple clanging and clank sound,0,0,0,0,0,0
4,"the wind be blow , insect be singe , and rustling",0,0,0,0,1,1


In [10]:
# Summary statistics of the numeric label columns; since the labels are 0/1,
# each "mean" is the share of rows carrying that label.
synthetic_train.describe()

Unnamed: 0,add_tail,repeat_event,repeat_adv,remove_conj,remove_verb,error
count,90692.0,90692.0,90692.0,90692.0,90692.0,90692.0
mean,0.18823,0.285141,0.014643,0.027169,0.044932,0.45047
std,0.390898,0.451484,0.12012,0.162576,0.207157,0.497543
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Build the synthetic validation set: one make_neg output per source caption,
# plus the untouched caption (all labels 0) whenever a corruption was applied.
np.random.seed(1)  # fixed seed: re-running this cell regenerates the same rows
val_rows = []
# Only the caption column is needed, so iterate it directly instead of
# materialising every row with iterrows(); total= gives tqdm a real progress bar.
for anchor in tqdm(val_df["caption"], total=len(val_df)):
    neg, neg_labels = make_neg(anchor)
    val_rows.append([neg] + neg_labels)
    if sum(neg_labels) != 0:  # also preserve the original one
        val_rows.append([anchor] + [0 for _ in neg_types])
synthetic_val = pd.DataFrame(val_rows, columns=["caption"] + neg_types)
# "error" is 1 when any corruption label is set. Selecting the label columns
# by name keeps the result the same if this line is ever re-run on the frame.
synthetic_val["error"] = synthetic_val[neg_types].max(axis=1)
synthetic_val.head()

2475it [00:06, 407.91it/s]


Unnamed: 0,caption,add_tail,repeat_event,repeat_adv,remove_conj,remove_verb,error
0,"rustling occur , duck quack and water splash ,...",0,1,0,0,0,1
1,"rustling occur , duck quack and water splash ,...",0,0,0,0,0,0
2,an audience give applause as a man yell and a ...,1,0,0,0,0,1
3,an audience give applause as a man yell and a ...,0,0,0,0,0,0
4,a man speak over intermittent keyboard tap by ...,1,1,0,0,0,1


In [12]:
# Build the synthetic test set: one make_neg output per source caption,
# plus the untouched caption (all labels 0) whenever a corruption was applied.
np.random.seed(2)  # fixed seed: re-running this cell regenerates the same rows
test_rows = []
# Only the caption column is needed, so iterate it directly instead of
# materialising every row with iterrows(); total= gives tqdm a real progress bar.
for anchor in tqdm(test_df["caption"], total=len(test_df)):
    neg, neg_labels = make_neg(anchor)
    test_rows.append([neg] + neg_labels)
    if sum(neg_labels) != 0:  # also preserve the original one
        test_rows.append([anchor] + [0 for _ in neg_types])
synthetic_test = pd.DataFrame(test_rows, columns=["caption"] + neg_types)
# "error" is 1 when any corruption label is set. Selecting the label columns
# by name keeps the result the same if this line is ever re-run on the frame.
synthetic_test["error"] = synthetic_test[neg_types].max(axis=1)
synthetic_test.head()

4875it [00:12, 391.83it/s]


Unnamed: 0,caption,add_tail,repeat_event,repeat_adv,remove_conj,remove_verb,error
0,constant rattle noise and sharp vibration,0,0,0,0,0,0
1,a rocket fly by follow by a loud explosion and...,0,1,0,0,0,1
2,a rocket fly by follow by a loud explosion and...,0,0,0,0,0,0
3,hum and vibrate with a man and child speak and...,0,1,0,0,0,1
4,hum and vibrate with a man and child speak and...,0,0,0,0,0,0


In [13]:
# Write the three synthetic splits as UTF-8 CSV files (row index included).
# exist_ok=True keeps the cell re-runnable: without it os.makedirs raises
# FileExistsError as soon as the directory exists from a previous run.
os.makedirs("./dataset_audiocaps", exist_ok=True)
synthetic_train.to_csv("./dataset_audiocaps/synthetic_train.csv", encoding="utf-8")
synthetic_val.to_csv("./dataset_audiocaps/synthetic_val.csv", encoding="utf-8")
synthetic_test.to_csv("./dataset_audiocaps/synthetic_test.csv", encoding="utf-8")