In [2]:
from tqdm import tqdm, trange
import numpy as np
import copy

# Root directory of the Yelp dataset; the file paths used in the cells below
# are all built by appending to this string.
# NOTE(review): absolute, machine-specific path - adjust before running on
# another machine.
data_dir = "/home/jack/Desktop/NN/clean/datasets/yelp"

In [3]:
def process_file(input_file, output_file):
    """
    Rewrite a Delete & Generate data file in the Delete, Retrieve and
    Generate training format, keeping every attribute word.

    Each input line is split at "<START>": the part before it (with the
    <POS>/<NEG>/<CON_START> tags removed) is the content, the part after it
    (with <END> removed) is the full sentence. Sentence tokens that do not
    occur in the content are written out as the attribute words.

    input_file: string : Path of the input file
    output_file: string : Path of the output file

    Raises IndexError if a line does not contain "<START>".
    """
    with open(input_file) as source:
        records = source.read().splitlines()

    with open(output_file, "w") as sink:
        for record in tqdm(records):
            pieces = record.split("<START>")

            # Strip the style and content tags, one after the other.
            content = pieces[0]
            for tag in ("<POS>", "<NEG>", "<CON_START>"):
                content = content.replace(tag, "")
            sentence = pieces[1].replace("<END>", "")

            # Attribute words: sentence tokens absent from the content,
            # in sentence order (duplicates are kept).
            content_tokens = content.split()
            attributes = [tok for tok in sentence.split() if tok not in content_tokens]

            sink.write("".join([
                "<ATTR_WORDS> ", " ".join(attributes),
                " <CON_START> ", content.strip(),
                " <START> ", sentence.strip(),
                " <END>", "\n",
            ]))
    

In [21]:
def _drop_special_attributes(att_words, special_characters, min_keep=2):
    """Return a copy of att_words without words containing special characters.

    Words are examined in order and removal stops as soon as only `min_keep`
    words are left, so the result never shrinks below that size because of
    this filter.
    """
    kept = list(att_words)
    position = 0
    while position < len(kept) and len(kept) > min_keep:
        if any(c in special_characters for c in kept[position]):
            # Do not advance: the next word has moved into this position.
            del kept[position]
        else:
            position += 1
    return kept


def process_file_v1(input_file, output_file, test = False):
    """
    The function process the data files for Delete & Generate and convert
    it for the Delete, Retrieve and Generate training by separating the content
    and attributes.

    For every input line (split at "<START>" into content and sentence):

    1. Sentence tokens that are not part of the content become attribute
       words, except for tokens filtered out as punctuation-like (see the
       inline comments).
    2. set_replace_tokens() is called with the content tokens so that it can
       insert "<REPLACE>" placeholders into them.
    3. If more than two attributes were found, attributes containing special
       characters are dropped (without going below two attributes), and if
       more than two still remain only the first 70% of them are kept. This
       makes the generation more realistic instead of just filling the
       blanks. The selection is a deterministic prefix, not a random sample.

    Input_file: string : Path of the input file
    Output_file: string : Path of the output file
    test: bool : If True, the output line stops after "<START> " (the
        reference sentence and "<END>" are not written).

    Raises IndexError if a line does not contain "<START>".
    """
    special_characters =  "!@#$%^&*()-+?_=<>/\'\'"

    def has_special(word):
        return any(c in special_characters for c in word)

    with open(input_file) as fp:
        data = fp.read().splitlines()
    with open (output_file,"w") as out_fp:
        for x in tqdm(data):
            temp = x.split("<START>")
            con = temp[0].replace("<POS>","").replace("<NEG>","").replace("<CON_START>","")
            sen = temp[1].replace("<END>","")
            lt1 = con.split()
            lt2 = sen.split()

            # Don't put special char in attribute words. Reduce att -> reduce confusion
            att_words = []
            index_att = []
            for index, word in enumerate(lt2):
                # Token is itself a run of the special-character string.
                if word in special_characters:
                    continue
                # Token belongs to the content, so it is not an attribute.
                if word in lt1:
                    continue
                # Very short token that contains a special character.
                if has_special(word) and len(word) < 3:
                    continue
                # Hyphenated token whose first two parts are both content tokens.
                if "-" in word:
                    parts = word.split("-")
                    if parts[0] in lt1 and parts[1] in lt1:
                        continue
                # Short token split at an apostrophe into two content tokens.
                if has_special(word) and len(word) < 4:
                    parts = word.split("\'")
                    if len(parts) > 1 and parts[0] in lt1 and parts[1] in lt1:
                        continue
                # Attribute found: remember the word and its sentence index.
                att_words.append(word)
                index_att.append(index)

            # Mutates lt1 in place (inserts "<REPLACE>" placeholders).
            set_replace_tokens(lt1, lt2, att_words, index_att)

            # Remove special chars first. This is done on a rebuilt list:
            # deleting from the list while enumerating it skipped the word
            # that followed each deleted one.
            if len(att_words) > 2:
                att_words = _drop_special_attributes(att_words, special_characters)

            # With more than two attributes keep only the first 70% of them;
            # with two or fewer keep all of them.
            if len(att_words) > 2:
                att_words = att_words[:int(0.7 * len(att_words))]
            att_str = " ".join(att_words)

            out_str = "<ATTR_WORDS> " + att_str + " <CON_START> " +  " ".join(lt1).strip() + " <START> "
            if not test:
                out_str += sen.strip() + " <END>"
            out_fp.write(out_str + "\n")

def set_replace_tokens(content_list, full_list, att_list, index_att_list, second_try = False):
    """
    Insert a "<REPLACE>" placeholder into content_list for attribute positions.

    content_list is modified in place and nothing is returned. Each entry of
    index_att_list is treated as a position in full_list. For every such
    position the strategies below are tried in order; the first one that
    applies inserts one placeholder and the remaining ones are skipped:

    1. The position directly follows the previously handled position:
       insert at that same index of content_list.
    2. The position equals street_index (which starts at 0): insert at that
       index.
    3. The token left of the position in full_list occurs exactly once in
       content_list: insert right after it.
    4. The token right of the position occurs exactly once in content_list:
       insert right before it.
    5. The left/right token equals the concatenation of exactly one pair of
       adjacent content tokens: insert after/before that pair.
    6. The token two positions to the left/right identifies exactly one
       slot in content_list: insert there.

    If no strategy applies nothing is inserted; debug information is printed
    only when neither direct neighbour was found in content_list at all.

    content_list: list of str : content tokens, mutated in place
    full_list: list of str : tokens of the full sentence
    att_list: list of str : attribute words, only used in the debug output
    index_att_list: list of int : positions of the attributes in full_list
    second_try: unused inside this function
    """
    # NOTE(review): "street" appears to mean a run of consecutive attribute
    # positions - naming is not explained anywhere in the visible code.
    street_index = 0
    last_replace_index = None
    insert_index = None
    for index, replace_index in enumerate(index_att_list):
        # Catch street 
        # Position directly follows the one handled in the previous iteration.
        if last_replace_index == replace_index - 1:
            content_list.insert(replace_index, "<REPLACE>")
            last_replace_index = replace_index
            continue

        last_replace_index = replace_index
        # Catch first street if starts with 0
        if replace_index == street_index:
            street_index += 1
            content_list.insert(replace_index, "<REPLACE>")
            continue

        left_matches = []
        right_matches = []
        left_word_query = None
        right_word_query = None
        
        # Occurrences in content_list of the sentence token just left of the attribute.
        if replace_index-1 >= 0:
            left_word_query = full_list[replace_index-1]
            left_matches = [(index,x) for index,x in enumerate(content_list) if x == left_word_query]

        # Occurrences in content_list of the sentence token just right of the attribute.
        if replace_index+1 < len(full_list):
            right_word_query = full_list[replace_index+1]
            right_matches = [(index,x) for index,x in enumerate(content_list) if x == right_word_query]

        # A unique left neighbour pins the slot: place the placeholder after it.
        if len(left_matches) == 1:
            insert_index = left_matches[0][0]+1
            content_list.insert(insert_index, "<REPLACE>")
            continue
        
        # A unique right neighbour pins the slot: place the placeholder before it.
        if len(right_matches) == 1:
            insert_index = right_matches[0][0]
            content_list.insert(insert_index, "<REPLACE>")            
            continue


        # Fallback: the neighbour may be split into two adjacent content
        # tokens; compare it against the concatenation of each adjacent pair.
        left_concat_matches = []
        right_concat_matches = []
        if left_word_query != None:
            left_concat_matches = [(index,x) for index,x in enumerate(content_list) if index-1 > -1 and content_list[index-1]+content_list[index] == left_word_query]
        if right_word_query != None:
            right_concat_matches = [(index,x) for index,x in enumerate(content_list) if index+1 < len(content_list) and content_list[index]+content_list[index+1] == right_word_query]
        
        if len(left_concat_matches) == 1: 
            insert_index = left_concat_matches[0][0]+1
            content_list.insert(insert_index, "<REPLACE>")
            continue

        if len(right_concat_matches) == 1: 
            insert_index = right_concat_matches[0][0]
            content_list.insert(insert_index, "<REPLACE>")
            continue


        # Fallback: use the sentence token two positions away and look for
        # content slots whose own neighbour equals that token.
        left_shifted_match = []
        right_shifted_matches = []
        if replace_index-2 >= 0:
            left_word_query_shift = full_list[replace_index-2]
            left_shifted_match = [(index,x) for index,x in enumerate(content_list) if index-1 > -1 and content_list[index-1] == left_word_query_shift]
        if replace_index+2 < len(full_list):
            right_word_query_shift = full_list[replace_index+2]
            right_shifted_matches = [(index,x) for index,x in enumerate(content_list) if index+1 < len(content_list) and content_list[index+1] == right_word_query_shift]

        if len(left_shifted_match) == 1: 
            insert_index = left_shifted_match[0][0]+1
            content_list.insert(insert_index, "<REPLACE>")
            continue

        if len(right_shifted_matches) == 1: 
            insert_index = right_shifted_matches[0][0]
            content_list.insert(insert_index, "<REPLACE>")
            continue

        # No placeholder was inserted for this attribute. Debug output is
        # printed only when both direct neighbours are absent from content_list;
        # ambiguous (multiple) matches fall through silently.
        if len(left_matches) == 0 and len(right_matches) == 0:
            print("NONE", index)
            print(att_list)
            print(index_att_list)
            print(content_list)
            print(full_list)

def next_is_street(index_att_list, current_index):
    """Tell whether the entry after current_index continues a consecutive run.

    Returns True when index_att_list has an entry at current_index + 1 and
    that entry is exactly one greater than the entry at current_index;
    otherwise returns False.
    """
    following = current_index + 1
    if following >= len(index_att_list):
        # There is no next entry to compare with.
        return False
    return bool(index_att_list[current_index] + 1 == index_att_list[following])

def get_street_length(index_att_list, current_index):
    """Count how many entries after current_index continue a consecutive run.

    Starting at current_index, each following entry that is exactly one
    greater than its predecessor adds one to the count; counting stops at
    the first gap or at the end of the list.
    """
    run_length = 0
    for position in range(current_index + 1, len(index_att_list)):
        if index_att_list[position - 1] + 1 != index_att_list[position]:
            break
        run_length += 1
    return run_length



In [22]:
# Test changes here first
# Runs process_file_v1 on the small test split (combined, _1 and _0 files) and
# writes the results into the delete_retrieve_edit_model folder.
# NOTE(review): sentiment_test.txt is (re)written by the "Merge files" cell that
# appears further down in this notebook; on a fresh kernel that cell has to run
# before this one. The same three calls are repeated in a later cell.
process_file_v1(data_dir+"/processed_files_with_bert_with_best_head/sentiment_test.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_test.txt")
process_file_v1(data_dir+"/processed_files_with_bert_with_best_head/sentiment_test_1.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_test_1.txt")
process_file_v1(data_dir+"/processed_files_with_bert_with_best_head/sentiment_test_0.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_test_0.txt")

100%|██████████| 1000/1000 [00:00<00:00, 51666.08it/s]
100%|██████████| 500/500 [00:00<00:00, 54584.90it/s]
100%|██████████| 500/500 [00:00<00:00, 48445.38it/s]


In [44]:
'''Merge files'''
# For each split (train, test, dev) the combined file is produced by writing
# the lines of the "_1" file followed by the lines of the "_0" file.
# The per-label input files must already exist; the combined output files are
# opened in write mode, i.e. created or overwritten.

def _concat_files(sources, destination):
    """Copy every line of each file in `sources`, in order, into `destination`."""
    with open(destination, 'w') as outfile:
        for fname in sources:
            with open(fname) as infile:
                outfile.writelines(infile)


filenames = [data_dir+"/processed_files_with_bert_with_best_head/sentiment_train_1.txt", data_dir+"/processed_files_with_bert_with_best_head/sentiment_train_0.txt"]
_concat_files(filenames, data_dir+"/processed_files_with_bert_with_best_head/sentiment_train.txt")

filenames = [data_dir+"/processed_files_with_bert_with_best_head/sentiment_test_1.txt", data_dir+"/processed_files_with_bert_with_best_head/sentiment_test_0.txt"]
_concat_files(filenames, data_dir+"/processed_files_with_bert_with_best_head/sentiment_test.txt")

filenames = [data_dir+"/processed_files_with_bert_with_best_head/sentiment_dev_1.txt", data_dir+"/processed_files_with_bert_with_best_head/sentiment_dev_0.txt"]
_concat_files(filenames, data_dir+"/processed_files_with_bert_with_best_head/sentiment_dev.txt")

In [14]:

# Write the "all attributes" variant (process_file) of the training split:
# combined file first, then the _1 and _0 files.
process_file(data_dir+"/processed_files_with_bert_with_best_head/sentiment_train.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_all_attrs.txt")
process_file(data_dir+"/processed_files_with_bert_with_best_head/sentiment_train_1.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_1_all_attrs.txt")
process_file(data_dir+"/processed_files_with_bert_with_best_head/sentiment_train_0.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_0_all_attrs.txt")

100%|██████████| 443259/443259 [00:01<00:00, 314534.16it/s]
100%|██████████| 266041/266041 [00:00<00:00, 344643.63it/s]
100%|██████████| 177218/177218 [00:00<00:00, 316493.76it/s]


In [12]:
# Write the "all attributes" variant (process_file) of the test split:
# combined file first, then the _1 and _0 files.
process_file(data_dir+"/processed_files_with_bert_with_best_head/sentiment_test.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_test_all_attrs.txt")
process_file(data_dir+"/processed_files_with_bert_with_best_head/sentiment_test_1.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_test_1_all_attrs.txt")
process_file(data_dir+"/processed_files_with_bert_with_best_head/sentiment_test_0.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_test_0_all_attrs.txt")

100%|██████████| 1000/1000 [00:00<00:00, 142784.82it/s]
100%|██████████| 500/500 [00:00<00:00, 265529.50it/s]
100%|██████████| 500/500 [00:00<00:00, 269660.79it/s]


In [None]:
# Write the "all attributes" variant (process_file) of the dev split.
# NOTE(review): the _0 input is written to the ..._dev_1_all_attrs.txt output and
# the _1 input to ..._dev_0_all_attrs.txt, whereas the train and test cells keep
# the labels aligned - confirm that this swap is intentional.
process_file(data_dir+"/processed_files_with_bert_with_best_head/sentiment_dev.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev_all_attrs.txt")
process_file(data_dir+"/processed_files_with_bert_with_best_head/sentiment_dev_0.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev_1_all_attrs.txt")
process_file(data_dir+"/processed_files_with_bert_with_best_head/sentiment_dev_1.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev_0_all_attrs.txt")

In [None]:
# Write the reduced-attribute variant (process_file_v1) of the training split:
# combined file first, then the _1 and _0 files. The `test` flag keeps its
# default (False), so the reference sentence and <END> are written too.
process_file_v1(data_dir+"/processed_files_with_bert_with_best_head/sentiment_train.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train.txt")
process_file_v1(data_dir+"/processed_files_with_bert_with_best_head/sentiment_train_1.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_1.txt")
process_file_v1(data_dir+"/processed_files_with_bert_with_best_head/sentiment_train_0.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_0.txt")

In [158]:
# Test changes here first
# Runs process_file_v1 on the small test split (combined, _1 and _0 files).
# NOTE(review): these are the same three calls as in the earlier
# "Test changes here first" cell; one of the two cells is redundant.
process_file_v1(data_dir+"/processed_files_with_bert_with_best_head/sentiment_test.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_test.txt")
process_file_v1(data_dir+"/processed_files_with_bert_with_best_head/sentiment_test_1.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_test_1.txt")
process_file_v1(data_dir+"/processed_files_with_bert_with_best_head/sentiment_test_0.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_test_0.txt")

100%|██████████| 1000/1000 [00:00<00:00, 56367.48it/s]
100%|██████████| 500/500 [00:00<00:00, 71094.72it/s]
100%|██████████| 500/500 [00:00<00:00, 66485.50it/s]


In [None]:
# Write the reduced-attribute variant (process_file_v1) of the dev split with
# test=True, i.e. each output line stops after "<START> ".
# NOTE(review): the _0 input is written to .../sentiment_dev_1.txt and the _1
# input to .../sentiment_dev_0.txt, whereas the train and test cells keep the
# labels aligned - confirm that this swap is intentional.
process_file_v1(data_dir+"/processed_files_with_bert_with_best_head/sentiment_dev.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev.txt", test=True)
process_file_v1(data_dir+"/processed_files_with_bert_with_best_head/sentiment_dev_0.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev_1.txt", test=True)
process_file_v1(data_dir+"/processed_files_with_bert_with_best_head/sentiment_dev_1.txt",data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev_0.txt", test=True)

In [6]:
def read_file(file_path):
    """Read a text file and return its lines as a list, without line endings."""
    with open(file_path) as handle:
        return handle.read().splitlines()

In [14]:
# Process real ref with antonyms attributes - Do NOT execute
# The cell glues each processed reference content line together with the
# matching original sentence and writes the result to reference_content_org_*.txt.

# Original reference files.
ref_original_path_0 = data_dir+"/reference_0_org.txt"
ref_original_path_1 = data_dir+"/reference_1_org.txt"

ref_0_data = read_file(ref_original_path_0)
ref_1_data = read_file(ref_original_path_1)

# Keep only the text before the first tab of every reference line.
original_data_0 = [x.split("	")[0] for x in ref_0_data]
original_data_1 = [x.split("	")[0] for x in ref_1_data]

# Processed reference files (presumably the content-only lines ending in
# "<START>", judging by the sample printed below - verify).
ref_content_path_0 = data_dir+"/processed_files_with_bert_with_best_head/reference_0.txt"
ref_content_path_1 = data_dir+"/processed_files_with_bert_with_best_head/reference_1.txt"

original_content_data_0 = read_file(ref_content_path_0)
original_content_data_1 = read_file(ref_content_path_1)


# Output files: content line directly followed by the original sentence.
output_0_content_org = data_dir+"/processed_files_with_bert_with_best_head/reference_content_org_0.txt"
output_1_content_org = data_dir+"/processed_files_with_bert_with_best_head/reference_content_org_1.txt"

# Show one combined line as a sanity check.
print(original_content_data_0[2]+original_data_0[2])

# NOTE(review): lines are paired by index; if the original file has fewer lines
# than the processed one, original_data_*[index] raises IndexError.
with open(output_0_content_org, 'w') as outfile:
    for index, content in enumerate(original_content_data_0):
        outfile.write(content+original_data_0[index]+"\n")

with open(output_1_content_org, 'w') as outfile:
    for index, content in enumerate(original_content_data_1):
            outfile.write(content+original_data_1[index]+"\n")

#content_with_att_0 = process_file_v1()

# Destination paths for the reference files in the delete_retrieve_edit_model folder.
ref_out_path_0 = data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/reference_0.txt"
ref_out_path_1 = data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/reference_1.txt"



<POS> <CON_START> so basically down . <START>so basically tasted watered down .


In [15]:
# Convert the combined reference files with process_file_v1. The third
# positional argument is the `test` flag: with True, each output line stops
# after "<START> " and the reference sentence is not written.
process_file_v1(output_0_content_org, ref_out_path_0, True)
process_file_v1(output_1_content_org, ref_out_path_1, True)

100%|██████████| 500/500 [00:00<00:00, 29555.25it/s]


attr b4 ['said', 'disappeared', 'minutes']
attr after said disappeared
attr b4 ['terrible', 'very', 'good']
attr after terrible very
attr b4 ['owner', 'heard', 'do']
attr after owner heard
attr b4 ['sit', 'slow', 'lazy']
attr after sit slow
attr b4 ['no', 'sorry', 'everything']
attr after no sorry
attr b4 ['said', 'sit', 'table']
attr after said sit
attr b4 ['empty', 'no', 'store']
attr after empty no
attr b4 ['staffed', 'primarily', 'teenagers']
attr after staffed primarily
attr b4 ['blue', 'cheese', 'best']
attr after blue cheese
attr b4 ['pad', 'tasted', 'noodles']
attr after pad tasted
attr b4 ['complained', 'polite', 'walked']
attr after complained polite
attr b4 ['anyway', 'got', 'coffee']
attr after anyway got
attr b4 ['just', 'delivery', 'wasted']
attr after just delivery
attr b4 ["n't", 'let', 'me']
attr after n't let
attr b4 ['me', 'lied', 'bs']
attr after me lied
attr b4 ['tried', 'advantage', 'am']
attr after tried advantage
attr b4 ["'ve", 'sent', 'guests', 'absolutely']
a

100%|██████████| 500/500 [00:00<00:00, 46999.22it/s]

attr b4 ['small', 'make', 'right']
attr after small make
attr b4 ['actually', 'can', 'wait']
attr after actually can
attr b4 ['friendly', 'delicious', 'authentic']
attr after friendly delicious
attr b4 ["'ll", 'why', 'get']
attr after 'll why
attr b4 ['variety', 'makes', 'good']
attr after variety makes
attr b4 ['professional', 'found', 'right']
attr after professional found
attr b4 ['important', 'thing', 'food']
attr after important thing
attr b4 ['good', 'impressed', 'quality']
attr after good impressed
attr b4 ['great', 'nice', 'steal']
attr after great nice
attr b4 ['still', 'comes', 'right']
attr after still comes
attr b4 ['love', 'location', 'right']
attr after love location
attr b4 ['happy', 'definitely', 'services']
attr after happy definitely
attr b4 ['seems', 'pretty', 'high']
attr after seems pretty
attr b4 ['great', 'grab', 'meal']
attr after great grab
attr b4 ['reasonably', 'great', 'organic']
attr after reasonably great
attr b4 ['first', 'knew', 'new']
attr after first k




In [225]:
def read_file(file_path):
    """Read a text file and return its lines as a list, without line endings.

    Note: an identical read_file is already defined in an earlier cell of
    this notebook; this definition simply rebinds the same name.
    """
    with open(file_path) as handle:
        return handle.read().splitlines()