## Prepare Reference files using TF-IDF for retrieving attributes


In [1]:
import pandas as pd
from tqdm import tqdm, trange
import numpy as np
import time
import torch
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def read_file(path, encoding="utf-8"):
    """Read a text file and return its lines without line terminators.

    Args:
        path: Location of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the locale of the machine.

    Returns:
        A list with one string per line; an empty file yields an empty list.
    """
    with open(path, encoding=encoding) as fp:
        return fp.read().splitlines()

In [3]:
def clean_text(text):
    """Remove the markup tags from *text* and trim surrounding whitespace.

    The tags ``<POS>``, ``<NEG>``, ``<CON_START>``, ``<START>`` and ``<END>``
    are deleted one after another, in that order. Whitespace that surrounded a
    tag in the middle of the text is left untouched.
    """
    cleaned = text
    for tag in ("<POS>", "<NEG>", "<CON_START>", "<START>", "<END>"):
        cleaned = cleaned.replace(tag, "")
    return cleaned.strip()


In [6]:
data_dir = "/home/jack/Desktop/NN/clean/datasets/yelp"

# Training data: file 0 holds negative sentiment, file 1 positive sentiment
train0_org = read_file(f"{data_dir}/sentiment.train.0")
train1_org = read_file(f"{data_dir}/sentiment.train.1")

# Reference data prepared for the delete_generate model
ref0_processed = read_file(f"{data_dir}/processed_files_with_bert_with_best_head/reference_0.txt")
ref1_processed = read_file(f"{data_dir}/processed_files_with_bert_with_best_head/reference_1.txt")

# Original reference data
ref0_org = read_file(f"{data_dir}/reference.0")
ref1_org = read_file(f"{data_dir}/reference.1")

# Training data with content and attributes separated
train0_processed = read_file(f"{data_dir}/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_0_all_attrs.txt")
train1_processed = read_file(f"{data_dir}/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_1_all_attrs.txt")

In [7]:
# Keep only the text before the first tab of every reference line,
# i.e. the original reference sentence
ref0_org = [line.partition("\t")[0] for line in ref0_org]
ref1_org = [line.partition("\t")[0] for line in ref1_org]

In [8]:
# Strip the markup tags from the processed references to obtain their content
ref0_con = list(map(clean_text, ref0_processed))
ref1_con = list(map(clean_text, ref1_processed))

In [9]:
# Compare the first four original reference sentences with their tag-stripped content
ref0_org[:4], ref0_con[:4]

(["ever since joes has changed hands it 's just gotten worse and worse .",
  'there is definitely not enough room in that part of the venue .',
  'so basically tasted watered down .',
  "she said she 'd be back and disappeared for a few minutes ."],
 ["ever since joes has changed hands it ' just gotten and .",
  'there is not enough room in that part of the .',
  'so basically down .',
  "she she ' d be back and for a few ."])

In [10]:
def get_train_content(text):
    """Return the content part of a processed training line.

    The content is the text between the first ``<CON_START>`` tag and the
    first ``<START>`` tag, with surrounding whitespace removed. An
    ``IndexError`` is raised when no ``<CON_START>`` precedes ``<START>``.
    """
    before_start = text.split("<START>")[0]
    segments = before_start.split("<CON_START>")
    return segments[1].strip()

In [11]:
def get_train_attrs(text):
    """Return the attribute words of a processed training line as a list.

    Everything before the first ``<CON_START>`` tag is taken, the
    ``<ATTR_WORDS>`` marker is removed, and the rest is split on whitespace.
    """
    attr_part = text.split("<CON_START>")[0]
    without_marker = attr_part.replace("<ATTR_WORDS>", "")
    # split() without arguments already ignores leading/trailing whitespace
    return without_marker.split()

In [12]:
# Sanity check: attribute words parsed from the first negative training line
get_train_attrs(train0_processed[0])

['sadly']

In [13]:
# Inspect the first four processed training lines of each sentiment (negative, positive)
train0_processed[:4], train1_processed[:4]

(['<ATTR_WORDS> sadly <CON_START> i was mistaken . <START> i was sadly mistaken . <END>',
  '<ATTR_WORDS> so run mill <CON_START> on to the hoagies , the italian is general of the . <START> so on to the hoagies , the italian is general run of the mill . <END>',
  '<ATTR_WORDS> minimal ton <CON_START> meat and a of shredded lettuce . <START> minimal meat and a ton of shredded lettuce . <END>',
  '<ATTR_WORDS> nothing really worthy _num_ <CON_START> special & not of the $ _ num _ price tag . <START> nothing really special & not worthy of the $ _num_ price tag . <END>'],
 ['<ATTR_WORDS> excellent <CON_START> food . <START> excellent food . <END>',
  '<ATTR_WORDS> superb <CON_START> customer service . <START> superb customer service . <END>',
  '<ATTR_WORDS> daily really good <CON_START> they also have specials and ice cream which is . <START> they also have daily specials and ice cream which is really good . <END>',
  "<ATTR_WORDS> 's good <CON_START> it ' a toasted hoagie . <START> it 's

In [14]:
# Extract the content part of every processed training line
train0_con = list(map(get_train_content, train0_processed))
train1_con = list(map(get_train_content, train1_processed))

In [15]:
# Inspect the first four extracted contents of each sentiment (negative, positive)
train0_con[:4], train1_con[:4]

(['i was mistaken .',
  'on to the hoagies , the italian is general of the .',
  'meat and a of shredded lettuce .',
  'special & not of the $ _ num _ price tag .'],
 ['food .',
  'customer service .',
  'they also have specials and ice cream which is .',
  "it ' a toasted hoagie ."])

In [16]:
# Fetch the attribute words of every training line (negative, then positive)
attrs_neg = list(map(get_train_attrs, train0_processed))
attrs_pos = list(map(get_train_attrs, train1_processed))

In [17]:
# Get TF-IDF vectors for the training and reference contents.
# One vocabulary is fitted on all training contents so that training and
# reference vectors live in the same feature space.
tfidf = TfidfVectorizer()
conts_vecs = tfidf.fit_transform(train0_con + train1_con)
# Rows follow the concatenation order above: the first len(train0_con) rows are
# the negative contents and the remaining rows are the positive contents. The
# slices must respect that order so that row i of conts_neg_vecs / conts_pos_vecs
# lines up with attrs_neg[i] / attrs_pos[i].
conts_neg_vecs = conts_vecs[:len(train0_con)]
conts_pos_vecs = conts_vecs[len(train0_con):]
# Reference contents are projected with the already fitted vocabulary
conts_from_pos_ref_vecs = tfidf.transform(ref1_con)
conts_from_neg_ref_vecs = tfidf.transform(ref0_con)

#### AnnoyIndex is used to store the TF-IDF vectors of the training set and to retrieve the nearest neighbours of the reference content

In [18]:
from annoy import AnnoyIndex

In [19]:
# One Annoy index per sentiment, sized to the TF-IDF vocabulary.
# The metric is passed explicitly because relying on Annoy's implicit default
# is deprecated; "angular" is that former default, so results are unchanged.
train0_tree = AnnoyIndex(conts_neg_vecs.shape[-1], metric="angular")
train1_tree = AnnoyIndex(conts_pos_vecs.shape[-1], metric="angular")

  train0_tree = AnnoyIndex(conts_neg_vecs.shape[-1])
  train1_tree = AnnoyIndex(conts_pos_vecs.shape[-1])


In [20]:
# Randomly select at most 50000 training rows per sentiment to control memory
# usage. The sample size is capped at the number of available rows because
# sampling without replacement raises ValueError when size exceeds the population.
# NOTE: the sample is not seeded, so the Annoy item ids built from these arrays
# are only meaningful together with the arrays drawn in the same session.
neg_idxs = np.random.choice(
    conts_neg_vecs.shape[0],
    size=min(50000, conts_neg_vecs.shape[0]),
    replace=False,
)
pos_idxs = np.random.choice(
    conts_pos_vecs.shape[0],
    size=min(50000, conts_pos_vecs.shape[0]),
    replace=False,
)

In [22]:
# Index only the sampled negative rows (not the full training set). The Annoy
# item id is the position inside neg_idxs, not the original training row number.
for item_id, row_no in enumerate(tqdm(neg_idxs)):
    train0_tree.add_item(item_id, conts_neg_vecs[row_no].toarray()[0])

100%|██████████| 50000/50000 [00:40<00:00, 1227.32it/s]


In [23]:
model_dir = "/home/jack/Desktop/NN/clean/models/"

# Build the negative-sentiment index with 50 trees, then persist it to disk
train0_tree.build(50)
train0_tree.save(f"{model_dir}tfidf_train0.ann")

True

In [25]:
# Peek at the first three positive reference contents and at the attribute words
# of the first sampled negative training line
ref1_con[0:3], " ".join(attrs_neg[neg_idxs[0]])

(["it ' s yet they you feel at home .",
  'i will be going back and this place !',
  'the drinks were and a pour .'],
 'can ability decided')

In [27]:
# For every positive reference content, look up its single nearest neighbour in
# the negative training index and borrow that neighbour's attribute words.
# Each example is echoed and written as one line.
with open(f"{data_dir}/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/tfidf/reference_1.txt", "w") as out_fp:
    for row_no, content in enumerate(ref1_con):
        query = conts_from_pos_ref_vecs[row_no].toarray()[0]
        neighbours, _distances = train0_tree.get_nns_by_vector(query, 1, include_distances=True)
        # Annoy item ids are positions in neg_idxs; map back to the training row
        attr_words = " ".join(attrs_neg[neg_idxs[neighbours[0]]])
        line = f"<ATTR_WORDS> {attr_words} <CON_START> {content.strip()} <START>\n"
        print(line)
        out_fp.write(line)

<ATTR_WORDS> dining uncomfortable <CON_START> it ' s yet they you feel at home . <START>

<ATTR_WORDS> urge tell person <CON_START> i will be going back and this place ! <START>

<ATTR_WORDS> wrong <CON_START> the drinks were and a pour . <START>

<ATTR_WORDS> dog 's tests negative <CON_START> my husband got a ruben , he it . <START>

<ATTR_WORDS> staff bar falling <CON_START> i up for their email and a coupon . <START>

<ATTR_WORDS> cold killing <CON_START> i ' d giving them a try . <START>

<ATTR_WORDS> home depot <CON_START> i e & m painting . <START>

<ATTR_WORDS> no pregnant 's screw <CON_START> a experience and we will go again . <START>

<ATTR_WORDS> high low <CON_START> drinks , and company . <START>

<ATTR_WORDS> impressed dish <CON_START> i got my band back on now ! <START>

<ATTR_WORDS> reminds me 's farm <CON_START> i was nervous and she made me feel so and . <START>

<ATTR_WORDS> no nothing <CON_START> food recommendations steak and tuna were both . <START>

<ATTR_WORDS> d

In [28]:
# Index only the sampled positive rows (not the full training set). The Annoy
# item id is the position inside pos_idxs, not the original training row number.
for item_id, row_no in enumerate(tqdm(pos_idxs)):
    train1_tree.add_item(item_id, conts_pos_vecs[row_no].toarray()[0])

100%|██████████| 50000/50000 [00:41<00:00, 1210.92it/s]


In [29]:
model_dir = "/home/jack/Desktop/NN/clean/models/"

# Build the positive-sentiment index with 50 trees, then persist it to disk
train1_tree.build(50)
train1_tree.save(f"{model_dir}tfidf_train1.ann")

True

In [31]:
# For every negative reference content, look up its single nearest neighbour in
# the positive training index and borrow that neighbour's attribute words.
# Each example is echoed (with its row number) and written as one line.
with open(f"{data_dir}/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/tfidf/reference_0.txt", "w") as out_fp:
    for row_no, content in enumerate(ref0_con):
        query = conts_from_neg_ref_vecs[row_no].toarray()[0]
        neighbours, _distances = train1_tree.get_nns_by_vector(query, 1, include_distances=True)
        # Annoy item ids are positions in pos_idxs; map back to the training row
        attr_words = " ".join(attrs_pos[pos_idxs[neighbours[0]]])
        line = f"<ATTR_WORDS> {attr_words} <CON_START> {content.strip()} <START>\n"
        print(row_no, line)
        out_fp.write(line)

0 <ATTR_WORDS> prompt friendly <CON_START> ever since joes has changed hands it ' just gotten and . <START>

1 <ATTR_WORDS> does does well <CON_START> there is not enough room in that part of the . <START>

2 <ATTR_WORDS> work patrick <CON_START> so basically down . <START>

3 <ATTR_WORDS> honestly talent talent <CON_START> she she ' d be back and for a few . <START>

4 <ATTR_WORDS> rest great <CON_START> i n ' t believe how inconsiderate this is . <START>

5 <ATTR_WORDS> cool slide <CON_START> just and it off the bill . <START>

6 <ATTR_WORDS> 's lot <CON_START> it is n ' t , but it is n ' t either . <START>

7 <ATTR_WORDS> love <CON_START> that i could not use my birthday gift ! <START>

8 <ATTR_WORDS> compassion amazing <CON_START> new , i - but i n ' t know the details . <START>

9 <ATTR_WORDS> great town <CON_START> but it probably ! <START>

10 <ATTR_WORDS> definitely continue <CON_START> we down and we got some really and service . <START>

11 <ATTR_WORDS> lovely <CON_START> the