## DATA PREP FOR RETRIEVE AND EDIT USING OPENAI-GPT

In [2]:
import argparse
import os
import csv
import random
import logging
from tqdm import tqdm, trange
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, OpenAIAdam
from sklearn.feature_extraction.text import TfidfVectorizer
import random
import edit_distance
from random import shuffle
import re
import time
import pickle
from collections import defaultdict

In [21]:
# Root directory of the dataset being processed; swap the active line to
# switch datasets. Every path below is derived from it.
DATA = '/home/ubuntu/data/amazon'
# DATA = '/home/ubuntu/data/yelp'

# Vocabulary files (produced by the make_vcb.py / make_attr_vcb.py commands
# that this notebook runs in a later cell).
VOCAB_PATH = os.path.join(DATA, 'vocab')
ATTR_VOCAB_PATH = os.path.join(DATA, 'attribute_vocab')

# Training split: raw inputs (suffix .1 = positive, .0 = negative), the
# processed per-class and combined outputs, and the attribute-replacement logs.
POS_TRAIN_FILE_PATH = os.path.join(DATA, 'sentiment.train.1')
NEG_TRAIN_FILE_PATH = os.path.join(DATA,'sentiment.train.0')
POS_TRAIN_OUT_FILE_PATH = POS_TRAIN_FILE_PATH.replace('sentiment', 'processed.sentiment')
NEG_TRAIN_OUT_FILE_PATH = NEG_TRAIN_FILE_PATH.replace('sentiment', 'processed.sentiment')
COMBINED_TRAIN_OUT_FILE_PATH = os.path.join(DATA, 'processed.sentiment.train.0.and.1')
POS_REPLACE_ATTRS_PATH = os.path.join(DATA, 'attrs.replace.train.1')
NEG_REPLACE_ATTRS_PATH = os.path.join(DATA, 'attrs.replace.train.0')

# Validation (dev) split: raw inputs and processed outputs.
POS_VAL_FILE_PATH = os.path.join(DATA, 'sentiment.dev.1')
NEG_VAL_FILE_PATH = os.path.join(DATA,'sentiment.dev.0')
POS_VAL_OUT_FILE_PATH = POS_VAL_FILE_PATH.replace('sentiment', 'processed.sentiment')
NEG_VAL_OUT_FILE_PATH = NEG_VAL_FILE_PATH.replace('sentiment', 'processed.sentiment')
COMBINED_VAL_OUT_FILE_PATH = os.path.join(DATA, 'processed.sentiment.dev.0.and.1')

# Test split: raw inputs, processed outputs, and "inputs only" variants
# (the processed lines with the target text removed by remove_targets).
POS_TEST_FILE_PATH = os.path.join(DATA, 'sentiment.test.1')
NEG_TEST_FILE_PATH = os.path.join(DATA,'sentiment.test.0')
POS_TEST_OUT_FILE_PATH = POS_TEST_FILE_PATH.replace('sentiment', 'processed.sentiment')
NEG_TEST_OUT_FILE_PATH = NEG_TEST_FILE_PATH.replace('sentiment', 'processed.sentiment')
POS_TEST_OUT_FILE_INPUTS_PATH = os.path.join(DATA, 'processed.sentiment.test.inputs.1')
NEG_TEST_OUT_FILE_INPUTS_PATH = os.path.join(DATA, 'processed.sentiment.test.inputs.0')
COMBINED_TEST_OUT_FILE_PATH = os.path.join(DATA, 'processed.sentiment.test.0.and.1')
COMBINED_TEST_OUT_FILE_INPUTS_PATH = os.path.join(DATA, 'processed.sentiment.test.inputs.0.and.1')

# Reference files: the combined tab-separated files, the "from"/"to" halves
# written by split_reference_file, and the processed outputs built from them.
POS_REF_FILE_PATH = os.path.join(DATA, 'reference.1')
NEG_REF_FILE_PATH = os.path.join(DATA,'reference.0')
FROM_POS_REF_FILE_PATH = os.path.join(DATA, 'reference.from.1')
TO_NEG_REF_FILE_PATH = os.path.join(DATA,'reference.to.0')
FROM_NEG_REF_FILE_PATH = os.path.join(DATA, 'reference.from.0')
TO_POS_REF_FILE_PATH = os.path.join(DATA,'reference.to.1')
FROM_POS_REF_OUT_FILE_PATH = os.path.join(DATA, 'processed.reference.from.1')
FROM_NEG_REF_OUT_FILE_PATH = os.path.join(DATA, 'processed.reference.from.0')
FROM_POS_REF_OUT_FILE_INPUTS_PATH = os.path.join(DATA, 'processed.reference.inputs.from.1')
FROM_NEG_REF_OUT_FILE_INPUTS_PATH = os.path.join(DATA, 'processed.reference.inputs.from.0')

# Cached attribute / content strings extracted from the training sentences.
POS_TRAIN_ATTRS_PATH = os.path.join(DATA, 'attrs.train.1')
NEG_TRAIN_ATTRS_PATH = os.path.join(DATA, 'attrs.train.0')
POS_TRAIN_CONTS_PATH = os.path.join(DATA, 'conts.train.1')
NEG_TRAIN_CONTS_PATH = os.path.join(DATA, 'conts.train.0')

# Cached attribute / content strings extracted from the reference source sentences.
FROM_POS_REF_ATTRS_PATH = os.path.join(DATA, 'attrs.ref.from.1')
FROM_NEG_REF_ATTRS_PATH = os.path.join(DATA, 'attrs.ref.from.0')
FROM_POS_REF_CONTS_PATH = os.path.join(DATA, 'conts.ref.from.1')
FROM_NEG_REF_CONTS_PATH = os.path.join(DATA, 'conts.ref.from.0')

In [4]:
# Build the word vocabulary and the attribute vocabulary with the external
# helper scripts. The `!` lines are IPython shell magics, so this cell only
# runs inside a notebook / IPython session.
POS_NEG_TRAIN_FILE_PATH = os.path.join(DATA, 'sentiment.train.0.and.1')
TMP = os.path.join(DATA, 'tmp')
# SALIENCE and LAMBDA are forwarded verbatim to make_attr_vcb.py.
# NOTE(review): their exact meaning is defined by that script — confirm there.
# SALIENCE = 5.5
SALIENCE = 15
LAMBDA = 1
# Concatenate the positive and negative training files into one corpus.
!cat $POS_TRAIN_FILE_PATH $NEG_TRAIN_FILE_PATH > $POS_NEG_TRAIN_FILE_PATH
# Write the vocabulary (second argument: 40000) to VOCAB_PATH.
!python ./make_vcb.py $POS_NEG_TRAIN_FILE_PATH 40000 > $VOCAB_PATH
# Drop the first four lines of the vocabulary file, going through a temp file.
!sed '1,4d' $VOCAB_PATH > $TMP
!mv $TMP $VOCAB_PATH
# Derive the attribute vocabulary from the vocabulary and both training files.
!python ./make_attr_vcb.py $VOCAB_PATH $POS_TRAIN_FILE_PATH $NEG_TRAIN_FILE_PATH $SALIENCE $LAMBDA > $ATTR_VOCAB_PATH
                                                                                                                               

In [5]:
def split_reference_file(combined, left, right):
    """Split a two-column, tab-separated file into two single-column files.

    Every line of ``combined`` is stripped and split on tabs; field 0 is
    written to ``left`` and field 1 to ``right``. Output lines are joined with
    newlines, so neither output file ends with a trailing newline.

    Args:
        combined: path of the tab-separated input file.
        left: output path for the first column.
        right: output path for the second column.

    Raises:
        IndexError: if a stripped line has no second tab-separated field.
    """
    with open(combined, 'r') as src:
        rows = [raw.strip().split('\t') for raw in src.readlines()]
    # Both columns are extracted before any output is opened, so a malformed
    # line fails without creating or truncating the output files.
    first_col = [row[0] for row in rows]
    second_col = [row[1] for row in rows]
    with open(left, 'w') as dst:
        dst.write('\n'.join(first_col))
    with open(right, 'w') as dst:
        dst.write('\n'.join(second_col))
# Split each combined reference file into its "from" (source) and "to"
# (target) halves.
for _combined, _from_path, _to_path in (
    (POS_REF_FILE_PATH, FROM_POS_REF_FILE_PATH, TO_NEG_REF_FILE_PATH),
    (NEG_REF_FILE_PATH, FROM_NEG_REF_FILE_PATH, TO_POS_REF_FILE_PATH),
):
    split_reference_file(_combined, _from_path, _to_path)

In [6]:
# Special marker tokens. The attr_start / cont_start / data_start / end
# markers delimit the sections of every processed line written by the
# prepare_* functions in this notebook.
spl_tokens = dict(
    pos='<POS>',
    neg='<NEG>',
    cont_start='<CONT_START>',
    attr_start='<ATTR_START>',
    data_start='<DATA_START>',
    end='<END>',
)
spl_tokens

{'pos': '<POS>',
 'neg': '<NEG>',
 'cont_start': '<CONT_START>',
 'attr_start': '<ATTR_START>',
 'data_start': '<DATA_START>',
 'end': '<END>'}

In [7]:
# Load the attribute vocabulary: one stripped entry per line of ATTR_VOCAB_PATH.
# It is stored as a set because this notebook only consults it through `in`
# membership tests (once per token in extract_attribute); a set makes each
# test O(1) instead of a linear scan over the whole vocabulary.
with open(ATTR_VOCAB_PATH, 'r', encoding='utf-8') as f:
    attribute_vocab = {line.strip() for line in f}

In [8]:
def extract_attribute(attribute_vocab, line):
    """Partition tokens into content tokens and attribute tokens.

    Args:
        attribute_vocab: container supporting ``in``; tokens found in it are
            treated as attribute tokens.
        line: iterable of tokens.

    Returns:
        A ``(content, attribute)`` pair of lists, each preserving the original
        token order.
    """
    content, attribute = [], []
    for tok in line:
        bucket = attribute if tok in attribute_vocab else content
        bucket.append(tok)
    return content, attribute

In [9]:
def extract_data(reference_file_path):
    """Split every sentence of a file into its content and attribute parts.

    Each line is stripped, tokenised on single spaces and partitioned with
    ``extract_attribute`` using the module-level ``attribute_vocab``. A running
    line count is printed every 10000 lines.

    Args:
        reference_file_path: path of a UTF-8 text file, one sentence per line.

    Returns:
        ``(conts, attrs)``: two parallel lists of space-joined strings, one
        entry per input line.
    """
    conts, attrs = [], []
    with open(reference_file_path, 'r', encoding='utf-8') as f:
        for line_no, raw in enumerate(f, start=1):
            words = raw.strip().split(' ')
            cont, attr = extract_attribute(attribute_vocab, words)
            conts.append(' '.join(cont))
            attrs.append(' '.join(attr))
            if line_no % 10000 == 0:
                print(line_no)
    return conts, attrs

In [10]:
# Extract content / attribute strings for both training files and for the
# source side ("from") of both reference files.
conts_pos, attrs_pos = extract_data(POS_TRAIN_FILE_PATH)
conts_neg, attrs_neg = extract_data(NEG_TRAIN_FILE_PATH)
conts_from_pos_ref, attrs_from_pos_ref = extract_data(FROM_POS_REF_FILE_PATH)
conts_from_neg_ref, attrs_from_neg_ref = extract_data(FROM_NEG_REF_FILE_PATH)

def write_list_to_file(l, file):
    """Write each string of ``l`` to ``file``, one per line.

    Args:
        l: iterable of strings.
        file: output path; an existing file is overwritten.

    Returns:
        None.
    """
    with open(file, 'w') as out:
        out.writelines(entry + '\n' for entry in l)
    return

# Cache the extracted content / attribute strings on disk: training data
# first, then the reference source sentences.
for _entries, _path in (
    (conts_pos, POS_TRAIN_CONTS_PATH),
    (conts_neg, NEG_TRAIN_CONTS_PATH),
    (attrs_pos, POS_TRAIN_ATTRS_PATH),
    (attrs_neg, NEG_TRAIN_ATTRS_PATH),
    (conts_from_pos_ref, FROM_POS_REF_CONTS_PATH),
    (conts_from_neg_ref, FROM_NEG_REF_CONTS_PATH),
    (attrs_from_pos_ref, FROM_POS_REF_ATTRS_PATH),
    (attrs_from_neg_ref, FROM_NEG_REF_ATTRS_PATH),
):
    write_list_to_file(_entries, _path)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000


In [11]:
def read_list_from_file(file):
    """Read a text file into a list of stripped lines.

    Args:
        file: path of the file to read.

    Returns:
        A list with one entry per line, with leading and trailing whitespace
        removed (blank lines become empty strings).
    """
    # The context manager closes the handle; the original never closed it.
    with open(file, 'r') as f:
        return [line.strip() for line in f]

# Reload the cached attribute / content strings together with the raw
# sentences, so the cells below can run without repeating the extraction.
attrs_pos = read_list_from_file(POS_TRAIN_ATTRS_PATH)
attrs_neg = read_list_from_file(NEG_TRAIN_ATTRS_PATH)
conts_pos = read_list_from_file(POS_TRAIN_CONTS_PATH)
conts_neg = read_list_from_file(NEG_TRAIN_CONTS_PATH)
data_pos = read_list_from_file(POS_TRAIN_FILE_PATH)
data_neg = read_list_from_file(NEG_TRAIN_FILE_PATH)

# Same for the source side of the reference files.
conts_from_pos_ref = read_list_from_file(FROM_POS_REF_CONTS_PATH)
conts_from_neg_ref = read_list_from_file(FROM_NEG_REF_CONTS_PATH)
attrs_from_pos_ref = read_list_from_file(FROM_POS_REF_ATTRS_PATH)
attrs_from_neg_ref = read_list_from_file(FROM_NEG_REF_ATTRS_PATH)
data_from_pos_ref = read_list_from_file(FROM_POS_REF_FILE_PATH)
data_from_neg_ref = read_list_from_file(FROM_NEG_REF_FILE_PATH)

In [12]:
# Fit one TF-IDF model on all training contents (positive followed by
# negative), then slice the resulting matrix back into its two halves.
tfidf = TfidfVectorizer()
conts_vecs = tfidf.fit_transform(conts_pos+conts_neg)
conts_pos_vecs = conts_vecs[:len(conts_pos)]
conts_neg_vecs = conts_vecs[len(conts_pos):len(conts_pos)+len(conts_neg)]
# Reference contents are projected with the already-fitted model (no refit).
conts_from_pos_ref_vecs = tfidf.transform(conts_from_pos_ref)
conts_from_neg_ref_vecs = tfidf.transform(conts_from_neg_ref)

In [13]:
def calc_closest_content(l, conts_ref_vecs, conts_vecs, device='cpu'):
    """Find, for each reference row, the best-scoring row among sampled candidates.

    Up to ``l`` rows of ``conts_vecs`` are chosen (all rows, in order, when
    ``l`` is at least the number of rows; otherwise a random sample without
    replacement). Each reference row is scored against every chosen row by dot
    product and the index of the highest-scoring candidate is returned.

    Args:
        l: number of candidate rows to sample from ``conts_vecs``.
        conts_ref_vecs: sparse matrix of reference vectors, one row per
            reference (must provide ``toarray()``).
        conts_vecs: sparse matrix of candidate vectors, one row per candidate.
        device: ``'gpu'`` runs the matrix product on CUDA; any other value
            keeps it on the CPU.

    Returns:
        A list with one entry per reference row: the row index into
        ``conts_vecs`` of its best-scoring sampled candidate.
    """
    n_candidates = conts_vecs.shape[0]
    if l >= n_candidates:
        # Asking for more rows than exist would make a without-replacement
        # sample raise; use every row instead.
        idxs = np.arange(n_candidates)
    else:
        idxs = np.random.choice(n_candidates, size=l, replace=False)

    conts_rand_tensors = torch.tensor(conts_vecs[idxs, :].toarray())
    conts_ref_tensors = torch.tensor(conts_ref_vecs.toarray())
    if device == 'gpu':
        conts_rand_tensors = conts_rand_tensors.cuda()
        conts_ref_tensors = conts_ref_tensors.cuda()

    # scores[i, j] = dot(candidate i, reference j). The matrix is deliberately
    # kept 2-D: squeezing it would collapse the single-candidate or
    # single-reference case and break the per-reference argmax below.
    scores = torch.mm(conts_rand_tensors, conts_ref_tensors.transpose(0, 1))
    scores = scores.cpu().numpy()
    ind_maxs = np.argmax(scores, axis=0)
    # Map positions within the sample back to row indices of conts_vecs.
    return [idxs[ind] for ind in ind_maxs]

# For every reference source sentence, retrieve the closest content among
# (up to) 50000 sampled training sentences of the OPPOSITE sentiment, and time
# the two lookups.
s = time.time()
attrs_tgt_from_pos_ref = calc_closest_content(50000, conts_from_pos_ref_vecs, conts_neg_vecs)
attrs_tgt_from_neg_ref = calc_closest_content(50000, conts_from_neg_ref_vecs, conts_pos_vecs)
print(time.time()-s)
# Attribute strings of the retrieved sentences; used as the target attributes
# when the processed reference files are built.
neg_tgt_attrs_ref_from_pos = [attrs_neg[i] for i in attrs_tgt_from_pos_ref]
pos_tgt_attrs_ref_from_neg = [attrs_pos[i] for i in attrs_tgt_from_neg_ref]

17.932584762573242


In [58]:
def visualize_replacements():
    """Print each reference sentence next to the training sentence retrieved for it.

    For both directions, prints the reference content and attributes followed
    by the content and attributes of the retrieved opposite-sentiment training
    sentence. Reads the module-level retrieval results and string lists.
    """
    def _show(title, ref_conts, ref_attrs, matches, pool_conts, pool_attrs):
        print(title)
        for pos, match in enumerate(matches):
            print(ref_conts[pos])
            print(ref_attrs[pos])
            print(pool_conts[match])
            print(pool_attrs[match])
            print('==========')

    _show("FROM POSITIVE:", conts_from_pos_ref, attrs_from_pos_ref,
          attrs_tgt_from_pos_ref, conts_neg, attrs_neg)
    _show("FROM NEGATIVE:", conts_from_neg_ref, attrs_from_neg_ref,
          attrs_tgt_from_neg_ref, conts_pos, attrs_pos)
# visualize_replacements()

In [14]:
def prepare_ref_data_file(conts_ref, tgt_attrs_ref, data, output_file_path):
    """Write the processed reference file, one line per reference sentence.

    Each output line has the form
    ``<ATTR_START> attrs <CONT_START> content <DATA_START> sentence <END>``,
    with every double space collapsed to a single space. A running count is
    printed every 100 lines.

    Args:
        conts_ref: content strings of the reference sentences.
        tgt_attrs_ref: target attribute strings, parallel to ``conts_ref``.
        data: original reference sentences, parallel to ``conts_ref``.
        output_file_path: UTF-8 output path; an existing file is overwritten.
    """
    records = []
    for idx, cont in enumerate(conts_ref):
        fields = [
            spl_tokens['attr_start'], tgt_attrs_ref[idx],
            spl_tokens['cont_start'], cont,
            spl_tokens['data_start'], data[idx],
            spl_tokens['end'],
        ]
        # An empty attribute or content string leaves two adjacent spaces.
        records.append((' '.join(fields) + "\n").replace('  ', ' '))
        done = idx + 1
        if done % 100 == 0:
            print(done)
    with open(output_file_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(records)
# Build the processed reference files: each reference sentence is paired with
# the attributes retrieved from the opposite-sentiment training set.
prepare_ref_data_file(conts_from_pos_ref, neg_tgt_attrs_ref_from_pos, data_from_pos_ref, FROM_POS_REF_OUT_FILE_PATH)
prepare_ref_data_file(conts_from_neg_ref, pos_tgt_attrs_ref_from_neg, data_from_neg_ref, FROM_NEG_REF_OUT_FILE_PATH)

100
200
300
400
500
100
200
300
400
500


In [59]:
def calc_closest_attr(i):
    """Rank 10000 randomly sampled attribute vectors by dot product with row ``i``.

    Reads the module-level ``attrs_pos_vecs`` and requires CUDA.
    NOTE(review): ``attrs_pos_vecs`` is not defined in the visible cells —
    confirm where it is meant to come from before using this function.

    Args:
        i: row index into ``attrs_pos_vecs`` of the query vector.

    Returns:
        ``(order, ranked_scores)``: ``order`` holds positions within the random
        sample (the sampled row indices themselves are not returned), sorted by
        descending score; ``ranked_scores`` lists the scores in that order.
    """
    sample_idxs = np.random.randint(0, attrs_pos_vecs.shape[0], size=10000)
    sampled = torch.tensor(attrs_pos_vecs[sample_idxs, :].todense()).cuda()
    query = torch.tensor(attrs_pos_vecs[i].todense()).cuda()
    scores = torch.mm(sampled, query.transpose(0, 1)).squeeze().data.cpu().numpy()
    order = np.argsort(scores)[::-1]
    return order, [scores[k] for k in order]
    
# for ind, _ in enumerate(attrs_pos_vecs):
#     if attrs_pos[ind]:
#         ret, scores = calc_closest_attr(ind)
#         print(conts_pos[ind])
#         print(attrs_pos[ind])
#         print(ret.shape)
#         print('=========')
#         for i, sc in zip(ret[:100],scores[:100]):
#             if attrs_pos[i]:
#                 print(sc)
#                 print(conts_pos[i])
#                 print(repr(attrs_pos[i]))
#                 print('===')
#         break

In [21]:
def calc_closest_attr_edit_distance(attr, attrs_all):
    """Randomly search for an attribute string one token-edit away from ``attr``.

    Draws 10000 random indices (with replacement) into ``attrs_all`` and
    returns the first one whose whitespace-tokenised string has an edit
    distance of exactly 1 from the tokens of ``attr``.

    Args:
        attr: attribute string to find a near neighbour for.
        attrs_all: sequence of candidate attribute strings.

    Returns:
        The index of a matching entry of ``attrs_all``, or ``None`` when no
        sampled candidate matched or ``attrs_all`` is empty. Index 0 is a
        valid result, so callers must compare against ``None`` rather than
        test truthiness.
    """
    if len(attrs_all) == 0:
        # np.random.randint(0, 0) would raise; nothing to sample from.
        return None
    # Tokenise the query once instead of once per sampled candidate.
    target_tokens = attr.split()
    for ind in np.random.randint(0, len(attrs_all), size=10000):
        candidate_tokens = attrs_all[ind].split()
        matcher = edit_distance.SequenceMatcher(a=candidate_tokens, b=target_tokens)
        if matcher.distance() == 1:
            return ind
    return None

In [22]:
def prepare_train_data_file(conts, attrs, data, output_file_path, attr_replacements_file, sample_rate):
    """Write the processed training file, noising a fraction of the attributes.

    For each sentence with a non-empty attribute string, a random search looks
    for another attribute string one token-edit away. When one is found, it
    replaces the original with probability ``sample_rate`` and the
    ``original<TAB>replacement`` pair is logged. Each output line has the form
    ``<ATTR_START> attrs <CONT_START> content <DATA_START> sentence <END>``,
    with every double space collapsed to a single space. A running count is
    printed every 10000 sentences.

    Args:
        conts: content strings, one per sentence.
        attrs: attribute strings, parallel to ``conts``; also the pool that
            replacements are drawn from.
        data: original sentences, parallel to ``conts``.
        output_file_path: UTF-8 output path for the processed lines.
        attr_replacements_file: UTF-8 output path for the replacement log.
        sample_rate: probability of applying a replacement that was found.

    Returns:
        ``(c_noisy, count, c_empty_attr)``: number of sentences whose
        attributes were replaced, number of sentences processed, and number of
        sentences with an empty attribute string.
    """
    outs = []
    repls = []
    c_empty_attr = 0
    c_noisy = 0
    count = 0
    for i, (cont, attr) in enumerate(zip(conts, attrs)):
        count += 1
        attr_noisy = attr
        noisy_ind = None
        if not attr:
            c_empty_attr += 1
        else:
            noisy_ind = calc_closest_attr_edit_distance(attr, attrs)
        # Compare against None: index 0 is a valid match but is falsy, so a
        # plain truthiness test would silently discard it.
        if noisy_ind is not None and random.random() < sample_rate:
            attr_noisy = attrs[noisy_ind]
            c_noisy += 1
            repls.append(attr + '\t' + attr_noisy + '\n')
        fields = [
            spl_tokens['attr_start'], attr_noisy,
            spl_tokens['cont_start'], cont,
            spl_tokens['data_start'], data[i],
            spl_tokens['end'],
        ]
        # An empty attribute or content string leaves two adjacent spaces.
        outs.append((' '.join(fields) + "\n").replace('  ', ' '))
        if count % 10000 == 0:
            print(count)

    with open(output_file_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(outs)

    with open(attr_replacements_file, 'w', encoding='utf-8') as repl_file:
        repl_file.writelines(repls)

    return c_noisy, count, c_empty_attr

In [23]:
# Probability of applying an attribute replacement once a candidate is found.
SAMPLE_RATE = 0.1
# Build the processed training files for both classes and report, per class:
# (sentences noised, sentences processed, sentences with empty attributes).
c_noisy_pos, count_pos, count_empty_arr_pos = prepare_train_data_file(conts_pos, attrs_pos, data_pos, POS_TRAIN_OUT_FILE_PATH, POS_REPLACE_ATTRS_PATH, SAMPLE_RATE)
c_noisy_neg, count_neg, count_empty_arr_neg = prepare_train_data_file(conts_neg, attrs_neg, data_neg, NEG_TRAIN_OUT_FILE_PATH, NEG_REPLACE_ATTRS_PATH, SAMPLE_RATE)
print(c_noisy_pos, count_pos, count_empty_arr_pos, c_noisy_neg, count_neg, count_empty_arr_neg)

#YELP:
#17802 266041 87371 9252 177218 84265

#AMAZON:
#(9819, 277228, 172062, 10760, 277769, 142807)
#(9859, 277228, 172062, 10604, 277769, 142807)
# empty percentages - pos and neg - (0.6206515936341207, 0.5141214462377011)
# percentage samples noised by first sampling and then edit distance - pos and neg - (0.035418500295785416, 0.03873722409628144)
# percentage samples noised by first sampling and then edit distance - pos and neg - (0.03556278586578556, 0.03817560634916063)


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
17802 266041 87371 9252 177218 84265


In [24]:
def prepare_data_file(reference_file_path, output_file_path, attrs_path=None):
    """Write the processed version of a sentence file (no attribute noising).

    Each input line is stripped, tokenised on single spaces and partitioned
    with ``extract_attribute`` using the module-level ``attribute_vocab``.
    Each output line has the form
    ``<ATTR_START> attrs <CONT_START> content <DATA_START> sentence <END>``,
    with every double space collapsed to a single space. A running count is
    printed every 10000 lines.

    Args:
        reference_file_path: UTF-8 input file, one sentence per line.
        output_file_path: UTF-8 output path for the processed lines.
        attrs_path: optional UTF-8 output path; when given, the attribute
            string of every sentence is also written there, one per line.
    """
    train_attrs = []
    outs = []
    with open(reference_file_path, 'r', encoding='utf-8') as f:
        for count, raw in enumerate(f, start=1):
            line = raw.strip()
            cont, attr = extract_attribute(attribute_vocab, line.split(' '))
            attr_str = ' '.join(attr)
            fields = [
                spl_tokens['attr_start'], attr_str,
                spl_tokens['cont_start'], ' '.join(cont),
                spl_tokens['data_start'], line,
                spl_tokens['end'],
            ]
            # An empty attribute or content string leaves two adjacent spaces.
            outs.append((' '.join(fields) + "\n").replace('  ', ' '))
            if attrs_path:
                train_attrs.append(attr_str + "\n")
            if count % 10000 == 0:
                print(count)
    with open(output_file_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(outs)
    if attrs_path:
        # The original referenced the undefined names `conts_path` and
        # `train_conts` here, which raised NameError; write the collected
        # attribute strings to the path the caller supplied.
        with open(attrs_path, 'w', encoding='utf-8') as attrs_file:
            attrs_file.writelines(train_attrs)
# Build the processed validation and test files for both classes.
for _raw_path, _processed_path in (
    (POS_VAL_FILE_PATH, POS_VAL_OUT_FILE_PATH),
    (NEG_VAL_FILE_PATH, NEG_VAL_OUT_FILE_PATH),
    (POS_TEST_FILE_PATH, POS_TEST_OUT_FILE_PATH),
    (NEG_TEST_FILE_PATH, NEG_TEST_OUT_FILE_PATH),
):
    prepare_data_file(_raw_path, _processed_path)

In [25]:
def remove_targets(file_path, out_file_with_input_only_path):
    """Strip the target text from processed lines, keeping only the model input.

    Everything from ``<DATA_START>`` through a line-final ``<END>`` is removed
    and a bare ``<DATA_START>`` is appended, so each output line ends exactly
    where generation should begin. Output lines are joined with newlines (no
    trailing newline).

    Args:
        file_path: processed file whose lines end with
            ``<DATA_START> ... <END>``.
        out_file_with_input_only_path: output path; an existing file is
            overwritten.
    """
    # `\n?\Z` also matches a final line that has no trailing newline; the
    # original pattern required the newline and left such a line's target in.
    target_re = re.compile(r'<DATA_START>.+<END>\n?\Z')
    with open(file_path, 'r') as f:
        test_inputs = [target_re.sub('', s) + '<DATA_START>' for s in f]
    with open(out_file_with_input_only_path, 'w') as fout:
        fout.write('\n'.join(test_inputs))

In [26]:
# Derive the input-only files for the test set and the reference set.
for _processed_path, _inputs_path in (
    (POS_TEST_OUT_FILE_PATH, POS_TEST_OUT_FILE_INPUTS_PATH),
    (NEG_TEST_OUT_FILE_PATH, NEG_TEST_OUT_FILE_INPUTS_PATH),
    (FROM_POS_REF_OUT_FILE_PATH, FROM_POS_REF_OUT_FILE_INPUTS_PATH),
    (FROM_NEG_REF_OUT_FILE_PATH, FROM_NEG_REF_OUT_FILE_INPUTS_PATH),
):
    remove_targets(_processed_path, _inputs_path)

In [27]:
def combine_and_shuffle(pos_file_path, neg_file_path, combined_file_path):
    """Concatenate two files line-wise, shuffle the lines and write the result.

    Args:
        pos_file_path: first UTF-8 input file.
        neg_file_path: second UTF-8 input file.
        combined_file_path: UTF-8 output path; an existing file is overwritten.

    The order comes from ``random.shuffle``, so it is reproducible only if the
    ``random`` module has been seeded by the caller.
    """
    # Context managers close both inputs; the original never closed them.
    with open(pos_file_path, 'r', encoding='utf-8') as fp:
        out = fp.readlines()
    with open(neg_file_path, 'r', encoding='utf-8') as fn:
        out += fn.readlines()
    shuffle(out)
    with open(combined_file_path, 'w', encoding='utf-8') as fo:
        fo.writelines(out)
# Merge the positive and negative processed files of every split into one
# shuffled file each.
for _pos_path, _neg_path, _combined_path in (
    (POS_TRAIN_OUT_FILE_PATH, NEG_TRAIN_OUT_FILE_PATH, COMBINED_TRAIN_OUT_FILE_PATH),
    (POS_VAL_OUT_FILE_PATH, NEG_VAL_OUT_FILE_PATH, COMBINED_VAL_OUT_FILE_PATH),
    (POS_TEST_OUT_FILE_PATH, NEG_TEST_OUT_FILE_PATH, COMBINED_TEST_OUT_FILE_PATH),
    (POS_TEST_OUT_FILE_INPUTS_PATH, NEG_TEST_OUT_FILE_INPUTS_PATH, COMBINED_TEST_OUT_FILE_INPUTS_PATH),
):
    combine_and_shuffle(_pos_path, _neg_path, _combined_path)

## PERFORMANCE CALCULATIONS

In [22]:
import os
from torchnlp.metrics import get_moses_multi_bleu
# Directory holding the model outputs of the run being evaluated; swap the
# active line to evaluate another run.
OUT_DATA = '/home/ubuntu/git-repos/pytorch-pretrained-BERT/runs/amazon_1_plus_3_epoch'
# OUT_DATA = '/home/ubuntu/git-repos/pytorch-pretrained-BERT/runs/yelp_3_epoch'
# Model predictions for the test set and for the reference source sentences.
POS_TEST_FILE_OUT_PATH = os.path.join(OUT_DATA, 'sentiment.test.out.1')
NEG_TEST_FILE_OUT_PATH = os.path.join(OUT_DATA, 'sentiment.test.out.0')
TO_NEG_REF_FILE_OUT_PATH = os.path.join(OUT_DATA, 'sentiment.reference.out.from.1')
TO_POS_REF_FILE_OUT_PATH = os.path.join(OUT_DATA, 'sentiment.reference.out.from.0')
# Reports written by remove_common_parts (words shared by all sides removed).
NO_COMMON_WORDS_TO_NEG_REF = os.path.join(OUT_DATA, 'sentiment.reference.no_common.from.1')
NO_COMMON_WORDS_TO_POS_REF = os.path.join(OUT_DATA, 'sentiment.reference.no_common.from.0')
NO_COMMON_WORDS_POS_TEST = os.path.join(OUT_DATA, 'sentiment.test.no_common.1')
NO_COMMON_WORDS_NEG_TEST = os.path.join(OUT_DATA, 'sentiment.test.no_common.0')

In [23]:
def get_BLEU(preds_file, targets_file):
    """Score a predictions file against a targets file with Moses multi-bleu.

    Both files are read line by line and stripped; the ``<END>`` marker is
    additionally removed from every prediction. Scoring is done by
    ``get_moses_multi_bleu`` with ``lowercase=True``.

    Args:
        preds_file: path of the file with one hypothesis per line.
        targets_file: path of the file with one reference per line.

    Returns:
        Whatever ``get_moses_multi_bleu`` returns for the two lists.
    """
    with open(preds_file) as preds_fp:
        raw_preds = preds_fp.readlines()
    with open(targets_file) as targets_fp:
        raw_targets = targets_fp.readlines()

    reference = [entry.strip() for entry in raw_targets]
    hypotheses = [entry.strip().replace('<END>', '') for entry in raw_preds]
    return get_moses_multi_bleu(hypotheses, reference, lowercase=True)
# Report BLEU for every (hypothesis file, reference file) pairing of interest.
print('Data from : {} and {}'.format(OUT_DATA,DATA))
for _label, _hyp_path, _ref_path in (
    ("Positive test pred - src:", POS_TEST_FILE_OUT_PATH, POS_TEST_FILE_PATH),
    ("Negative test pred - src:", NEG_TEST_FILE_OUT_PATH, NEG_TEST_FILE_PATH),
    ("Positive to Negative reference pred - tgt:", TO_NEG_REF_FILE_OUT_PATH, TO_NEG_REF_FILE_PATH),
    ("Negative to Positive reference pred - tgt:", TO_POS_REF_FILE_OUT_PATH, TO_POS_REF_FILE_PATH),
    ("Positive to Negative reference tgt - src:", TO_NEG_REF_FILE_PATH, FROM_POS_REF_FILE_PATH),
    ("Negative to Positive reference tgt - src:", TO_POS_REF_FILE_PATH, FROM_NEG_REF_FILE_PATH),
    ("Positive to Negative reference pred - src:", TO_NEG_REF_FILE_OUT_PATH, FROM_POS_REF_FILE_PATH),
    ("Negative to Positive reference pred - src:", TO_POS_REF_FILE_OUT_PATH, FROM_NEG_REF_FILE_PATH),
):
    print(_label, get_BLEU(_hyp_path, _ref_path))

Data from : /home/ubuntu/git-repos/pytorch-pretrained-BERT/runs/amazon_1_plus_3_epoch and /home/ubuntu/data/amazon
Positive test pred - src: 92.3
Negative test pred - src: 91.21
Positive to Negative reference pred - tgt: 38.55
Negative to Positive reference pred - tgt: 35.13
Positive to Negative reference tgt - src: 47.6
Negative to Positive reference tgt - src: 43.78
Positive to Negative reference pred - src: 80.84
Negative to Positive reference pred - src: 81.28


In [82]:
def remove_common_parts(source_file, pred_file, target_file=None, out_file=None):
    """Write a report contrasting source, prediction and (optionally) target.

    For every line index of the source file, the words present in the source,
    the prediction and — when ``target_file`` is given — the target are
    treated as common. The report shows each original line followed by the
    same line with the common words removed.

    Args:
        source_file: path of the source sentences, one per line.
        pred_file: path of the predictions, parallel to ``source_file``.
        target_file: optional path of the targets, parallel to ``source_file``.
        out_file: path of the report to write. Despite the ``None`` default it
            is required: ``open(None, 'w')`` raises ``TypeError``.

    Raises:
        IndexError: if the prediction or target file has fewer lines than the
            source file.
    """
    # Context managers close the inputs; the original never closed them.
    with open(source_file) as f:
        srcs = f.readlines()
    with open(pred_file) as f:
        preds = f.readlines()
    tgts = None
    if target_file:
        with open(target_file) as f:
            tgts = f.readlines()

    # Build the whole report before opening the output, so a length mismatch
    # fails without creating or truncating the report file.
    chunks = []
    for i, src in enumerate(srcs):
        pred = preds[i]
        src_words = src.split()
        pred_words = pred.split()
        common = set(src_words) & set(pred_words)
        if target_file:
            tgt = tgts[i]
            tgt_words = tgt.split()
            common &= set(tgt_words)

        # The raw lines keep their own trailing newline.
        chunks.append('SRC : ' + src)
        chunks.append('PRED: ' + pred)
        if target_file:
            chunks.append('TGT : ' + tgt)
        chunks.append('--------------\n')
        chunks.append('SRC_UNIQUE : ' + ' '.join(w for w in src_words if w not in common) + '\n')
        chunks.append('PRED_UNIQUE: ' + ' '.join(w for w in pred_words if w not in common) + '\n')
        if target_file:
            chunks.append('TGT_UNIQUE : ' + ' '.join(w for w in tgt_words if w not in common) + '\n')
        chunks.append('==============\n')

    with open(out_file, 'w') as f:
        f.writelines(chunks)

In [83]:
# Write the "no common words" reports: source vs. prediction for the test
# set, and source vs. prediction vs. target for the reference set.
for _src_path, _pred_path, _tgt_path, _report_path in (
    (POS_TEST_FILE_PATH, POS_TEST_FILE_OUT_PATH, None, NO_COMMON_WORDS_POS_TEST),
    (NEG_TEST_FILE_PATH, NEG_TEST_FILE_OUT_PATH, None, NO_COMMON_WORDS_NEG_TEST),
    (FROM_POS_REF_FILE_PATH, TO_NEG_REF_FILE_OUT_PATH, TO_NEG_REF_FILE_PATH, NO_COMMON_WORDS_TO_NEG_REF),
    (FROM_NEG_REF_FILE_PATH, TO_POS_REF_FILE_OUT_PATH, TO_POS_REF_FILE_PATH, NO_COMMON_WORDS_TO_POS_REF),
):
    remove_common_parts(_src_path, _pred_path, _tgt_path, _report_path)

## DATA PREP FOR TRAINING DEEPMOJI

In [None]:
import pickle
from collections import defaultdict
import os
import random

In [29]:
# Dataset root for the DeepMoji data prep. NOTE: this rebinds DATA and the
# split paths below, replacing the values set for the GPT data prep above.
# DATA = '/home/ubuntu/data/amazon'
DATA = '/home/ubuntu/data/yelp'
POS_TRAIN_FILE_PATH = os.path.join(DATA, 'sentiment.train.1')
NEG_TRAIN_FILE_PATH = os.path.join(DATA,'sentiment.train.0')
POS_VAL_FILE_PATH = os.path.join(DATA, 'sentiment.dev.1')
NEG_VAL_FILE_PATH = os.path.join(DATA,'sentiment.dev.0')
POS_TEST_FILE_PATH = os.path.join(DATA, 'sentiment.test.1')
NEG_TEST_FILE_PATH = os.path.join(DATA,'sentiment.test.0')
# Accumulator for the pickled dataset; its keys are filled in the cells below.
data = defaultdict(list)

In [30]:
def _read_lines(path):
    """Return all lines of ``path`` (newlines kept), closing the file afterwards."""
    with open(path, 'r') as fh:
        return fh.readlines()


# Raw lines of every split; the handles are closed by _read_lines instead of
# being left open as with a bare open(...).readlines().
pos_data = _read_lines(POS_TRAIN_FILE_PATH)
neg_data = _read_lines(NEG_TRAIN_FILE_PATH)
pos_val_data = _read_lines(POS_VAL_FILE_PATH)
neg_val_data = _read_lines(NEG_VAL_FILE_PATH)
pos_test_data = _read_lines(POS_TEST_FILE_PATH)
neg_test_data = _read_lines(NEG_TEST_FILE_PATH)

In [31]:
def _with_label(lines, label):
    """Pair every stripped line with its own ``{'label': label}`` dict."""
    return [(line.strip(), {'label': label}) for line in lines]


# Label every sentence (1 = positive, 0 = negative) per split.
pos = _with_label(pos_data, 1)
neg = _with_label(neg_data, 0)
pos_val = _with_label(pos_val_data, 1)
neg_val = _with_label(neg_val_data, 0)
pos_test = _with_label(pos_test_data, 1)
neg_test = _with_label(neg_test_data, 0)

# Merge both classes within each split, then shuffle each split in place.
pos_neg = pos + neg
pos_neg_val = pos_val + neg_val
pos_neg_test = pos_test + neg_test
for _split in (pos_neg, pos_neg_val, pos_neg_test):
    random.shuffle(_split)

In [32]:
# Flatten the three splits into parallel 'texts' / 'info' lists, laid out as
# train, then val, then test; the *_ind entries record the index range that
# each split occupies within those lists.
data[str('texts')] = [i[0] for i in pos_neg]+[i[0] for i in pos_neg_val]+[i[0] for i in pos_neg_test]
data[str('info')] = [i[1] for i in pos_neg]+[i[1] for i in pos_neg_val]+[i[1] for i in pos_neg_test]
data[str('train_ind')] = list(range(len(pos_neg)))
data[str('val_ind')] = list(range(len(pos_neg),len(pos_neg)+len(pos_neg_val)))
data[str('test_ind')] = list(range(len(pos_neg)+len(pos_neg_val),len(pos_neg)+len(pos_neg_val)+len(pos_neg_test)))

In [33]:
# Persist the dataset as a plain dict with pickle protocol 2. The context
# manager flushes and closes the file deterministically; the original left
# the handle open.
with open('/home/ubuntu/raw.pickle', 'wb') as _pickle_out:
    pickle.dump(dict(data), _pickle_out, protocol=2)