In [2]:
import numpy as np
import pandas as pd

In [3]:
captions = np.load('data/mscoco/all_captions.npy')
captions = [c[:5] for c in captions]
sentences = []
for c in captions:
    sentences += c

In [None]:
# build infersent
import torch as t
import sys
sys.path.append('../InferSent/encoder')

INFERSENT_PATH = '../InferSent/encoder/'
GLOVE_PATH = '/home/aleksey.zotov/InferSent/dataset/GloVe/glove.840B.300d.txt'

infersent_model = t.load(INFERSENT_PATH + 'infersent.allnli.pickle', map_location=lambda storage, loc: storage)
# infersent_model = t.load(INFERSENT_PATH + 'infersent.allnli.pickle', map_location={'cuda:1' : 'cuda:0', 'cuda:2' : 'cuda:0'})
# infersent_model = t.load(INFERSENT_PATH + 'infersent.allnli.pickle')
infersent_model.set_glove_path(GLOVE_PATH)
infersent_model.build_vocab_k_words(K=200000)

# infersent embs
def get_infersent_emb(X):
    embeddings = infersent_model.encode(X, bsize=64, tokenize=False, verbose=False)
    return embeddings

In [5]:
import torch as t
import utils
from utils.batch_loader import BatchLoader
from model.parameters import Parameters
from model.paraphraser import Paraphraser

class Args(object):
    pass

args = Args()
args.batch_size = 32
args.seq_len=30
args.use_cuda = True

args.use_cuda = True
args.model_name = 'snli_200kit_600_800'

datasets = set()
datasets.add('snli')


sentences = np.array([utils.batch_loader.clean_str(s) for s in sentences])
captions = [[utils.batch_loader.clean_str(s) for s in bucket] for bucket in captions]

batch_loader = BatchLoader(path='', datasets=datasets)
batch_loader.build_input_vocab(sentences)

parameters = Parameters(batch_loader.max_seq_len, batch_loader.vocab_size)
paraphraser = Paraphraser(parameters)

paraphraser.load_state_dict(t.load('saved_models/trained_paraphraser_' + args.model_name))
if args.use_cuda:
    paraphraser = paraphraser.cuda()


SNLI: train: 183416, test: 3368
ALL: train: 183416, test: 3368
Found 20514(/21730) words with glove vectors
Vocab size : 20514
Found 30481(/26837) words with glove vectors
Vocab size : 30481


In [157]:
# PARAPHRASER EXPAND
def paraphrases_from_sources(sources, use_mean=True):
    result = []
    for s1,s2 in zip(sources[0], sources[1]):
        input = batch_loader.input_from_sentences([[s1],[s2]])
        input = [var.cuda() if args.use_cuda else var for var in input]
        result += [paraphraser.sample_with_input(batch_loader, args.seq_len, args.use_cuda, use_mean, input)]
    
    return result

def paraphrase_expand(X, y, n_paraphrases, pure_paraphrases):
    if n_paraphrases <= 0:
        return X,y
    
    X_gen, y_gen = [], []
    y = np.array(y)
    X = np.array(X)
    for class_id in np.unique(y):
        X_class = X[y == class_id]
        idx = []
        for i in range(len(X_class)):
            for j in range(len(X_class)):
                if i == j and len(X_class) != 1:
                    continue
                idx.append((i,j))
                
        idx = np.array(idx)
        idx = idx[np.random.choice(list(range(len(idx))), n_paraphrases, replace=False)]
        sources = [X_class[idx[:,0]], X_class[idx[:,1]]]
        X_gen += [paraphrases_from_sources(sources)]
        y_gen += [[class_id] * n_paraphrases]
    
    if pure_paraphrases:
        return np.concatenate(X_gen), np.concatenate(y_gen)
    
    X_result = np.concatenate([X] + X_gen)
    y_result = np.concatenate([y] + y_gen)
    
    return X_result, y_result

# Some paraphrasing samples

In [7]:
paraphrases_from_sources([
    ['Woman sits near the table with her dog'],
    ['Very old woman is sitting with her child on the table']], use_mean=False)

[' a blond woman in a black sweater sitting at the table with her dog']

In [9]:
paraphrases_from_sources([
    ['Woman sits near the table with her dog'],
    ['Very old woman is sitting with her child on the table']], use_mean=False)

[' an old woman is sitting on a bench with her dog in the background']

In [8]:
paraphrases_from_sources([
    ['man is chopping old wood with an axe'],
    ['very old man is outside']], use_mean=True)

[' older man is outside']

In [28]:
cid = 0
captions[cid][0], captions[cid][1], paraphrases_from_sources([[captions[cid][0]], [captions[cid][1]]], use_mean=True)[0]

('a bicycle replica with a clock as the front wheel',
 'the bike has a clock as a tire',
 ' the bicycle has a marker')

In [29]:
cid = 1
captions[cid][0], captions[cid][1], paraphrases_from_sources([[captions[cid][0]], [captions[cid][1]]], use_mean=True)[0]

('a room with blue walls and a white sink and door',
 'blue and white color scheme in a small bathroom',
 ' there is a room covered in a room')

In [30]:
cid = 1337
captions[cid][0], captions[cid][1], paraphrases_from_sources([[captions[cid][0]], [captions[cid][1]]], use_mean=True)[0]

('a small white toilet sitting in a bathroom',
 'the toilet in the small stall has the lit up',
 ' small inside of the water with the mirror of the mirror')

In [64]:
# GLOVE EMB using batch loader
def get_glove_emb(sentence):
    emb_size = 300
    embed = np.zeros((len(sentence), emb_size))
    for i, s in enumerate(sentence.strip().split()):
        if s in batch_loader.word_vec.keys(): 
            embed[i, :] = batch_loader.word_vec[s]
        else:
            embed[i, :] = batch_loader.word_vec['null']
    return embed

In [159]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

def get_f1_score(y_true, y_pred):
    return precision_recall_fscore_support(y_true=np.array(y_true), y_pred=y_pred)[2]

def split_and_test(model_function, X_all, y_all, n_samples, n_paraphrases, pure_paraphrases=False):
    n_classes = len(np.unique(y_all))
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, stratify=y_all, train_size=n_samples * n_classes)
    X_train, y_train = paraphrase_expand(X_train, y_train, n_paraphrases, pure_paraphrases)
    predicted_proba = model_function(X_train, X_test, y_train)
    return np.mean(get_f1_score(y_test, predicted_proba))

In [66]:
from sklearn.linear_model import LogisticRegression
def bag_of_words_function(X_train, X_test, y_train):
    def get_avg_emb(X):
        return np.array([get_glove_emb(x).mean(axis=0) for x in X])
        
    X_train = get_avg_emb(X_train)
    X_test = get_avg_emb(X_test)
    classifier = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=100, n_jobs=-1)
    classifier.fit(X_train, y_train)
    return classifier.predict(X_test)

In [67]:
def build_xy_sampled(n_classes):
    assert n_classes <= len(captions)
    sampled_captions_id = np.random.choice(list(range(len(captions))), size=n_classes, replace=False)
    x_all, y_all = [], []
    for i in sampled_captions_id:
        y_all += [i] * len(captions[i])
        for s in captions[i]:
            x_all.append(s)
    return x_all, y_all

In [68]:
import warnings
warnings.filterwarnings('ignore')

In [131]:
from tqdm import tqdm
def run(function, n_samples, n_paraphrases, n_classes, averaging_order, pure_paraphrases):
    result_f1 = []
    for i in tqdm(range(averaging_order), position=0):
        X,y = build_xy_sampled(n_classes)
        result_f1.append(split_and_test(function, X, y, n_samples, n_paraphrases, pure_paraphrases))
    return np.mean(result_f1), np.std(result_f1)

In [96]:
run(bag_of_words_function, n_samples=3, n_paraphrases=0, n_classes=30, averaging_order=30)

100%|██████████| 30/30 [00:08<00:00,  3.66it/s]


0.7288694885361552

In [97]:
run(bag_of_words_function, n_samples=3, n_paraphrases=1, n_classes=30, averaging_order=30)

100%|██████████| 30/30 [00:20<00:00,  1.45it/s]


0.7080379188712522

In [98]:
run(bag_of_words_function, n_samples=3, n_paraphrases=2, n_classes=30, averaging_order=30)

100%|██████████| 30/30 [00:33<00:00,  1.10s/it]


0.6958606701940034

In [99]:
run(bag_of_words_function, n_samples=3, n_paraphrases=3, n_classes=30, averaging_order=30)

100%|██████████| 30/30 [00:45<00:00,  1.52s/it]


0.6927680776014109

In [105]:
run(bag_of_words_function, n_samples=2, n_paraphrases=0, n_classes=30, averaging_order=30)

100%|██████████| 30/30 [00:05<00:00,  5.08it/s]


(0.683664656331323, 0.051144851062405926)

In [106]:
run(bag_of_words_function, n_samples=2, n_paraphrases=1, n_classes=30, averaging_order=30)

100%|██████████| 30/30 [00:20<00:00,  1.49it/s]


(0.6569659291325959, 0.06764993699409934)

In [103]:
run(bag_of_words_function, n_samples=1, n_paraphrases=0, n_classes=30, averaging_order=30)

100%|██████████| 30/30 [00:05<00:00,  5.41it/s]


(0.5827798489939012, 0.0555059242989261)

In [128]:
run(bag_of_words_function, n_samples=1, n_paraphrases=1, n_classes=30, averaging_order=30)

100%|██████████| 30/30 [00:18<00:00,  1.65it/s]


(0.5581126469467544, 0.059155015604891996)

In [161]:
run(bag_of_words_function, n_samples=3, n_paraphrases=1, n_classes=30, averaging_order=30, pure_paraphrases=True)

100%|██████████| 30/30 [00:17<00:00,  1.72it/s]


(0.4411393862129156, 0.09088604184889583)

In [160]:
run(bag_of_words_function, n_samples=3, n_paraphrases=2, n_classes=30, averaging_order=30, pure_paraphrases=True)

100%|██████████| 30/30 [00:30<00:00,  1.00s/it]


(0.5174571107904441, 0.066590618119258)

In [162]:
run(bag_of_words_function, n_samples=3, n_paraphrases=3, n_classes=30, averaging_order=30, pure_paraphrases=True)

100%|██████████| 30/30 [00:44<00:00,  1.47s/it]


(0.5627104870438203, 0.06960608459769967)

In [163]:
run(bag_of_words_function, n_samples=4, n_paraphrases=4, n_classes=30, averaging_order=30, pure_paraphrases=True)

100%|██████████| 30/30 [00:58<00:00,  1.95s/it]


(0.5604814814814815, 0.09339754551869299)

In [164]:
run(bag_of_words_function, n_samples=4, n_paraphrases=6, n_classes=30, averaging_order=30, pure_paraphrases=True)

100%|██████████| 30/30 [01:22<00:00,  2.76s/it]


(0.6011481481481481, 0.13231602129921918)

In [112]:
# run(bag_of_words_function, n_samples=2, n_paraphrases=1, n_classes=30, averaging_order=30)

In [119]:
paraphrases_from_sources([['a table with a quesadilla salad and drinks'],['a sliced pizza a salad and two milk shakes on a table']], use_mean=True)

[' a table filled with a spoon and a few empty plate at a table']

In [None]:
paraphrases_from_sources([['a table with a quesadilla salad and drinks'],['a sliced pizza a salad and two milk shakes on a table']], use_mean=True)

In [1]:
# !export CUDA_VISIBLE_DEVICES=1 
!echo $CUDA_VISIBLE_DEVICES

1
