In [1]:
from model import SentenceVAE
import pandas as pd
import random

In [68]:
# tokenize data 
def preprocess_and_tokenise(data):
    sk_word_tokenize = cv.build_tokenizer()
    sk_preprocesser = cv.build_preprocessor()
    tokenize = lambda doc: sk_word_tokenize(sk_preprocesser(doc))
    data_tokenised=[tokenize(dat) for dat in data]
    return data_tokenised


train_tokenised=preprocess_and_tokenise(train)
test_tokenised=preprocess_and_tokenise(test)
valid_tokenised=preprocess_and_tokenise(valid)


In [99]:
# create vocab from training data
cv = sklearn.feature_extraction.text.CountVectorizer(lowercase=True).fit(train)

vocab=dict(sorted(cv.vocabulary_.items(),key=lambda x: x[1]))
# +4 to all indexs to make way for special tokens
vocab={k:v+4 for k,v in vocab.items()}

# add special tokens
special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
for i,token in enumerate(special_tokens):
    vocab[token]=i

w2i=dict(sorted(vocab.items(),key=lambda x: x[1]))
i2w={i:w for w,i in w2i.items()}

print("Vocablurary of %i keys created." %len(w2i))

# store w2i and i2w in dict
vocab=dict(w2i=w2i, i2w=i2w)

Vocablurary of 8293 keys created.


In [129]:
# function to prepare inputs
from collections import defaultdict

def prepare_data(data,max_sequence_length=50):
    data_processed = defaultdict(dict)

    inputs=[['<sos>']+tokens for tokens in data]
    inputs=[tokens[:max_sequence_length] for tokens in inputs]

    targets=[tokens[:max_sequence_length-1] for tokens in data]
    targets=[tokens+['<eos>'] for tokens in targets]

    lengths=[len(tokens) for tokens in inputs]

    [inputs[i].extend(['<pad>'] * (max_sequence_length-lengths[i])) for i in range(len(inputs))]
    [targets[i].extend(['<pad>'] * (max_sequence_length-lengths[i])) for i in range(len(targets))]

    inputs=[[w2i.get(w,w2i['<unk>']) for w in tokens] for tokens in inputs]
    targets=[[w2i.get(w,w2i['<unk>']) for w in tokens] for tokens in targets]

    for i in range(len(data)):
        data_processed[i]['input']=inputs[i]
        data_processed[i]['target']=targets[i]
        data_processed[i]['length']=lengths[i]

    return data_processed

    
train_processed= prepare_data(train_tokenised)

In [None]:
from sklearn.model_selection import train_test_split

es_conv=pd.read_json('data/ESConv.json')
es_conv['supporter_utterances']=es_conv.dialog.apply(lambda x : process_dialogue(x)['supporter'])

supporter_utts=[utt for utts in es_conv.supporter_utterances.values.tolist() for utt in utts]
supporter_utts
train,test=train_test_split(supporter_utts,test_size=.15, random_state=42)
train,valid=train_test_split(supporter_utts, test_size=.15, random_state=42)

print("Train:", len(train))
print("Test:", len(test))
print("Validation:", len(valid))

In [158]:
from torch.utils.data import Dataset
import sklearn.feature_extraction.text 
import sklearn.preprocessing

import numpy as np


class ESCONV(Dataset):
    def __init__(self, data_dir='data/ESConv.json', load_split='train',  max_sequence_length=50):
        super().__init__()

        self.max_sequence_length=max_sequence_length
        self.es_conv=pd.read_json('data/ESConv.json')
       
        # i am only interested in supporter utterances for now
        self.es_conv['supporter_utterances']=es_conv.dialog.apply(lambda x : self._retrieve_utterances(x)['supporter'])
        supporter_utts=[utt for utts in es_conv.supporter_utterances.values.tolist() for utt in utts]

        train,test=train_test_split(supporter_utts,test_size=.15, random_state=42)
        train,valid=train_test_split(supporter_utts, test_size=.15, random_state=42)

        print("Train:", len(train))
        print("Test:", len(test))
        print("Validation:", len(valid))

        # fit cv on train
        self.cv = sklearn.feature_extraction.text.CountVectorizer(lowercase=True)
        self.cv.fit(train)

        self.splits={
            'train':self._preprocess_and_tokenise_data(train),
            'valid': self._preprocess_and_tokenise_data(valid),
            'test':self._preprocess_and_tokenise_data(test)
            }


        #initialise vocab
        self._create_vocab()

        self.data=self._prepare_data(split=load_split)
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {
            'input': np.asarray(self.data[idx]['input']),
            'target': np.asarray(self.data[idx]['target']),
            'length': self.data[idx]['length']
        }

    def _preprocess_and_tokenise_data(self,data):
        sk_word_tokenize = self.cv.build_tokenizer()
        sk_preprocesser = self.cv.build_preprocessor()
        tokenize = lambda doc: sk_word_tokenize(sk_preprocesser(doc))
        data_tokenised=[tokenize(dat) for dat in data]
        
        return data_tokenised

    def _retrieve_utterances(self, dialog):
        """
        takes a dialog, returns a speaker:[utt list] dict 
        """

        prev_speaker=None
        curr_speaker=None
    
        all_utterances=dict(zip(['seeker','supporter'],[[] for _ in range(2)]))

        for i,item in enumerate(dialog):
            prev_speaker=curr_speaker

            curr_speaker=item['speaker'].strip()
            curr_utt=item['content'].strip()
            
            if curr_speaker==prev_speaker:  # concat curr utterance to previous utterance"
                all_utterances[curr_speaker][-1]=f"{all_utterances[curr_speaker][-1]}. {curr_utt}"

            else:
                all_utterances[curr_speaker].append(curr_utt)
        
        #assert(len(all_utterances['seeker'])==len(all_utterances['supporter']))

        return all_utterances


    def _prepare_data(self,split='train'):
        data=self.splits[split]
        
        data_processed = defaultdict(dict)
        inputs=[['<sos>']+tokens for tokens in data]
        inputs=[tokens[:self.max_sequence_length] for tokens in inputs]

        targets=[tokens[:self.max_sequence_length-1] for tokens in data]
        targets=[tokens+['<eos>'] for tokens in targets]

        lengths=[len(tokens) for tokens in inputs]

        [inputs[i].extend(['<pad>'] * (self.max_sequence_length-lengths[i])) for i in range(len(inputs))]
        [targets[i].extend(['<pad>'] * (self.max_sequence_length-lengths[i])) for i in range(len(targets))]

        inputs=[[self.w2i.get(w,self.w2i['<unk>']) for w in tokens] for tokens in inputs]
        targets=[[self.w2i.get(w,self.w2i['<unk>']) for w in tokens] for tokens in targets]

        for i in range(len(data)):
            data_processed[i]['input']=inputs[i]
            data_processed[i]['target']=targets[i]
            data_processed[i]['length']=lengths[i]
        
        return data_processed
    
    def _create_vocab(self):
        # create vocab from trainset
        self.vocab=dict(sorted(self.cv.vocabulary_.items(),key=lambda x: x[1]))
        # +4 to all indexs to make way for special tokens
        self.vocab={k:v+4 for k,v in self.vocab.items()}

        # add special tokens
        self.special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for i,token in enumerate(special_tokens):
            self.vocab[token]=i

        self.w2i=dict(sorted(self.vocab.items(),key=lambda x: x[1]))
        self.i2w={i:w for w,i in w2i.items()}

        # store w2i and i2w in dict
        self.vocab=dict(w2i=w2i, i2w=i2w)
    
    @property
    def vocab_size(self):
        return len(self.w2i)

    @property
    def pad_idx(self):
        return self.w2i['<pad>']

    @property
    def sos_idx(self):
        return self.w2i['<sos>']

    @property
    def eos_idx(self):
        return self.w2i['<eos>']

    @property
    def unk_idx(self):
        return self.w2i['<unk>']

    def get_w2i(self):
        return self.w2i

    def get_i2w(self):
        return self.i2w

trainset=ESCONV(load_split='train')
valset=ESCONV(load_split='valid')
testset=ESCONV(load_split='test')

Train: 13026
Test: 2299
Validation: 2299
Train: 13026
Test: 2299
Validation: 2299
Train: 13026
Test: 2299
Validation: 2299


In [164]:
datasets=dict(zip(['train', 'valid','test'],[trainset,valset,testset]))

In [165]:
from model import SentenceVAE

params = dict(
    vocab_size=datasets['train'].vocab_size,
    sos_idx=datasets['train'].sos_idx,
    eos_idx=datasets['train'].eos_idx,
    pad_idx=datasets['train'].pad_idx,
    unk_idx=datasets['train'].unk_idx,
    max_sequence_length=50,
    embedding_size=300,
    rnn_type='gru',
    hidden_size=256,
    word_dropout=0,
    embedding_dropout=.5,
    latent_size=16,
    num_layers=1,
    bidirectional=True
)

model=SentenceVAE(**params)

In [170]:
from torch.utils.data import DataLoader
import torch
from multiprocessing import cpu_count


data_loader = DataLoader(
    dataset=datasets['train'],
    batch_size=16,
    shuffle=True,
    pin_memory=torch.cuda.is_available()
    )

batch=next(iter(data_loader))

In [172]:
model.train()

model(batch['input'], batch['length'])

(tensor([[[-9.1588, -8.6968, -8.9289,  ..., -8.7510, -9.2895, -8.9929],
          [-9.5271, -9.0473, -9.0330,  ..., -9.1774, -9.0350, -8.8178],
          [-9.3848, -8.9214, -9.2440,  ..., -8.9341, -9.2758, -9.0847],
          ...,
          [-9.0099, -9.0418, -9.0322,  ..., -9.0084, -9.0237, -9.0088],
          [-9.0099, -9.0418, -9.0322,  ..., -9.0084, -9.0237, -9.0088],
          [-9.0099, -9.0418, -9.0322,  ..., -9.0084, -9.0237, -9.0088]],
 
         [[-9.4589, -8.9679, -8.8549,  ..., -9.1017, -8.9892, -8.9292],
          [-9.1719, -9.0426, -9.1878,  ..., -9.0510, -8.6918, -9.1229],
          [-8.8628, -8.8871, -9.0365,  ..., -9.2286, -8.9645, -8.7401],
          ...,
          [-9.0099, -9.0418, -9.0322,  ..., -9.0084, -9.0237, -9.0088],
          [-9.0099, -9.0418, -9.0322,  ..., -9.0084, -9.0237, -9.0088],
          [-9.0099, -9.0418, -9.0322,  ..., -9.0084, -9.0237, -9.0088]],
 
         [[-8.8912, -8.9500, -8.8911,  ..., -8.6639, -9.4993, -9.4273],
          [-8.5864, -8.9749,