In [1]:
import torch
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import os
import nltk
import string
from sympy.utilities.iterables import multiset_permutations

In [3]:
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')

Using cache found in C:\Users\bavik/.cache\torch\hub\pytorch_fairseq_master


In [4]:
data_dir = "./data/"
embed_size = 1024

In [5]:
requests = pd.read_table(data_dir + "train.tsv", sep = '\t', header = 0).drop_duplicates('topic_id')
df = pd.DataFrame(requests)
init_requests = df['initial_request'].to_numpy(dtype = str)
topic_ids = df['topic_id'].to_numpy(dtype = int)
clarification_need = df['clarification_need'].to_numpy(dtype = int)
req_data = [(topic_ids[i], init_requests[i], clarification_need[i]) for i in range(len(init_requests))]
req_data = np.array(req_data)
inputs = req_data[:,1]
labels = req_data[:,2]
topics = req_data[:,0]

In [6]:
inputs = np.array([req.lower() for req in inputs])
inputs = np.array([req.strip() for req in inputs])
inputs = np.array([req.translate(str.maketrans('', '', string.punctuation)) for req in inputs])
inputs = np.array([np.array(nltk.tokenize.word_tokenize(req)) for req in inputs])

In [7]:
inputs

array([array(['tell', 'me', 'about', 'obama', 'family', 'tree'], dtype='<U6'),
       array(['what', 'is', 'fickle', 'creek', 'farm'], dtype='<U6'),
       array(['tell', 'me', 'about', 'sonoma', 'county', 'medical', 'services'],
      dtype='<U8'),
       array(['tell', 'me', 'about', 'of', 'ralph', 'owen', 'brester'],
      dtype='<U7'),
       array(['im', 'looking', 'for', 'information', 'about', 'mayo', 'clinic',
       'jacksonville', 'fl'], dtype='<U12'),
       array(['how', 'to', 'prepare', 'for', 'the', 'gmat'], dtype='<U7'),
       array(['id', 'like', 'to', 'learn', 'about', 'lymphoma', 'in', 'dogs'],
      dtype='<U8'),
       array(['is', 'kenmore', 'heater', 'good'], dtype='<U7'),
       array(['tell', 'me', 'more', 'about', 'hp', 'mini', '2140'], dtype='<U5'),
       array(['tell', 'me', 'about', 'the', 'pacific', 'northwest', 'laboratory'],
      dtype='<U10'),
       array(['what', 'is', 'california', 'franchise', 'tax', 'board'],
      dtype='<U10'),
       array(['w

In [8]:
inputs_aug = []
labels_aug = []
for i in tqdm(range(len(inputs))):
    perms = list(multiset_permutations(inputs[i]))[:5]
    for p in perms:
        inputs_aug.append(np.array(p, dtype = np.str))
        labels_aug.append(labels[i])

HBox(children=(FloatProgress(value=0.0, max=187.0), HTML(value='')))




In [10]:
inputs_aug

[array(['about', 'family', 'me', 'obama', 'tell', 'tree'], dtype='<U6'),
 array(['about', 'family', 'me', 'obama', 'tree', 'tell'], dtype='<U6'),
 array(['about', 'family', 'me', 'tell', 'obama', 'tree'], dtype='<U6'),
 array(['about', 'family', 'me', 'tell', 'tree', 'obama'], dtype='<U6'),
 array(['about', 'family', 'me', 'tree', 'obama', 'tell'], dtype='<U6'),
 array(['creek', 'farm', 'fickle', 'is', 'what'], dtype='<U6'),
 array(['creek', 'farm', 'fickle', 'what', 'is'], dtype='<U6'),
 array(['creek', 'farm', 'is', 'fickle', 'what'], dtype='<U6'),
 array(['creek', 'farm', 'is', 'what', 'fickle'], dtype='<U6'),
 array(['creek', 'farm', 'what', 'fickle', 'is'], dtype='<U6'),
 array(['about', 'county', 'me', 'medical', 'services', 'sonoma', 'tell'],
       dtype='<U8'),
 array(['about', 'county', 'me', 'medical', 'services', 'tell', 'sonoma'],
       dtype='<U8'),
 array(['about', 'county', 'me', 'medical', 'sonoma', 'services', 'tell'],
       dtype='<U8'),
 array(['about', 'county', 

In [11]:
' '.join(inputs_aug[0])

'about family me obama tell tree'

In [12]:
inputs = [' '.join(req) for req in inputs_aug]
inputs

['about family me obama tell tree',
 'about family me obama tree tell',
 'about family me tell obama tree',
 'about family me tell tree obama',
 'about family me tree obama tell',
 'creek farm fickle is what',
 'creek farm fickle what is',
 'creek farm is fickle what',
 'creek farm is what fickle',
 'creek farm what fickle is',
 'about county me medical services sonoma tell',
 'about county me medical services tell sonoma',
 'about county me medical sonoma services tell',
 'about county me medical sonoma tell services',
 'about county me medical tell services sonoma',
 'about brester me of owen ralph tell',
 'about brester me of owen tell ralph',
 'about brester me of ralph owen tell',
 'about brester me of ralph tell owen',
 'about brester me of tell owen ralph',
 'about clinic fl for im information jacksonville looking mayo',
 'about clinic fl for im information jacksonville mayo looking',
 'about clinic fl for im information looking jacksonville mayo',
 'about clinic fl for im infor

In [13]:
inputs_enc = []
for req in inputs:
    inputs_enc.append(roberta.encode(req))

In [15]:
inputs_enc[3]

tensor([   0, 9006,  284,  162, 1137, 3907, 6168, 2583,    2])

In [14]:
roberta.extract_features(inputs)

tensor([[[-0.0109,  0.0153,  0.0858,  ...,  0.0126,  0.1339,  0.0440],
         [-0.0134,  0.1727, -0.1304,  ...,  0.0917, -0.0282,  0.1001],
         [ 0.6296, -0.4084, -0.0844,  ..., -0.2051,  0.1361,  1.3963],
         ...,
         [-0.0194,  0.0678,  0.5043,  ..., -0.3590,  1.5590,  1.0416],
         [ 0.0337,  0.1827, -0.0480,  ..., -0.2606,  0.3240,  0.4156],
         [ 0.0660, -0.0147,  0.1079,  ..., -0.0040,  0.1158,  0.0755]]],
       grad_fn=<TransposeBackward0>)

In [None]:
inputs_enc = [roberta.extract_features(input) for input in inputs_enc]

In [None]:
inputs_enc[0].shape

In [None]:
max_len = 0
for i in range(len(inputs_enc)):
    if inputs_enc[i].shape[1]>max_len:
        max_len = inputs_enc[i].shape[1]

In [None]:
max_len = 19

In [None]:
inputs_enc = [torch.squeeze(input) for input in inputs_enc]

In [None]:
inputs_enc_padded =  torch.zeros((len(inputs_enc),max_len,embed_size), dtype=float)

In [None]:
for i in tqdm(range(len(inputs_enc))):
    pad_length = max_len - inputs_enc[i].shape[0]
    inputs_enc_padded[i] = torch.cat((inputs_enc[i],torch.zeros((pad_length, embed_size))))

In [None]:
inputs_enc_padded

In [None]:
torch.save(inputs_enc_padded, 'train_embeds_roberta_augmented.pt')

In [None]:
labels_aug

In [None]:
np.save('labels_aug_roberta.npy', labels_aug)

In [None]:
df_dev = pd.read_table(data_dir + "dev.tsv", sep = '\t').drop_duplicates('topic_id')
init_requests_dev = df_dev['initial_request'].to_numpy(dtype = str)
topic_ids_dev = df_dev['topic_id'].to_numpy(dtype = int)
clarification_need_dev = df_dev['clarification_need'].to_numpy(dtype = int)
req_data_dev = [(topic_ids_dev[i], init_requests_dev[i], clarification_need_dev[i]) for i in range(len(init_requests_dev))]
req_data_dev = np.array(req_data_dev)
inputs_dev = req_data_dev[:,1]
labels_dev = req_data_dev[:,2]
topics_dev = req_data_dev[:,0]
inputs_enc_dev = []
for req in inputs_dev:
    inputs_enc_dev.append(roberta.encode(req))

In [None]:
roberta.extract_features(inputs_enc_dev[3]).shape

In [None]:
inputs_enc_dev = [roberta.extract_features(input) for input in inputs_enc_dev]

In [None]:
max_len = 0
for i in range(len(inputs_enc_dev)):
    if inputs_enc_dev[i].shape[1]>max_len:
        max_len = inputs_enc_dev[i].shape[1]

In [None]:
max_len

In [None]:
inputs_enc_dev = [torch.squeeze(input) for input in inputs_enc_dev]
inputs_enc_padded_dev =  torch.zeros((len(inputs_enc_dev),max_len,embed_size), dtype=float)
for i in tqdm(range(len(inputs_enc_dev))):
    pad_length = max_len - inputs_enc_dev[i].shape[0]
    inputs_enc_padded_dev[i] = torch.cat((inputs_enc_dev[i],torch.zeros((pad_length, embed_size))))

In [None]:
inputs_enc_padded_dev.shape

In [None]:
torch.save(inputs_enc_padded_dev, 'dev_embeds_roberta.pt')

In [None]:
topics