# 0. Load required modules.

In [3]:
!python -m spacy download en

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import pickle
from pathlib import Path

import datasets
import numpy as np
import spacy
import torch
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig, RobertaForSequenceClassification

# 1. Load tokenizer and dependency extraction module.

In [3]:
# spaCy pipeline; its parse supplies the `token.head` links used when building the dataset.
nlp = spacy.load('en_core_web_sm')

# RoBERTa tokenizer built from local vocabulary and merges files.
# NOTE(review): absolute, user-specific paths - these will not resolve on another machine.
tokenizer = RobertaTokenizer(
    vocab_file='/home/skhong/jiant/roberta/wsc/models/roberta-base/tokenizer/vocab.json',
    merges_file='/home/skhong/jiant/roberta/wsc/models/roberta-base/tokenizer/merges.txt',
)

# 2. Load Dataset
- Preprocessed dataset used in this study.

In [34]:
# Load the SuperGLUE WSC train split straight from a locally cached Arrow file.
# NOTE(review): absolute, user-specific cache path - it will not resolve on another machine.
dataset = datasets.Dataset.from_file("/home/skhong/.cache/huggingface/datasets/super_glue/wsc/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/super_glue-train.arrow")

## 2.1 Dataset Structure
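- Each WSC example consists of a passage (`text`) and two spans (`span1_text`, `span2_text`). The code below treats the passage as the first sentence and the two span texts, joined by a space, as the second sentence, and assumes that a [SEP] token is included between the two sentences.
- Therefore, relationships between words must be extracted within each sentence separately.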

In [35]:
# Display the dataset summary (feature schema and number of rows).
dataset

Dataset(features: {'text': Value(dtype='string', id=None), 'span1_index': Value(dtype='int32', id=None), 'span2_index': Value(dtype='int32', id=None), 'span1_text': Value(dtype='string', id=None), 'span2_text': Value(dtype='string', id=None), 'idx': Value(dtype='int32', id=None), 'label': ClassLabel(num_classes=2, names=['False', 'True'], names_file=None, id=None)}, num_rows: 554)

In [36]:
# Show the passage (`text`) of the first example.
dataset['text'][0]

'Mark told Pete many lies about himself, which Pete included in his book. He should have been more skeptical.'

In [39]:
# Show the second span (`span2_text`) of the first example.
dataset['span2_text'][0]

'He'

# 3. `random_value_except` Function
- A helper that randomly selects one value from a list while excluding a given value. It is used below to pick a random token position that differs from the position of the dependency head.

In [6]:
import random

def random_value_except(lst, excluded_value):
    """Pick a random element of ``lst`` that is not equal to ``excluded_value``.

    Args:
        lst: Iterable of candidate values.
        excluded_value: Value that must not be returned.

    Returns:
        A randomly chosen element different from ``excluded_value``, or
        ``None`` when no such element exists (including an empty ``lst``).
    """
    candidates = list(filter(lambda item: item != excluded_value, lst))
    return random.choice(candidates) if candidates else None


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# English stop words from NLTK; passed as `stop_words` to every TfidfVectorizer
# built in get_tfidf_vector.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded beforehand - confirm.
stop_words_list = stopwords.words("english")

def get_tfidf_vector(sentence, i):
    """Fit a TF-IDF vectorizer on ``sentence`` and return the scores with the fitted vectorizer.

    Args:
        sentence: Iterable of document strings; passed unchanged to
            ``TfidfVectorizer.fit_transform``.
        i: Normalisation mode. ``1`` builds the vectorizer with ``norm=None``,
            ``2`` with ``norm='l1'`` and ``3`` with the vectorizer's default norm.

    Returns:
        Tuple ``(tfidf_vector, vectorizer)``: the result of ``fit_transform``
        and the fitted ``TfidfVectorizer``.

    Raises:
        ValueError: If ``i`` is not 1, 2 or 3.
    """
    if i == 1:
        vectorizer = TfidfVectorizer(stop_words=stop_words_list, norm=None)
    elif i == 2:
        vectorizer = TfidfVectorizer(stop_words=stop_words_list, norm='l1')
    elif i == 3:
        vectorizer = TfidfVectorizer(stop_words=stop_words_list)
    else:
        # An unknown mode used to fall through every branch and fail later with
        # an UnboundLocalError on `vectorizer`; reject it explicitly instead.
        raise ValueError(f"unsupported TF-IDF mode {i!r}; expected 1, 2 or 3")

    tfidf_vector = vectorizer.fit_transform(sentence)
    return tfidf_vector, vectorizer


    
def get_tfidf(tokenizer, input_ids, i, max_length=512):
    """Compute rounded, zero-padded TF-IDF scores for the tokens behind ``input_ids``.

    Every element of ``input_ids`` is handled as its own document: it is
    wrapped in a one-element list, dropped if it equals 0, and converted with
    ``tokenizer._convert_id_to_token``. The resulting strings are fitted with
    ``get_tfidf_vector`` and each space-separated token is looked up in the
    fitted vocabulary.

    Args:
        tokenizer: Object exposing ``_convert_id_to_token(id)``.
        input_ids: Iterable of token ids.
        i: Normalisation mode forwarded to ``get_tfidf_vector``.
        max_length: Length to which every score list is right-padded with zeros.

    Returns:
        One list per element of ``input_ids``, each of length ``max_length``.
        Tokens missing from the fitted vocabulary score 0; other scores are
        rounded to the nearest integer.

    Raises:
        AssertionError: If a document yields more than ``max_length`` scores.
    """
    def ids_to_string(ids):
        # NOTE(review): id 0 is skipped as if it were padding - confirm this
        # matches the pad id of the tokenizer in use.
        kept = [v for v in [ids] if v != 0]
        return [tokenizer._convert_id_to_token(token_id) for token_id in kept]

    sents = [' '.join(ids_to_string(ids)) for ids in input_ids]

    tfidfvec, vec = get_tfidf_vector(sents, i)
    vocabulary = vec.vocabulary_

    tfidf_ids = []
    for sent_idx, sen in enumerate(sents):
        # Row `sent_idx` of the matrix holds the scores of this document. The
        # previous code indexed rows by the word's position inside the document
        # (and shadowed the `i` parameter while doing so), so it read another
        # document's scores. Densify the row once per document.
        row = tfidfvec[sent_idx].toarray()[0]
        scores = [
            round(row[vocabulary[word]]) if word in vocabulary else 0
            for word in sen.split(' ')
        ]

        scores += [0] * (max_length - len(scores))
        assert len(scores) == max_length, (scores, len(scores))
        tfidf_ids.append(scores)
    return tfidf_ids

# 4. Dataset Generator
- Compare tokenization results of two tokenizers, identify sentences with inter-token dependencies, extract token positions within those sentences, as well as positions of tokens without inter-token dependencies.

In [40]:
# Stop once this many records have been collected.
MAX_RECORDS = 2000
# Write a checkpoint (and print the running count) whenever the record count
# reaches a multiple of this value.
CHECKPOINT_EVERY = 20

data_pos = []   # (input_ids, attention_mask, tfidf_ids) tensors, one tuple per record
data_set1 = []  # (i_pos, j_pos): a token position and the position of its dependency head
data_set2 = []  # (i_pos, j_random_pos): the same token paired with a randomly drawn position


def _to_int32_cuda(values):
    """Wrap ``values`` in an outer list and return it as an int32 tensor moved to CUDA."""
    return torch.Tensor([values]).type(torch.int32).cuda()


def _save_checkpoint():
    """Dump the three result lists to pickle files in the working directory."""
    for filename, payload in (
        ('data_pos.pickle', data_pos),
        ('data_set1.pickle', data_set1),
        ('data_set2.pickle', data_set2),
    ):
        with open(filename, 'wb') as f:
            pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)


def _record(features, i_pos, j_pos, candidates):
    """Append one record: the (i, j) pair plus a pair whose second position is random."""
    # NOTE(review): the random position is None when `candidates` holds no value
    # other than `j_pos`, and nothing prevents it from being equal to `i_pos`.
    j_random_pos = random_value_except(candidates, j_pos)

    data_pos.append(features)
    data_set1.append((i_pos, j_pos))
    data_set2.append((i_pos, j_random_pos))

    if len(data_pos) % CHECKPOINT_EVERY == 0:
        _save_checkpoint()
        print(len(data_pos))


# Visit the examples in random order.
# NOTE(review): no random seed is set, so the order and the sampled positions
# differ between runs.
shuffle_index = list(range(len(dataset)))
random.shuffle(shuffle_index)

for i in tqdm(shuffle_index):
    # Fetch the row once instead of materialising whole columns per field.
    example = dataset[i]

    # First sentence: the passage, tokenized by spaCy and by the RoBERTa tokenizer.
    text1 = example['text']
    doc1 = nlp(text1)
    tokens1_1 = [d.text for d in doc1]
    tokens2_1 = tokenizer.tokenize(text1)

    # Second sentence: the two span texts joined by a space.
    text2 = example['span1_text'] + " " + example['span2_text']
    doc2 = nlp(text2)
    tokens1_2 = [d.text for d in doc2]
    tokens2_2 = tokenizer.tokenize(text2)

    # NOTE(review): only `text1` is encoded here, although positions for
    # `text2` are recorded below - confirm this is intended.
    encoded = tokenizer(text1, padding='max_length', max_length=512, truncation=True)
    tfidf_ids = get_tfidf(tokenizer, encoded['input_ids'], 3)

    # NOTE(review): these CUDA tensors are what gets pickled.
    features = (
        _to_int32_cuda(encoded['input_ids']),
        _to_int32_cuda(encoded['attention_mask']),
        _to_int32_cuda(tfidf_ids),
    )

    # NOTE(review): spaCy tokens are matched to tokenizer tokens by exact string
    # equality, and `.index` returns the first occurrence only.
    for token in doc1:
        if token.text in tokens2_1 and token.head.text in tokens2_1:
            _record(
                features,
                i_pos=tokens2_1.index(token.text) + 1,
                j_pos=tokens2_1.index(token.head.text) + 1,
                # NOTE(review): candidate positions are derived from the spaCy
                # token count, not from the tokenizer's token count.
                candidates=[ii + 1 for ii in range(len(tokens1_1))],
            )
            # Only the first matching token of the passage is used.
            break

    if len(data_pos) >= MAX_RECORDS:
        break

    for token in doc2:
        if token.text in tokens2_2 and token.head.text in tokens2_2:
            _record(
                features,
                i_pos=tokens2_2.index(token.text) + 2 + len(tokens2_1),
                j_pos=tokens2_2.index(token.head.text) + 2 + len(tokens2_1),
                candidates=[
                    ii + 1
                    for ii in range(len(tokens1_1) + 1, len(tokens1_1) + 1 + len(tokens1_2))
                ],
            )
        # NOTE(review): this break is unconditional, so only the FIRST token of
        # the second sentence is ever examined (the passage loop above stops at
        # the first MATCHING token instead) - confirm which is intended.
        break

    if len(data_pos) >= MAX_RECORDS:
        break

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(shuffle_index):


HBox(children=(FloatProgress(value=0.0, max=554.0), HTML(value='')))

20
40
60
80
100
120
140
160
180



In [41]:
import pickle
# Final copies of the three result lists are written below this directory.
output_dir = Path('./validation/wsc')
# open(..., 'wb') does not create missing parent directories, so make sure the
# target directory exists before writing.
output_dir.mkdir(parents=True, exist_ok=True)

for filename, payload in (
    ('data_pos.pickle', data_pos),
    ('data_set1.pickle', data_set1),
    ('data_set2.pickle', data_set2),
):
    with open(output_dir / filename, 'wb') as f:
        pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)