In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive (only available inside a Colab runtime).
# NOTE(review): the pickle files loaded further down are opened through './'
# relative paths rather than /content/drive - confirm the working directory.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install transformers
!pip install sentencepiece

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 5.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 20.5MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 29.4MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=a64900263a0

In [4]:
from transformers import BertModel, BertTokenizer, XLNetModel, XLNetTokenizer

import torch

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk import PorterStemmer, WordNetLemmatizer

import pickle
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [5]:
def _load_pickle(path):
  """Return the object unpickled from the binary file at `path`."""
  # pickle.load can execute arbitrary code: only open files from a trusted source.
  with open(path, 'rb') as handle:
    return pickle.load(handle)


mlb = _load_pickle('./multilabelbinarizer.pickle')
train_data = _load_pickle('./train.pickle')
test_data = _load_pickle('./test.pickle')
  

# Text Cleaning Function

## Full Preprocessing

In [6]:
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def preprocessingTextFull(text, stop=stop):
  """Aggressively normalise one raw document into space-separated lemmas.

  Steps, in order: lowercase; delete literal '&lt;' entities; delete '<...>'
  tags; delete digit runs; drop whitespace-delimited tokens contained in
  `stop`; delete characters that are neither word characters nor whitespace;
  delete non-ASCII characters; lemmatize every remaining token with the
  module-level `lemmatizer`.

  Args:
    text: the raw document (str).
    stop: iterable of tokens to discard; defaults to the module-level
      stopword list.

  Returns:
    The cleaned text: lemmatized tokens joined by single spaces, with no
    leading or trailing whitespace.
  """
  stop_words = frozenset(stop)  # constant-time membership instead of scanning a list per token

  text = text.lower()
  text = re.sub(r'&lt;', '', text)    # literal '&lt;' entity
  text = re.sub(r'<.*?>', '', text)   # html-style tags ('.' does not cross newlines)
  text = re.sub(r'[0-9]+', '', text)  # digit runs
  # Stopwords are matched before punctuation is stripped, so a token such as
  # "the," is kept while a contraction listed in `stop` is still recognised.
  text = ' '.join(word for word in text.split() if word not in stop_words)
  text = re.sub(r'[^\w\s]', '', text)       # punctuation
  text = re.sub(r'[^\x00-\x7f]', '', text)  # non-ASCII characters
  # split()/join already turns every whitespace run (including \r, \n, \t)
  # into one space, so no separate whitespace clean-up pass is required.
  return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

## Minimum Preprocessing

In [7]:
def preprocessingTextMinimum(text, stop=None):
  """Lightly normalise one raw document while keeping stopwords and punctuation.

  Steps, in order: lowercase; delete '<...>' tags; delete literal '&lt;'
  entities; delete any remaining '>' characters; delete digit runs; delete
  non-ASCII characters; collapse every whitespace run to a single space.

  Args:
    text: the raw document (str).
    stop: unused. Kept so existing calls keep working; the default is None
      (instead of the module-level stopword list) so that defining this
      function does not depend on another cell having run.

  Returns:
    The cleaned text. Leading/trailing whitespace is reduced to a single
    space, not stripped.
  """
  text = text.lower()
  text = re.sub(r'<.*?>', '', text)         # html-style tags
  text = re.sub(r'&lt;', '', text)          # literal '&lt;' entity
  text = re.sub(r'>', '', text)             # leftover '>' characters
  text = re.sub(r'[0-9]+', '', text)        # digit runs
  text = re.sub(r'[^\x00-\x7f]', '', text)  # non-ASCII characters
  # \s already matches \r, \n and \t, so one pass both replaces them and
  # collapses repeated whitespace.
  return re.sub(r'\s+', ' ', text)

## Minimum Preprocessing + SEP token

In [8]:
def preprocessingTextMinimumWithSEP(text, sep, stop=None):
  """Lightly normalise one raw document and insert `sep` at line boundaries.

  Steps, in order: lowercase; delete '<...>' tags; delete literal '&lt;'
  entities; delete any remaining '>' characters; replace the first newline
  (the end of the first line) with ' sep '; replace every remaining '.'
  followed by a newline with '. sep '; delete digit runs; delete non-ASCII
  characters; collapse every whitespace run to a single space.

  `sep` is inserted after lowercasing and tag removal, so it reaches the
  output unchanged unless it contains digits or non-ASCII characters.

  Args:
    text: the raw document (str).
    sep: separator token text, inserted literally.
    stop: unused. Kept so existing calls keep working; the default is None
      (instead of the module-level stopword list) so that defining this
      function does not depend on another cell having run.

  Returns:
    The cleaned text. A document without any newline gets no separator.
  """
  text = text.lower()
  text = re.sub(r'<.*?>', '', text)  # html-style tags
  text = re.sub(r'&lt;', '', text)   # literal '&lt;' entity
  text = re.sub(r'>', '', text)      # leftover '>' characters

  first_break = text.find('\n')
  # find() returns -1 when there is no newline; slicing with -1 would drop
  # the last character and append a second copy of the whole text.
  if first_break != -1:
    text = text[:first_break] + ' ' + sep + ' ' + text[first_break + 1:]

  # Plain string replacement keeps `sep` literal (a regex replacement
  # template would interpret backslashes inside it).
  text = text.replace('.\n', '. ' + sep + ' ')

  text = re.sub(r'[0-9]+', '', text)        # digit runs
  text = re.sub(r'[^\x00-\x7f]', '', text)  # non-ASCII characters
  # \s already matches \r, \n and \t, so one pass both replaces them and
  # collapses repeated whitespace.
  return re.sub(r'\s+', ' ', text)

# Embedding Function

## Embedding Text

In [None]:
def embedding_text(list_of_text, model, tokenizer, seq_len=128, mode='all'):
  """Turn each text into a (seq_len, hidden) matrix built from model hidden states.

  Every text is tokenized on its own, truncated/padded to `seq_len` tokens
  and passed through `model` together with its attention mask.

  Args:
    list_of_text: sequence of strings to embed.
    model: model exposing eval() and to(device); calling it must return a
      mapping with a 'hidden_states' sequence (in this notebook the models
      are created with output_hidden_states=True).
    tokenizer: object whose encode_plus(...) returns 'input_ids' and
      'attention_mask'.
    seq_len: number of token positions kept per text.
    mode: which hidden states are combined per token:
      'all'   - sum of hidden_states[1:] (entry 0 is skipped; presumably the
                embedding-layer output - confirm against the model docs),
      'last4' - sum of the last four entries,
      'last'  - the last entry only.

  Returns:
    CPU tensor of shape (len(list_of_text), seq_len, hidden_size), where
    hidden_size is model.config.hidden_size when available, else 768.

  Raises:
    ValueError: if `mode` is not one of the values above. (Previously a
      message was printed and an all-zero tensor was returned.)
  """
  if mode not in ('all', 'last4', 'last'):
    raise ValueError("The mode is not recognized: {!r}".format(mode))

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  model.eval()
  model = model.to(device)

  # Read the width from the model configuration instead of assuming 768.
  hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 768)
  embedding_features = torch.zeros(len(list_of_text), seq_len, hidden_size)
  print(len(list_of_text), seq_len)

  for index, text in enumerate(list_of_text):
    token = tokenizer.encode_plus(text,
                                  max_length=seq_len,
                                  pad_to_multiple_of=seq_len,
                                  padding=True,
                                  truncation=True)

    token_input = torch.tensor(token['input_ids']).unsqueeze(0).to(device)
    token_mask = torch.tensor(token['attention_mask']).unsqueeze(0).to(device)

    with torch.no_grad():
      # The attention mask is passed in every mode: the input is padded, and
      # without the mask the padding positions take part in attention
      # (mode 'all' used to omit it).
      hidden_states = model(token_input, attention_mask=token_mask)['hidden_states']

      if mode == 'all':
        layers = hidden_states[1:]
      elif mode == 'last4':
        layers = hidden_states[-4:]
      else:  # 'last'
        layers = hidden_states[-1:]

      # (layers, 1, seq, hidden) -> sum over layers -> (seq, hidden)
      result = torch.stack(tuple(layers)).sum(0).squeeze(0)

    embedding_features[index] = result.cpu()

  return embedding_features



## Embedding Label

In [None]:
def embedding_label(list_of_label, model, tokenizer, mode='all'):
  """Turn each label name into one vector averaged over its content tokens.

  Hyphens in a label are replaced by spaces, the label is encoded with
  `tokenizer.encode`, passed through `model`, the selected hidden states are
  summed per token, the tokenizer-specific extra positions are dropped and
  the remaining token vectors are averaged.

  Args:
    list_of_label: sequence of label strings.
    model: model exposing eval(), to(device) and name_or_path; calling it
      must return a mapping with a 'hidden_states' sequence. Only
      name_or_path 'bert-base-cased' and 'xlnet-base-cased' are supported.
    tokenizer: object whose encode(text) returns a list of token ids.
    mode: which hidden states are combined per token:
      'all'   - sum of hidden_states[1:],
      'last4' - sum of the last four entries,
      'last'  - the last entry only.

  Returns:
    CPU tensor of shape (len(list_of_label), hidden_size), where hidden_size
    is model.config.hidden_size when available, else 768.

  Raises:
    ValueError: if `mode` is not recognised (previously a message was printed
      and an all-zero tensor was returned), or if model.name_or_path is not
      one of the supported names (previously this fell through every branch
      and failed later on a shape mismatch).
  """
  if mode not in ('all', 'last4', 'last'):
    raise ValueError("The mode is not recognized: {!r}".format(mode))

  # Positions kept per tokenizer: the BERT branch drops the first and last
  # position, the XLNet branch drops the last two (the special tokens that
  # encode() adds - verify if a different tokenizer is ever plugged in).
  if model.name_or_path == 'bert-base-cased':
    content_tokens = slice(1, -1)
  elif model.name_or_path == 'xlnet-base-cased':
    content_tokens = slice(None, -2)
  else:
    raise ValueError(
        "Unsupported model {!r}: expected 'bert-base-cased' or "
        "'xlnet-base-cased'".format(model.name_or_path))

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  model.eval()
  model = model.to(device)

  # Read the width from the model configuration instead of assuming 768.
  hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 768)
  embedding_features = torch.zeros(len(list_of_label), hidden_size)

  for index, text in enumerate(list_of_label):
    text = ' '.join(text.split('-'))  # hyphenated label -> separate words
    token = torch.tensor(tokenizer.encode(text)).unsqueeze(0).to(device)

    with torch.no_grad():
      hidden_states = model(token)['hidden_states']

      if mode == 'all':
        layers = hidden_states[1:]
      elif mode == 'last4':
        layers = hidden_states[-4:]
      else:  # 'last'
        layers = hidden_states[-1:]

      # (layers, 1, seq, hidden) -> sum over layers -> (seq, hidden)
      per_token = torch.stack(tuple(layers)).sum(0).squeeze(0)
      result = per_token[content_tokens].mean(0)

    embedding_features[index] = result.cpu()

  return embedding_features

# Initialize Language Model and Tokenizer

In [None]:
#Uncomment model configuration that you need

# BERT configuration (uncomment these lines and comment out the XLNet ones to switch):
# model = BertModel.from_pretrained('bert-base-cased', output_hidden_states=True)
# tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# sep = '[SEP]'

# XLNet configuration (active). The same checkpoint name feeds both the model
# and its tokenizer; `sep` is the separator text handed to
# preprocessingTextMinimumWithSEP.
checkpoint_name = 'xlnet-base-cased'
model = XLNetModel.from_pretrained(checkpoint_name, output_hidden_states=True)
tokenizer = XLNetTokenizer.from_pretrained(checkpoint_name)
sep = '<sep>'

NameError: ignored

# Clean Text

In [None]:
#Uncomment one pair train and test data preprocessing that you need

# Keep exactly one `clean_text` assignment active; it is applied to every
# document of both splits.
# clean_text = preprocessingTextFull
clean_text = preprocessingTextMinimum
# clean_text = lambda text: preprocessingTextMinimumWithSEP(text, sep=sep)

train_text = list(map(clean_text, train_data.text.values))
test_text = list(map(clean_text, test_data.text.values))

# Generate Text Embedding and Label Embedding

In [None]:
seq_len = 128
mode = 'all'
save_path = './'
# Tag written into the text-embedding file names; it has to name the cleaning
# function that produced train_text/test_text. The notebook currently cleans
# with preprocessingTextMinimum, so the former value 'full' mislabelled the
# saved files. NOTE(review): this renames the output files - update any
# downstream loader that still expects the '-full.pt' names.
preprocessing = 'minimum'

# Text embeddings: <split>-<model name>-<seq_len>-<preprocessing>.pt
torch.save(embedding_text(train_text, model, tokenizer, seq_len=seq_len, mode=mode),
           save_path + 'train-' + model.name_or_path + '-' + str(seq_len) + '-' + preprocessing + '.pt')
torch.save(embedding_text(test_text, model, tokenizer, seq_len=seq_len, mode=mode),
           save_path + 'test-' + model.name_or_path + '-' + str(seq_len) + '-' + preprocessing + '.pt')

# Label embeddings do not depend on the text cleaning, so their file name
# carries only the model name.
torch.save(embedding_label(mlb.classes_, model, tokenizer, mode=mode),
           save_path + 'label-embedding-' + model.name_or_path + '.pt')



7769 128
3019 128
