In [3]:
import sentencepiece as spm
from transformers import XLNetTokenizer
from gensim.models import Word2Vec,word2vec
from collections import Counter
from tqdm import tqdm
import string
import nltk
import logging
import numpy as np
nltk.download('punkt')
import json

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
def normalize_text(path):
  """Read a UTF-8 text file and return its lines reduced to word tokens.

  Each line longer than two characters (the trailing newline counts) is split
  into runs of word characters (regex ``\\w+``) and re-joined with single
  spaces, which drops punctuation and collapses whitespace. Shorter lines are
  skipped. A kept line that contains no word characters at all (for example
  only punctuation) contributes an empty string to the result.

  Args:
    path: Path of the text file to read.

  Returns:
    A list of normalized lines, in file order, without trailing newlines.
  """
  # Raw string avoids the invalid "\w" escape-sequence warning; the tokenizer
  # is built once because it does not depend on the line being processed.
  word_tokenizer = nltk.RegexpTokenizer(r"\w+")
  normalized_text = []
  with open(path, 'r', encoding='utf-8') as file:
    # Iterate the file lazily instead of materializing every line first.
    for line in file:
      if len(line) > 2:
        normalized_text.append(" ".join(word_tokenizer.tokenize(line)))
  return normalized_text

def read_sentencepiece_vocab(filepath):
  """Return the first tab-separated column of a vocab file, minus its first row.

  Args:
    filepath: Path of a UTF-8 text file with one entry per line, where the
      token is the text before the first tab.

  Returns:
    A list with the leading column of every line except the first one.
  """
  with open(filepath, encoding='utf-8') as vocab_file:
    # Text before the first tab; a line without a tab is kept whole.
    pieces = [row.partition("\t")[0] for row in vocab_file]
  # The first entry of the file is discarded.
  return pieces[1:]

def read_sentences(f_path):
  """Load every line of a UTF-8 text file.

  Args:
    f_path: Path of the file to read.

  Returns:
    A list of the file's lines; line terminators are kept.
  """
  with open(f_path, mode='r', encoding='utf-8') as source:
    return list(source)

def store(comments, path):
    """Write the given strings to ``path``, one per line, encoded as UTF-8.

    The file is overwritten. Entries are separated by a newline and no
    newline is added after the last entry.

    Args:
        comments: Iterable of strings to write.
        path: Destination file path.
    """
    with open(path, mode='w', encoding='utf-8') as out_file:
        payload = "\n".join(comments)
        out_file.write(payload)

def train_embedding(model_path, sentences, dimension, window, min_count, iter, max_vocab_size=32000):
    """Train a word2vec model on tokenized sentences and save it to disk.

    Args:
        model_path: Destination path handed to ``model.save``.
        sentences: Iterable of token lists used as the training corpus.
        dimension: Embedding size, forwarded as ``size``.
        window: Context window, forwarded as ``window``.
        min_count: Minimum token frequency, forwarded as ``min_count``.
        iter: Number of training passes, forwarded as ``iter``. The name shadows
            the builtin but is kept because it is part of the signature.
        max_vocab_size: Forwarded as ``max_vocab_size``; defaults to the
            previously hard-coded 32000. Per the gensim documentation this is
            a RAM limit that prunes infrequent words while the vocabulary is
            being built, so the final vocabulary may be smaller than this
            value. Pass ``None`` for no limit.

    NOTE(review): ``size`` and ``iter`` are the gensim 3.x keyword names
    (gensim 4 renamed them) - confirm the pinned gensim version.
    """
    model = word2vec.Word2Vec(
        sentences,
        size=dimension,
        window=window,
        min_count=min_count,
        max_vocab_size=max_vocab_size,
        iter=iter,
    )
    model.save(model_path)

In [0]:
# Normalize the raw corpus and write the result, one entry per line, to
# dataset.txt in the working directory. `normalized_text` is reused by the
# tokenization cell further down.
dataset_path = 'data/pure_text_for_embed.txt'
normalized_text = normalize_text(dataset_path)
store(normalized_text, 'dataset.txt')

In [0]:
# Train a SentencePiece tokenizer on dataset.txt and save it to tokenizer.model
# with vocab file tokenizer.vocab (file names follow from --model_prefix, per
# the original author's note).
# All flags are passed as ONE string literal: every trailing backslash continues
# the literal onto the next line, so the leading spaces of the continuation
# lines are part of the string and simply act as separators between flags.
# NOTE(review): --pad_id=5 is set alongside three user-defined symbols; confirm
# the resulting id layout in tokenizer.vocab is the intended one.
spm.SentencePieceTrainer.train('--input=dataset.txt \
                                --model_prefix=tokenizer \
                                --shuffle_input_sentence=true \
                                --bos_id=1 \
                                --eos_id=2 \
                                --pad_id=5 \
                                --user_defined_symbols=<mask>,<cls>,<sep> \
                                --vocab_size=31900')

In [15]:
# Wrap the SentencePiece model file tokenizer.model in an XLNetTokenizer.
# `tokenizer` is read as a global by get_initial_embeddings_for_xlnet().
# NOTE(review): the recorded output warns that calling from_pretrained() with a
# single file path is deprecated.
# NOTE(review): do_lower_case=True is passed here, while normalize_text() does
# not lower-case the text the SentencePiece model was trained on - confirm the
# casing mismatch is intended.
tokenizer = XLNetTokenizer.from_pretrained('tokenizer.model', do_lower_case=True)
# One token list per normalized line; these are the word2vec training sentences.
# The rstrip is a safeguard only: normalize_text() output is built from \w+
# tokens joined by spaces, so it carries no newline.
tokenized_texts = [tokenizer.tokenize(sent.rstrip('\n')) for sent in normalized_text]

Calling XLNetTokenizer.from_pretrained() with the path to a single file or url is deprecated


In [0]:
# Training configuration for the word2vec run.
# `home_path` is not defined by any cell of this notebook, so a fresh
# "Restart & Run All" used to fail here with NameError. Keep an existing value
# if one was set earlier in the session, otherwise fall back to an empty prefix
# so model_path is relative to the working directory like the other data/ paths.
home_path = globals().get('home_path', '')
model_path_fast ="data/fasttext.model"
model_path = home_path + 'data/word2vec.model'
dimension = 768
window = 5
min_count = 2
# NOTE: this module-level name shadows the builtin iter() for the rest of the
# notebook; it is kept because the training cell passes it by this name.
iter = 20

In [0]:
# Configure root logging at INFO level (timestamp : level : message) so that
# progress messages emitted during training show up in the cell output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Train word2vec on the tokenized corpus and save the model to model_path.
train_embedding(model_path, tokenized_texts, dimension, window, min_count,iter)

2020-01-27 19:31:26,752 : INFO : collecting all words and their counts
2020-01-27 19:31:26,753 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-01-27 19:31:27,050 : INFO : PROGRESS: at sentence #10000, processed 1446223 words, keeping 25367 word types
2020-01-27 19:31:27,333 : INFO : PROGRESS: at sentence #20000, processed 2877095 words, keeping 27046 word types
2020-01-27 19:31:27,617 : INFO : PROGRESS: at sentence #30000, processed 4178766 words, keeping 28253 word types
2020-01-27 19:31:27,878 : INFO : PROGRESS: at sentence #40000, processed 5485280 words, keeping 29243 word types
2020-01-27 19:31:28,162 : INFO : PROGRESS: at sentence #50000, processed 6839237 words, keeping 30237 word types
2020-01-27 19:31:28,442 : INFO : PROGRESS: at sentence #60000, processed 8211451 words, keeping 31127 word types
2020-01-27 19:31:28,703 : INFO : PROGRESS: at sentence #70000, processed 9442904 words, keeping 31920 word types
2020-01-27 19:31:28,743 : INFO : pruned

In [0]:
def load_static_embeddings(f_path):
  """Load a JSON file and return the parsed object.

  Args:
    f_path: Path of the JSON file.

  Returns:
    Whatever ``json.load`` produces for the file's content.
  """
  # Explicit UTF-8, like every other open() in this notebook, so the result
  # does not depend on the platform's default encoding.
  with open(f_path, encoding='utf-8') as f:
    data = json.load(f)
  return data
def pad_char_embedd(char_value):
  """Return the stored embedding of one special token.

  The embeddings are read from ``data/special_tokens_emb.json`` (according to
  the original author's note, downloaded from the official XLNet model). The
  file is re-read on every call.

  Args:
    char_value: One of "PAD", "UNK", "SEP", "CLS", "MASK", "EOP", "EOD".

  Returns:
    The value stored in the JSON file under the matching lower-case key.

  Raises:
    ValueError: If ``char_value`` is not one of the supported names. This
      used to surface as an UnboundLocalError after the file had been read.
  """
  # Accepted token name -> key used in the JSON file.
  json_key_by_token = {
    "PAD": "pad",
    "UNK": "unk",
    "SEP": "sep",
    "CLS": "cls",
    "MASK": "mask",
    "EOP": "eop",
    "EOD": "eod",
  }
  # Validate before touching the disk so a bad name fails fast and clearly.
  if char_value not in json_key_by_token:
    raise ValueError(
      "unsupported special token %r; expected one of %s"
      % (char_value, sorted(json_key_by_token)))
  static_embeddings = load_static_embeddings("data/special_tokens_emb.json")
  return static_embeddings[json_key_by_token[char_value]]

In [0]:
def get_initial_embeddings_for_xlnet(model, vocab_size=32000, embedding_dim=768):
    """Build one embedding per tokenizer id, in id order.

    For every id in ``range(vocab_size)`` the token string is obtained from
    the module-level ``tokenizer`` and resolved as follows:

    1. If ``model.wv`` has a vector for the token, that vector is used.
    2. Otherwise, if the token is a known spelling of a special token
       (sep, cls, unk, pad, eop, eod, mask), the embedding returned by
       ``pad_char_embedd`` is used.
    3. Otherwise a random vector from ``np.random.rand(embedding_dim)`` is
       used. This draws from NumPy's global random state, so results are
       only reproducible if the caller seeds it.

    Args:
        model: Trained model exposing ``model.wv[token]``, which must raise
            ``KeyError`` for tokens it does not know.
        vocab_size: Number of ids to cover. Defaults to the previously
            hard-coded 32000.
        embedding_dim: Length of the random fallback vectors. Defaults to
            the previously hard-coded 768.

    Returns:
        ``(embeddings, counters)``: ``embeddings`` is a list with one entry
        per id; ``counters`` is
        ``[valid, sep, unk, pad, cls, eod, eop, mask, other]``, counting how
        many ids were resolved by each route.
    """
    # Every accepted spelling of a special token -> name pad_char_embedd expects.
    special_name_by_spelling = {}
    for name in ("SEP", "CLS", "UNK", "PAD", "EOP", "EOD", "MASK"):
        upper, lower = name, name.lower()
        for spelling in ("<%s>" % upper, "[%s]" % lower, "<%s>" % lower, "[%s]" % upper):
            special_name_by_spelling[spelling] = name

    counter_order = ("valid", "sep", "unk", "pad", "cls", "eod", "eop", "mask", "other")
    counts = dict.fromkeys(counter_order, 0)
    embeddings = []
    for token_id in range(vocab_size):
        token = tokenizer.convert_ids_to_tokens(token_id)
        try:
            token_embedd = model.wv[token]
            counts["valid"] += 1
        except KeyError:
            # Only "token not in the word2vec vocabulary" is expected here; any
            # other error is a real problem and must not be masked by a random
            # vector, which the former bare `except:` did.
            special_name = special_name_by_spelling.get(token)
            if special_name is not None:
                token_embedd = pad_char_embedd(special_name)
                counts[special_name.lower()] += 1
            else:
                token_embedd = np.random.rand(embedding_dim,)
                counts["other"] += 1
        embeddings.append(token_embedd)
    counters = [counts[key] for key in counter_order]
    return embeddings, counters

In [21]:
# Reload the word2vec model from model_path, the path train_embedding() saved to.
word2vecmodel = Word2Vec.load(model_path)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Build one embedding per tokenizer id. `counters` holds, in this order,
# [valid, sep, unk, pad, cls, eod, eop, mask, other].
i_embeddings,counters = get_initial_embeddings_for_xlnet(word2vecmodel)

In [0]:
# Stack the per-id embeddings into one array and write it as plain text.
# NOTE(review): this presumes every entry has the same length, including the
# special-token embeddings loaded from JSON - confirm they match the word2vec
# dimension.
np.savetxt("data/initial_embeddings_for_xlnet.txt",np.array(i_embeddings))