In [None]:
#PyTorch
import torch.nn #Parent module for pytorch for neural networks 
import torch.nn.functional as F #for the activation function 
from torch.utils.data import TensorDataset, DataLoader

# Hugging Face ML packages 
from transformers import pipeline, BertTokenizer, DataCollatorWithPadding, BertForMaskedLM
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
unmasker = pipeline('fill-mask', model='bert-large-uncased-whole-word-masking')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking')
from datasets import Dataset

#Import wordnet synsets for vocabulary 
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')

#Other math imports 
import random
import pandas as pd

In [133]:
#Helper Functions to create dataset 

#Word masking function from string str with python random engine 
def maskWord(string): 

    #Set type to string (wordnet includes integers)
    if type(string) != str: 
        string = str(string)

    str_len = len(string)
    if str_len == 0: 
        return 

    #Generate indexes based on length of word 

    #Don't wanna mask the entire word
    if str_len < 3: 
        choices = random.choices(range(str_len), k = 1)
    else: 
        choices = random.choices(range(str_len), k = random.randint(2,str_len - 1))

    ##Unpack string into a list of characters
    str_list = [*string]
    for index in choices: 
        str_list[index] = '_'
    newstr = ''.join(str_list)
    return newstr

#Space a masked word
def spaceWord(word): 
    if type(word) != word: 
        word = str(word)

    k = len(word)
    wordList = [*word]
    newList = []
    
    for i in range(k): 
        newList.append(wordList[i])
        newList.append(' ')

    return ''.join(newList) 

#Replace mask with [MASK] Special token 
def replaceMask(word): 
    strList = [*word]
    for i in range(len(strList)): 
        if strList[i] == '_': 
            strList[i] = '[MASK]'
    return ''.join(strList)

# Final combination of functions 
def BertMaskWord(word): 
    return replaceMask(spaceWord(maskWord(word)))

#Uses synsets to generate definition 
def getFirstDef(word): 
    return wordnet.synsets(word)[0].definition()

In [134]:
def sampleAndRemove(stringList, size): 
    sample = []
    sample_indices = (random.sample(range(0,len(stringList)), size))
    sample_dict = {}
    remaining_words = []

    # Uses dictionary of indices to remove to reduce compute time to large O(n) 
    for i in range(len(stringList)): 
        sample_dict[i] = 0
    for index in sample_indices: 
        sample_dict[index] = 1

    for index in range(len(stringList)): 
        if sample_dict[index] == 1: 
            sample.append(stringList[index])
        else: 
            remaining_words.append(stringList[index])
    return sample, remaining_words

In [136]:
#tokenizing helper functions for mapping 
def tokenize_function(example): 
    return tokenizer(example['word'], example['word_masked'], example['definition'], padding = "max_length", max_length = 512)

def tokenize_maskOnly(example):
    return tokenizer(example['masked'], padding = "max_length", max_length = 512, truncation = True)
    

def tokenize_labels(example): 
    return tokenizer(example['word'], padding = "max_length", max_length = 512, truncation = True)

def tokenize_combined(example): 
    return tokenizer(example['Mask&Def'], padding = "max_length", max_length = 512, truncation = True)
dataCollator = DataCollatorWithPadding(tokenizer)

In [135]:
#Collect all words in the synset 
all_words = set(word for synset in wordnet.all_synsets() for word in synset.lemma_names())
fullWordList = [word for word in all_words] #148730 total words 

#Creating training and validation set using sample and remove 
trainset, remaining_words  =  sampleAndRemove(fullWordList, 5000)
valset, discarded_words = sampleAndRemove(remaining_words,100)
#Discarded words are not used 

F r [MASK] n c h [MASK] h o [MASK] e y s [MASK] c [MASK] l e  French_honeysuckle
5000


In [137]:
#Creating validation set for testing 
valset_maskedOnly = {'masked':[]}
valset_labels = {'word': []}
for i in range(len(valset)): 
    #Keep words and embeddings for testing 
    valset_labels['word'].append(valset[i])
    valset_maskedOnly['masked'].append(("{} [SEP] ".format(getFirstDef(valset[i])) + BertMaskWord(valset[i])))

val_ls = Dataset.from_dict(valset_labels)
val_ms = Dataset.from_dict(valset_maskedOnly)

valLabels_ds = val_ls.map(tokenize_labels)
valMasked_ds = val_ms.map(tokenize_maskOnly)

#Combine labels (original word) with masked inputs 
valFinalDict = {'labels' : [], 'input_ids' : [], 'attention_mask' : []}
valFinalDict['input_ids'] = valMasked_ds['input_ids']
valFinalDict['attention_mask'] = valMasked_ds['attention_mask']
valFinalDict['labels'] = valLabels_ds['input_ids']

valFinal = Dataset.from_dict(valFinalDict)
#Load into loader farther down 

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

course_of_study
an integrated course of academic studies [SEP] c [MASK] [MASK] r [MASK] [MASK] [MASK] [MASK] f [MASK] [MASK] t u [MASK] [MASK] 
[101, 2019, 6377, 2607, 1997, 3834, 2913, 102, 1039, 103, 103, 1054, 103, 103, 103]


Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 100
})

In [138]:
#Separated out dictionaries for flexibility during testing 
maskedOnly = {'masked': []}
labelsSet = {'word': []}
maskDef = {'Mask&Def': []}

for i in range(len(trainset)): 
    maskedOnly['masked'].append(BertMaskWord(trainset[i]))
    labelsSet['word'].append(spaceWord(trainset[i]))
    maskDef['Mask&Def'].append("{} [SEP] ".format(getFirstDef(trainset[i])) + BertMaskWord(trainset[i]))


#Next idea: if there's an underscore, set to 103 (mask token)

maskDef = Dataset.from_dict(maskDef)

maskedSet = Dataset.from_dict(maskedOnly)
labelsSet = Dataset.from_dict(labelsSet)

G u l f _ o f _ A l a s k a 
a gulf of the Pacific Ocean between the Alaska Peninsula and the Alexander Archipelago [SEP] G u [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] A l [MASK] s [MASK] [MASK] 


In [140]:
#Creating datasets separately with each function to have different loaders at will
labels_ds = labelsSet.map(tokenize_labels)
masked_ds = maskedSet.map(tokenize_maskOnly)
combined_ds = maskDef.map(tokenize_combined)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Dataset({
    features: ['word', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5000
})
G u l f _ o f _ A l a s k a 
G u [MASK] f [MASK] o f [MASK] A l [MASK] [MASK] k [MASK] 
[101, 1043, 1057, 1048, 1042, 1035, 1051, 1042, 1035, 1037, 1048, 1037, 1055, 1047, 1037]
[101, 1043, 1057, 103, 1042, 103, 1051, 1042, 103, 1037, 1048, 103, 103, 1047, 103]


In [None]:
#Originally for training the model using only the spaced and masked words 
finalDict = {'labels' : [], 'input_ids' : [], 'attention_mask' : []}
#finalDict['input_ids'] = masked_ds['input_ids']
finalDict['input_ids'] = masked_ds['input_ids']
finalDict['attention_mask'] = masked_ds['attention_mask']
finalDict['labels'] = labels_ds['input_ids']

final_ds = Dataset.from_dict(finalDict)

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 500
})

In [None]:
#Used for training the model with blanks and definitions 
combined_final = {'labels' : [], 'input_ids' : [], 'attention_mask' : []}
combined_final['input_ids'] = combined_ds['input_ids']
combined_final['attention_mask'] = combined_ds['attention_mask']
combined_final['labels'] = labels_ds['input_ids']

combined_final = Dataset.from_dict(combined_final)

combined_final

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 5000
})

In [None]:
#Initialization code for various dataLoaders for training and validation 

#train_tokenized = final_ds.with_format("torch")

valTokenized = valFinal.with_format("torch")

#combined_tokenized = combined_final.with_format("torch")

#masked_train = DataLoader(train_tokenized, batch_size = 16, collate_fn = dataCollator)

#combined_train = DataLoader(combined_tokenized, batch_size = 16, collate_fn = dataCollator)

val_DataLoader = DataLoader(valTokenized, batch_size = 1, collate_fn = dataCollator)

In [None]:
#Training loop for attention head 
from torch import optim
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
epochs = 5
current_loader = combined_train #For consistency and ease sake
batchNumber = 0 

for i in range(epochs): 
   total_loss = 0
   for batch in current_loader: 
      optimizer.zero_grad()
      input_ids = batch['input_ids']
      attentionMask = batch['attention_mask']
      labels = batch['labels']

      outputs = model(input_ids, attention_mask = attentionMask, labels= labels)
      #model.convert_tokens_to_string(model.convert_ids_to_tokens(outputs)) <- can't figure out how to interpret Hugging Face's .MaskedLMOutput class and view results

      loss = outputs.loss

      total_loss += outputs.loss

      loss.backward()

      optimizer.step()
      if batchNumber%100 == 0: 
         print("Batch: {}".format(batchNumber))

      batchNumber += 1   

   print("Epoch {} loss: {}".format(i + 1, total_loss.item()/len(current_loader)))

Batch: 100
Epoch 1 loss: 0.20329707575301392
Epoch 2 loss: 0.2030935439819726
Epoch 3 loss: 0.20299759154883437
Epoch 4 loss: 0.2029394814000724
Epoch 5 loss: 0.20289968911070413


NameError: name 'val_DataLoader' is not defined

In [None]:
#Push to hugging face hub
from huggingface_hub import notebook_login

#notebook_login() <- hopefully I don't have to log in every time 
#Saving and pushing model
model.save_pretrained("Saama-model")

model.push_to_hub("Saama-model")

In [None]:
# reload for testing 
model2 = BertForMaskedLM.from_pretrained('Brijhs/Saama-model')

In [None]:
from F import softmax #Activation function: perhaps the wrong one? 

#Functions for testing predictions; most results are coming out to zeroes 
def checkPrediction(prediction, discarded):
    final = []
    for i in range(len(prediction)): 
        if (i + 1)%64 == 0: 
            print("Prediction counter: {}".format(str(i + 1)))
        current = tokenizer.decode(prediction[i])
        for element in current: 
            if element not in discarded: 
                final.append(element)
    return final 

def interpretPrediction(outputLogits): 
    #Separated out into characters because .decode() separates by character and these are all tokens of [PAD]
    discarded = ['[', ']', ' ', 'P', 'A', 'D']
    sm = softmax(outputLogits, dim = -1)
    prediction = torch.argmax(sm, dim= -1)
    final = []
    for i in range(len(prediction)): 
        for item in prediction[i]: 
            if item != 0: 
                final.append(item)
                print("Woah")

    return checkPrediction(final, discarded)

In [143]:
#Prediction loop for validation: returns all 0's 
for batch in val_DataLoader: 
    input = batch['input_ids']
    attention_mask = batch['attention_mask']
    output = model2(input, attention_mask = attention_mask)
    label = batch['labels']

    result = interpretPrediction(output.logits)
    if len(result)>0: 
        print(result)