In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import tensorflow as tf

# Fix the random seed so repeated runs of this notebook are easier to replicate.
# NOTE(review): presumably this one call seeds Python, NumPy and TensorFlow
# together -- confirm against the Keras documentation.
tf.keras.utils.set_random_seed(33)

In [2]:
# Sanity check: load the raw Kaggle CSV and peek at the first line of the
# small training subset.

# Original Kaggle data (ISO-8859-1 encoded CSV).
data = pd.read_csv("data/ner_dataset.csv", encoding = "ISO-8859-1")

# Read only the first line of each file. The `with` blocks close the file
# handles as soon as the line has been read instead of leaving them open
# for the lifetime of the kernel.
with open('data/small/train/sentences.txt', 'r') as sentences_file:
    train_sents = sentences_file.readline()
with open('data/small/train/labels.txt', 'r') as labels_file:
    train_labels = labels_file.readline()


In [3]:
def load_data(file_path):
    """
    Input: path to a text file
    Output: numpy array holding one entry per line of the file, with
            leading/trailing whitespace (including the newline) removed
    """
    stripped_lines = []
    with open(file_path,'r') as handle:
        # Iterating the handle yields the same lines as readlines().
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return np.array(stripped_lines)

In [4]:
# Load the sentence and label files of each split of the large dataset.
# Each tuple is evaluated left to right, so files are read in the order
# sentences -> labels for train, then val, then test.
train_sentences, train_labels = (
    load_data('data/large/train/sentences.txt'),
    load_data('data/large/train/labels.txt'),
)

val_sentences, val_labels = (
    load_data('data/large/val/sentences.txt'),
    load_data('data/large/val/labels.txt'),
)

test_sentences, test_labels = (
    load_data('data/large/test/sentences.txt'),
    load_data('data/large/test/labels.txt'),
)

In [5]:
#vectorize sentences
def get_vectorize_sentences(sentences):
    """
    Input: array of sentences
    Output: (vectorizer, vocab) -- a TextVectorization layer adapted on
            `sentences`, and the vocabulary that layer reports afterwards
    """
    # standardize=None: the layer applies no text standardization of its own.
    vectorizer = tf.keras.layers.TextVectorization(standardize=None)
    vectorizer.adapt(sentences)

    return vectorizer, vectorizer.get_vocabulary()

In [6]:
# Quick check: adapt a throwaway vectorizer on the first 1000 training
# sentences only and run it on a hand-written sentence.
# NOTE(review): in the recorded output several tokens map to id 1 --
# presumably the out-of-vocabulary index for this small vocab; confirm.
test_vectorizer, test_vocab = get_vectorize_sentences(train_sentences[:1000])
sentence = "I like learning new NLP models !"
sentence_vectorized = test_vectorizer(sentence)
sentence_vectorized

<tf.Tensor: shape=(7,), dtype=int64, numpy=array([ 296,  314,    1,   59,    1,    1, 4649])>

In [7]:
# Build the vectorizer and vocabulary from the full training sentences.
sentence_vectorizer, train_vocab = get_vectorize_sentences(train_sentences)

In [8]:
#create a label tag_map
def tag_set_map(labels):
    """
    Input: iterable of label strings, tags separated by a single space
    Output: (tag_map, tag_set) -- dict mapping each distinct tag to its
            position in sorted order, and the sorted list of distinct tags
    """
    # Collect every distinct tag across all label strings.
    unique_tags = {tag for line in labels for tag in line.split(' ')}

    ordered_tags = sorted(unique_tags)
    tag_to_index = {tag: position for position, tag in enumerate(ordered_tags)}
    return tag_to_index, ordered_tags

In [9]:
# Tag -> id mapping and sorted tag list, derived from the training labels.
tag_map, tag_set=tag_set_map(train_labels)

In [10]:
#create a label vectorizer and add padding

def label_vectorizer(labels,tag_map=tag_map):
    """
    Convert label strings to padded arrays of tag ids.

    Input: labels  -- iterable of label strings, tags separated by a single space
           tag_map -- dict mapping each tag string to its integer id
    Output: 2-D array with one row per label string; rows shorter than the
            longest one are padded at the end with -1
    Raises: KeyError if a tag is not present in tag_map
    """

    # Each sentence gets its own id list. Sharing one list across sentences
    # would make every row contain the tags of all sentences concatenated.
    label_ids = [
        [tag_map[tag] for tag in element.split(' ')]
        for element in labels
    ]

    # Pad the elements (post-padding with -1, the value the loss ignores)
    label_ids = tf.keras.utils.pad_sequences(sequences=label_ids, padding='post', value=-1)

    return label_ids

In [11]:
# Spot-check one training example: raw sentence, raw labels and the label ids.
# The label string is wrapped in a one-element list because label_vectorizer
# iterates over a collection of label strings.
print(f"Sentence: {train_sentences[5]}")
print(f"Labels: {train_labels[5]}")
print(f"Vectorized labels: {label_vectorizer([train_labels[5]], tag_map)}")

Sentence: The party is divided over Britain 's participation in the Iraq conflict and the continued deployment of 8,500 British troops in that country .
Labels: O O O O O B-gpe O O O O B-geo O O O O O O O B-gpe O O O O O
Vectorized labels: [[16 16 16 16 16  3 16 16 16 16  2 16 16 16 16 16 16 16  3 16 16 16 16 16]]


In [12]:
#Building the dataset

def generate_dataset(sentences, labels, sentence_vectorizer, tag_map):
    """
    Input: sentences           -- array of sentence strings
           labels              -- array of label strings aligned with sentences
           sentence_vectorizer -- callable turning sentences into token ids
           tag_map             -- dict mapping tag strings to ids
    Output: tf.data.Dataset of (vectorized sentence, vectorized labels) pairs
    """
    sentence_ids = sentence_vectorizer(sentences)
    label_ids = label_vectorizer(labels,tag_map)

    # Slice both tensors along the first axis so each element pairs one
    # sentence with its labels.
    return tf.data.Dataset.from_tensor_slices((sentence_ids, label_ids))

In [None]:
# Build one dataset per split. All three reuse the vectorizer and tag map
# that were built from the training data.
train_dataset = generate_dataset(train_sentences,train_labels,sentence_vectorizer, tag_map)
test_dataset = generate_dataset(test_sentences,test_labels,sentence_vectorizer, tag_map)
val_dataset = generate_dataset(val_sentences,val_labels,sentence_vectorizer, tag_map)

In [None]:
# Number of distinct tags found in the training labels.
len_tag = len(tag_set)

In [None]:
# Size of the vocabulary reported by the vectorizer adapted on the training sentences.
vocab_size=len(train_vocab)

In [None]:
#building a model

def NER(vocab_size,len_tag):
    """
    Build the (uncompiled) sequence-tagging model.

    Layers:
    - Embedding: token ids -> 64-dim vectors; mask_zero=True so id 0 is
      treated as padding by the layers that follow
    - LSTM: 128 units, returns one output per time step
    - Dense: one score per tag, passed through log softmax

    Input: vocab_size -- number of entries in the sentence vocabulary
           len_tag    -- number of distinct tags
    Output: tf.keras.Sequential model
    """

    model = tf.keras.Sequential(name = 'sequential')
    # input_dim is vocab_size+1, i.e. one slot more than the vocabulary size.
    model.add(tf.keras.layers.Embedding(input_dim=vocab_size+1,output_dim=64,mask_zero=True))
    model.add(tf.keras.layers.LSTM(units=128, return_sequences=True))
    # Log softmax, as the architecture above specifies. A ReLU here would
    # clip every negative score to zero before the loss sees it.
    model.add(tf.keras.layers.Dense(units=len_tag,activation=tf.nn.log_softmax))

    return model

In [2]:
# Instantiate the model. `vocab_size` and `len_tag` are defined in earlier
# cells; running this cell before them raises the NameError recorded below.
NER(vocab_size,len_tag)

NameError: name 'vocab_size' is not defined

In [None]:
#masked loss

def masked_loss(y_true,y_pred):
    """
    Sparse categorical cross-entropy that skips positions labelled -1.

    Input: y_true -- integer class ids, with -1 marking positions to ignore
           y_pred -- per-class scores, treated as logits
    Output: the loss value computed by the Keras loss object
    """
    return tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True,
        ignore_class=-1,
    )(y_true, y_pred)

In [None]:
# Smoke-test the loss on four examples with three classes each.
# None of the labels is -1, so no position is masked in this check.
true_labels = [0,1,2,0]
predicted_logits = [[0.1,0.6,0.3] , [0.2,0.7,0.1], [0.1, 0.5,0.4], [0.4,0.4,0.2]]
print(masked_loss(true_labels, predicted_logits))

In [1]:
#masked accuracy

def masked_accuracy(y_true,y_pred):
    """
    Accuracy over non-padding positions only:
    (number of positions where y_true == predicted class, excluding padding)
    / (number of positions in y_true that are not padding)

    Input: y_true -- integer tag ids, with -1 marking padding positions
           y_pred -- per-tag scores; the predicted class is the argmax over
                     the last axis
    Output: scalar float tensor; 0.0 when every position is padding
    """

    y_true = tf.cast(y_true, tf.float32)

    # 1.0 where the label is a real tag, 0.0 where it is the -1 padding value.
    mask = tf.cast(tf.not_equal(y_true, -1), tf.float32)

    # Reduce the per-tag scores to the most likely tag id per position.
    y_pred_class = tf.cast(tf.math.argmax(y_pred, axis=-1), tf.float32)

    # Count matches, zeroing out anything that falls on a padding position.
    matches = tf.cast(tf.equal(y_true, y_pred_class), tf.float32) * mask

    # divide_no_nan returns 0 instead of NaN when there are no unmasked positions.
    return tf.math.divide_no_nan(tf.reduce_sum(matches), tf.reduce_sum(mask))

SyntaxError: unexpected EOF while parsing (550523838.py, line 4)