In [1]:
from configuration import FilePaths, Config
import tensorflow as tf
import numpy as np

import pickle

FILE_PATHS = FilePaths()


def _load_pickle(path):
    """Return the object deserialized from the pickle file at ``path``."""
    # NOTE: pickle.load can execute arbitrary code while deserializing;
    # only point this at files produced by this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# import saved data from pickle files (train / test / validation splits)
x_train = _load_pickle(FILE_PATHS.X_TRAIN)
x_test = _load_pickle(FILE_PATHS.X_TEST)
y_train = _load_pickle(FILE_PATHS.Y_TRAIN)
y_test = _load_pickle(FILE_PATHS.Y_TEST)
x_val = _load_pickle(FILE_PATHS.X_VAL)
y_val = _load_pickle(FILE_PATHS.Y_VAL)
    

In [10]:
def make_embedding_matrix(train_samples, embeddings_index):
    """
    Build the embedding matrix used by the embedding layer, together with the
    TextVectorization layer fitted on the training text.

    The vectorizer is adapted on ``train_samples``. For every token of its
    vocabulary that is present in ``embeddings_index``, the pre-trained vector is
    copied into the matrix row whose index equals the token's vocabulary index.

    Parameters:
        train_samples: list of strings in the training dataset
        embeddings_index: Python dictionary mapping a word to its embedding vector

    Returns:
        embedding_matrix: array with the dimensions (num_tokens, embedding_dim), where num_tokens is the size of the vectorizer vocabulary and embedding_dim is the number of components of the vectors in embeddings_index (50, 100, 200 or 300 for GloVe)
        vectorizer: the adapted TextVectorization layer

    Raises:
        ValueError: if embeddings_index is empty, so the embedding dimension cannot be inferred
    """
    # Fail fast, before the (comparatively expensive) adapt step.
    if not embeddings_index:
        raise ValueError(
            "embeddings_index is empty; cannot infer the embedding dimension"
        )

    vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=Config.max_features, output_sequence_length=Config.max_len
    )
    text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(Config.batch_size)
    vectorizer.adapt(text_ds)

    voc = vectorizer.get_vocabulary()
    num_tokens = len(voc)

    # Infer the vector size from any stored vector instead of relying on a
    # specific word (previously the hard-coded key "the") being present.
    embedding_dim = len(next(iter(embeddings_index.values())))

    hits = 0
    misses = 0

    #   creating an embedding matrix
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for i, word in enumerate(voc):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            # Words not found in embedding index will be all-zeros
            misses += 1

    print(f"Converted {hits} words ({misses} misses).")

    return embedding_matrix, vectorizer

In [11]:
#Load Glove Embeddings
# Each line of the file is "<word> <v1> <v2> ... <vN>" separated by single spaces.
embeddings_index = {}

# The context manager closes the file even if a line fails to parse
# (the handle was previously opened and never closed).
with open(FILE_PATHS.FILE_TO_GLOVE, 'r', encoding='utf8') as f:
    for line in f:
        splitLine = line.split(' ')
        word = splitLine[0]                                  # the first entry is the word
        coefs = np.asarray(splitLine[1:], dtype='float32')   # these are the vectors representing word embeddings
        embeddings_index[word] = coefs
print("Glove data loaded! In total:",len(embeddings_index)," words.")

Glove data loaded! In total: 400000  words.


In [13]:
# Only the first element of each training record is passed on as the text sample.
embedding_matrix, vectorizer = make_embedding_matrix(
    [record[0] for record in x_train],
    embeddings_index=embeddings_index,
)

Converted 4076 words (924 misses).


In [14]:
# Display the vocabulary learned by the adapted vectorizer
# (in the recorded output the first two entries are '' and '[UNK]').
vectorizer.get_vocabulary()

['',
 '[UNK]',
 'dog',
 'per',
 'x',
 'of',
 'tablet',
 'o',
 'nobivac',
 'treatment',
 '1',
 'plan',
 'give',
 '2',
 'for',
 'checked',
 'only',
 '4',
 'date',
 'vaccination',
 'mrcvs',
 'examination',
 'animal',
 'b',
 '100',
 'nad',
 'keep',
 'disp',
 'reason',
 'presc',
 'daily',
 'expiry',
 'fee',
 'appointment',
 'notes',
 'check',
 '3',
 'doc',
 'vacc',
 'out',
 'no',
 'reach',
 'children',
 'sight',
 'lepto',
 'one',
 'days',
 'food',
 'room',
 'normal',
 'instructions',
 'ok',
 'exam',
 'assessment',
 'kc',
 'tabs',
 '50',
 'dispensed',
 'owner',
 'ml',
 'back',
 'well',
 'health',
 'prescribed',
 'by',
 'label',
 'ear',
 'adv',
 'every',
 'fine',
 'history',
 'booster',
 'recheck',
 'dispensing',
 'dhp',
 'ears',
 'weeks',
 'bar',
 'resexam',
 'metacam',
 'discount',
 '10',
 'twice',
 'eye',
 '1yr',
 'skin',
 'report',
 'advised',
 'injection',
 'tablets',
 '10ml',
 'patient',
 'docexaminations',
 'card',
 'months',
 'varexaminations',
 'varvaccinations',
 'today',
 'inj',
 '

In [17]:
# Wrap every training record in its own singleton list and report the stacked shape.
np.array([[record] for record in x_train]).shape

(2406, 1, 31)

In [20]:
# Materialize the training records as a plain list for display.
list(x_train)

[['Kennel Cough vaccination Reason : 2nd Vacc Appointment Notes : ay 24/7 History : Doing fine DUDE ok Examination : BAR H/L ok MMpink CRT < 2sec T38 Both testicles present Id chip checked Advised dental check 6 month , discussed dental care Advised get used eyes/ ears / feet/ mouth checked Assessment : Plan : Advise daily dental care . Advise POM-V parasiticides prescription validity period 1 year . Advise Complete Care Advise Specific Life Stage diets Vaccinations : Kennel Cough DOC-Procs ; Doc : Vaccination Certificate ; Package - Best Start In Life ; Vacc 1yr KC/ Bordetella / Kennel Cough ; Vacc 1yr Dog ; Doc : Patient Report Card Dog ; VAR-Vaccinations ; Miscellaneous Charge-Certificates ; Miscellaneous Charge-Certificates ; RES-Prep / Treatment Room ; Nobivac DHP ( 50 box ) ; Nobivac Lepto 4 ( 50 box ) ; Nobivac KC ( 25 box ) ; Advocate 100 medium dog ( 4-10kg ) per pipette ( 21 ) Ollie : Dog 2.00 x Advocate 100 medium dog ( 4-10kg ) per pipette ( 21 ) : Expiry Date : Apply 1 pip