In [9]:
# Base imports
import numpy as np
import pandas as pd
import os
import transformers

In [10]:
# Check for GPU
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if has_gpu else "cpu")

# Report which device was selected.
if has_gpu:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
else:
    print('Using CPU.')

There are 1 GPU(s) available.


In [11]:
# Read the training CSV and expose its `gloss` and `text` columns as arrays
asl_train = pd.read_csv('train.csv')
gloss_values, text_values = (
    asl_train[column].values for column in ('gloss', 'text')
)

In [12]:
# Set up the uncased BERT tokenizer
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Encode every sentence as a list of token IDs, with the tokenizer's
# special tokens added around it
speechID = [
    tokenizer.encode(sentence, add_special_tokens=True)
    for sentence in text_values
]

# Sanity check: show the first sentence next to its encoding
print('Original: ', text_values[0])
print('Token IDs:', speechID[0])

Original:  ï»¿membership of parliament see minutes

Token IDs: [101, 5779, 1997, 3323, 2156, 2781, 102]


In [13]:
# Pad or truncate every encoded sentence to exactly MAX_LEN token IDs
from keras_preprocessing.sequence import pad_sequences

MAX_LEN = 64

print('\n Truncating all sentences to %d values...' % MAX_LEN)
print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Remember which sentences are longer than MAX_LEN: the special tokens were
# already added during encoding, so post-truncation cuts off their trailing
# separator token. It is restored after padding.
# dtype=bool keeps the index valid even when there are no sentences at all.
was_truncated = np.array([len(ids) > MAX_LEN for ids in speechID], dtype=bool)

# Pad with the tokenizer's own padding ID rather than a hard-coded 0
speechID = pad_sequences(speechID, maxlen=MAX_LEN, dtype="long",
                         value=tokenizer.pad_token_id,
                         truncating="post", padding="post")

# Truncated rows must still end with the separator token
speechID[was_truncated, -1] = tokenizer.sep_token_id


 Truncating all sentences to 64 values...

Padding token: "[PAD]", ID: 0


In [14]:
# Build one attention mask per padded sentence:
# 1 where the token ID is positive (a real token), 0 where it is padding (ID 0)
masks = [
    [int(token_id > 0) for token_id in sentence_ids]
    for sentence_ids in speechID
]