<a href="https://colab.research.google.com/github/bforoura/Transformers/blob/main/nlp_transformers_ch2_ex1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Activating the GPU
# Enable a GPU first: Main menu -> Runtime -> Change runtime type -> Hardware accelerator: GPU

import tensorflow as tf

# Ask TensorFlow for the name of the GPU it sees and stop the notebook
# right away when it is not the expected first GPU device.
EXPECTED_GPU_DEVICE = '/device:GPU:0'

device_name = tf.test.gpu_device_name()
gpu_found = device_name == EXPECTED_GPU_DEVICE
if not gpu_found:
  raise SystemError('GPU device not found')

print(f'Found GPU at: {device_name}')


Found GPU at: /device:GPU:0


In [4]:
#@title Installing the Hugging Face PyTorch Interface for Bert

!pip install -q transformers



In [5]:
#@title Importing the modules

import torch

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, BertConfig
from transformers import AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup


In [6]:
from tqdm import tqdm, trange

import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt



In [7]:
#@title Specify CUDA as device for Torch

# Use CUDA when PyTorch can see a CUDA device; otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
n_gpu = torch.cuda.device_count()

# torch.cuda.get_device_name() raises when no CUDA device is present, so it is
# only queried on the CUDA path. This keeps the cell consistent with the CPU
# fallback chosen for `device` above instead of crashing on CPU-only runtimes.
device_label = torch.cuda.get_device_name(0) if cuda_available else "cpu"

# Last expression of the cell: displayed as the cell output.
device_label


'Tesla T4'

In [14]:
#@title Loading the Dataset

# Remote tab-separated file; it is read without a header row, so the column
# names are supplied explicitly.
DATASET_URL = "https://raw.githubusercontent.com/PacktPublishing/Transformers-for-Natural-Language-Processing/main/Chapter02/in_domain_train.tsv"
COLUMN_NAMES = ['sentence_source', 'label', 'label_notes', 'sentence']

df = pd.read_csv(DATASET_URL, sep='\t', header=None, names=COLUMN_NAMES)

# Last expression of the cell: (rows, columns) of the loaded frame.
df.shape


(8551, 4)

In [19]:
# Preview ten randomly drawn rows; no random_state is passed, so the rows
# shown differ from run to run.
df.sample(n=10)

Unnamed: 0,sentence_source,label,label_notes,sentence
2489,l-93,1,,she mumbled .
4,gj04,1,,day by day the facts are getting murkier .
4729,ks08,1,,this bed was surely slept in by a huge guy las...
402,bc01,0,*,what the hell do you wonder how to say ?
2300,l-93,0,*,that acorn will grow from a seed into an oak t...
4021,ks08,0,*,the foxes seem compatible for the chickens .
1967,r-67,0,*,i deny that that bob has any money is certain .
733,bc01,0,*,john was cost $ 10 by the book .
6696,m_02,1,,emma gave bad advice to harriet .
5245,kl93,0,*,almost an owl hunts mice .


In [26]:
#@title Creating sentence, label lists and adding Bert tokens

# Wrap every raw sentence in the [CLS] ... [SEP] markers for BERT.
# str.join (like the `+` concatenation it replaces) raises TypeError if a
# sentence is not a string, so non-text rows are not silently accepted.
raw_sentences = df["sentence"].values
sentences = [" ".join(("[CLS]", text, "[SEP]")) for text in raw_sentences]

# Labels are taken as-is from the `label` column, in the same row order.
labels = df["label"].values

# Show the first ten wrapped sentences as the cell output.
sentences[:10]

["[CLS] our friends wo n't buy this analysis , let alone the next one we propose . [SEP]",
 "[CLS] one more pseudo generalization and i 'm giving up . [SEP]",
 "[CLS] one more pseudo generalization or i 'm giving up . [SEP]",
 '[CLS] the more we study verbs , the crazier they get . [SEP]',
 '[CLS] day by day the facts are getting murkier . [SEP]',
 "[CLS] i 'll fix you a drink . [SEP]",
 '[CLS] fred watered the plants flat . [SEP]',
 '[CLS] bill coughed his way out of the restaurant . [SEP]',
 "[CLS] we 're dancing the night away . [SEP]",
 '[CLS] herman hammered the metal flat . [SEP]']

In [30]:
#@title Activating the BERT Tokenizer

# Load the tokenizer published with the 'bert-base-uncased' checkpoint,
# with lower-casing of the input enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# One token list per sentence, in the same order as `sentences`.
tokenized_texts = list(map(tokenizer.tokenize, sentences))

# Sanity check: show how the first sentence was split.
print("Tokenize the first sentence:")
print(tokenized_texts[0])


Tokenize the first sentence:
['[CLS]', 'our', 'friends', 'wo', 'n', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.', '[SEP]']
