In [1]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil --q
!pip install psutil --q
!pip install humanize --q
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

  Building wheel for gputil (setup.py) ... [?25l[?25hdone
Gen RAM Free: 12.3 GB  | Proc size: 96.4 MB
GPU RAM Free: 15109MB | Used: 0MB | Util   0% | Total 15109MB


In [2]:
# Mount Google Drive
from google.colab import drive # import drive from google colab

ROOT = "/content/drive/"     # default location for the drive
drive.mount(ROOT)           # we mount the google drive at /content/drive

Mounted at /content/drive/


In [3]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [4]:
!pip install transformers --q

[K     |████████████████████████████████| 4.9 MB 7.9 MB/s 
[K     |████████████████████████████████| 120 kB 65.4 MB/s 
[K     |████████████████████████████████| 6.6 MB 45.0 MB/s 
[?25h

In [6]:
import pandas as pd

train = pd.read_csv("train.csv", usecols=["text_a", "label"])
test = pd.read_csv("test.csv")
print(train)
print(test)

                                                  text_a label
0      betewe buka twitter cuman ngetweet liat home b...    no
1      mas piyuuu mugo2 corona tuh mulut tersumpal ma...    no
2      e100ss gini buka informasi sejelas nya identit...   yes
3      neng solo wes ono terduga corona cobo neng ati...    no
4      midiahn nii akun gak takut takut nya isu coron...    no
...                                                  ...   ...
21596  depok panas ga karuan kereta sampe pasming huj...    no
21597  oxfara arie kriting yg lebi goblo nya orang ke...    no
21598  virus corona menyaba depok cuci tangan makan n...    no
21599  mata sipit tinggal depok udah abis dah bahan c...    no
21600       i ak batuk pilek pusing demam anjir ak depok    no

[21601 rows x 2 columns]
                                                 text_a label
0                               jek dajal ga depok bang    no
1     detikcom untung depok masuk wilayah nya ridwan...    no
2     df dom jakarta depok yg gu

In [7]:
from sklearn import preprocessing

# dict mapping
labels = ["no", "yes"]
le = preprocessing.LabelEncoder()
le.fit(labels)

train["label"] = le.transform(train["label"])
test["label"] = le.transform(test["label"])
print(train)
print(test)

                                                  text_a  label
0      betewe buka twitter cuman ngetweet liat home b...      0
1      mas piyuuu mugo2 corona tuh mulut tersumpal ma...      0
2      e100ss gini buka informasi sejelas nya identit...      1
3      neng solo wes ono terduga corona cobo neng ati...      0
4      midiahn nii akun gak takut takut nya isu coron...      0
...                                                  ...    ...
21596  depok panas ga karuan kereta sampe pasming huj...      0
21597  oxfara arie kriting yg lebi goblo nya orang ke...      0
21598  virus corona menyaba depok cuci tangan makan n...      0
21599  mata sipit tinggal depok udah abis dah bahan c...      0
21600       i ak batuk pilek pusing demam anjir ak depok      0

[21601 rows x 2 columns]
                                                 text_a  label
0                               jek dajal ga depok bang      0
1     detikcom untung depok masuk wilayah nya ridwan...      0
2     df dom jaka

In [8]:
from nltk.tokenize import word_tokenize
import nltk
import collections
nltk.download('punkt')

def get_frequent_word(df):
    text = " ".join(list(df['text_a'].str.lower()))
    word_list = word_tokenize(text)
    word_count = dict(collections.Counter(word_list))
    d_word_freq = pd.DataFrame(data = {'word': list(word_count.keys()), 'freq': list(word_count.values())})
    
    return d_word_freq

def cleansing(text, stopword = None):
    word_list = word_tokenize(text.lower())
    word_list = [word for word in word_list if len(word) > 2]
    word_list = [word for word in word_list if word.isalnum()]
    if stopword == None:
        text = ' '.join(word_list)
    else:
        word_list = [word for word in word_list if word not in stopword]
        text = ' '.join(word_list)
                
    return text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
d_w_f = get_frequent_word(train)
# Check alphanumeric
d_w_f['is_alnum'] = d_w_f.word.str.isalnum()
# select only alphanumeric word (ignore punctuation)
d_w_f_selected = d_w_f[d_w_f['is_alnum'] == True].sort_values(by = 'freq', ascending = False)
print(d_w_f_selected.head(15))
print(d_w_f_selected.tail(15))

            word  freq  is_alnum
8         corona  8172      True
36           nya  4813      True
588            t  4371      True
587        https  4347      True
589           co  4314      True
13            yg  4011      True
238        covid  2768      True
171        virus  2662      True
239           19  2622      True
677        depok  2068      True
3341        psbb  1987      True
146           ya  1873      True
3708  distancing  1760      True
118           ga  1537      True
15           aja  1508      True
                 word  freq  is_alnum
15843      nlmz7ib4od     1      True
29082           loker     1      True
29081     audyaulidya     1      True
29078              gc     1      True
15848      ulangtahun     1      True
29077       ngerandom     1      True
15846      sejenisnya     1      True
29075       sometimes     1      True
29074            ixyg     1      True
29073           picik     1      True
15847  omyangbaikhati     1      True
29071         ke

In [51]:
# Create stopwords list
stopwords = list(d_w_f_selected[(d_w_f_selected['freq'].between(4000, 5000)) | (d_w_f_selected['freq'] < 2)].word)
custom_stopwords = ['ya', 'aja', 'yg', 'sih', 'ga', 'gak','gy']
stopwords = stopwords + custom_stopwords
print(stopwords[:50])
print("stopwords count: {}".format(len(stopwords)))

['nya', 't', 'https', 'co', 'yg', 'angele', 'gakepikiran', 'tarap', 'jannotama', 'titah', 'perantara', 'ck', 'dimirip2kn', 'siamini', 'maranatha', 'sbln', 'yudiistiono09', 'tabliki', 'gambarx', 'delok2', 'ngalahkan', 'hndak', 'immaranatha', 'r2bvpvizoc', 'vrpelelxyx', 'ypdg', 'direken', 'sistemik', 'sambal', 'zzasqia', 'memamg', 'sebarannya', 'ciao', 'mop', 'scanerio', 'viruspun', 'fadliyzon', 'brsma', 'maxiswecare', 'kkrytqxcyk', 'kegagaln', '7xk2fkwel2', 'kebijakanpemimpindiktator', 'ckck', 'jamat', 'nyari2', 'budiadiputro', 'undisclosedonny', 'kuk', 'tikar']
stopwords count: 23575


In [11]:
from tqdm import tqdm

for i in tqdm(range(len(train))):
    train.loc[i, 'text_cleansing'] = cleansing(train.loc[i, 'text_a'], stopword=stopwords)

for i in tqdm(range(len(test))):
    test.loc[i, 'text_cleansing'] = cleansing(test.loc[i, 'text_a'], stopword=stopwords)

print(train)
print(test)

100%|██████████| 21601/21601 [02:48<00:00, 128.44it/s]
100%|██████████| 2800/2800 [00:19<00:00, 145.37it/s]

                                                  text_a  label  \
0      betewe buka twitter cuman ngetweet liat home b...      0   
1      mas piyuuu mugo2 corona tuh mulut tersumpal ma...      0   
2      e100ss gini buka informasi sejelas nya identit...      1   
3      neng solo wes ono terduga corona cobo neng ati...      0   
4      midiahn nii akun gak takut takut nya isu coron...      0   
...                                                  ...    ...   
21596  depok panas ga karuan kereta sampe pasming huj...      0   
21597  oxfara arie kriting yg lebi goblo nya orang ke...      0   
21598  virus corona menyaba depok cuci tangan makan n...      0   
21599  mata sipit tinggal depok udah abis dah bahan c...      0   
21600       i ak batuk pilek pusing demam anjir ak depok      0   

                                          text_cleansing  
0      betewe buka twitter cuman ngetweet liat home b...  
1      mas piyuuu mugo2 corona tuh mulut tersumpal co...  
2      e100ss gini




In [12]:
from transformers import BertTokenizer

print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Loading BERT tokenizer...


Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [13]:
import statistics
sent_length = []

# For every sentence...
for sentence in train["text_cleansing"]:
    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sentence, add_special_tokens=True)
    sent_length.append(len(input_ids))

print('Average length = ', sum(sent_length)/len(sent_length))
print('Median length = ', statistics.median(sent_length))
print('Max length = ', max(sent_length))

Token indices sequence length is longer than the specified maximum sequence length for this model (1003 > 512). Running this sequence through the model will result in indexing errors


Average length =  26.888801444377574
Median length =  23
Max length =  1003


In [14]:
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in train["text_cleansing"]:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(train["label"])

# Print sentence 0, now as a list of IDs.
print('Original: ', train["text_cleansing"][0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  betewe buka twitter cuman ngetweet liat home berita corona panik kepikiran ndamau buka2 home aware stay home nda rumah kalo nda penting2 banget
Token IDs: tensor([  101, 13009, 26127, 10112, 11499, 10371,   188, 56082, 10877, 16008,
        10206, 10743, 10308, 12577, 10308, 11614, 10526, 11816, 84836, 31206,
        97586, 10174, 11163, 70583, 27974, 24477, 89793, 11499, 10371, 10729,
        11816, 66625, 29597, 11816, 24477, 22740, 10730, 10715, 24477, 34162,
        10729, 17937, 10308,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     

In [15]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 80-20 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('training samples count: {}'.format(train_size))
print('validation samples count: {}'.format(val_size))

training samples count: 17280
validation samples count: 4321


In [16]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [32]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased', # Use the 12-layer BERT model, with an cased vocab.
    num_labels = 2, 
    output_attentions = False, # return attentions weights
    output_hidden_states = False, # returns all hidden-states
)

# Tell pytorch to run this model on the GPU.
model.cuda()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [33]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (119547, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              

In [34]:
from transformers import get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8
                )

epochs = 3

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
print('Jumlah batch :', len(train_dataloader))
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

Jumlah batch : 540




In [35]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [36]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [39]:
import random
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# List variable for store training and validation loss, validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 20 batches.
        if step % 20 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a backward pass
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # token_type_ids is same as the "segment ids", which differentiates 
        # sentence 1 and 2 in sentence-pair tasks
        b_model = model(b_input_ids, 
                             token_type_ids=None,
                             attention_mask=b_input_mask, 
                             labels=b_labels)

        loss = b_model.loss
        logits = b_model.logits

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. 
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode (batchnorm, dropout disable)
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # Deactivate autograd, it will reduce memory usage and speed up computations
        # but you won’t be able to backprop (which you don’t want in an eval script).
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            b_model = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            
        # Accumulate the validation loss.
        loss = b_model.loss
        logits = b_model.logits
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'Epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Validation Loss': avg_val_loss,
            'Validation Accuracy': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    20  of    540.    Elapsed: 0:00:25.
  Batch    40  of    540.    Elapsed: 0:00:50.
  Batch    60  of    540.    Elapsed: 0:01:16.
  Batch    80  of    540.    Elapsed: 0:01:42.
  Batch   100  of    540.    Elapsed: 0:02:08.
  Batch   120  of    540.    Elapsed: 0:02:34.
  Batch   140  of    540.    Elapsed: 0:03:01.
  Batch   160  of    540.    Elapsed: 0:03:27.
  Batch   180  of    540.    Elapsed: 0:03:54.
  Batch   200  of    540.    Elapsed: 0:04:21.
  Batch   220  of    540.    Elapsed: 0:04:48.
  Batch   240  of    540.    Elapsed: 0:05:14.
  Batch   260  of    540.    Elapsed: 0:05:41.
  Batch   280  of    540.    Elapsed: 0:06:08.
  Batch   300  of    540.    Elapsed: 0:06:35.
  Batch   320  of    540.    Elapsed: 0:07:01.
  Batch   340  of    540.    Elapsed: 0:07:28.
  Batch   360  of    540.    Elapsed: 0:07:55.
  Batch   380  of    540.    Elapsed: 0:08:22.
  Batch   400  of    540.    Elapsed: 0:08:48.
  Batch   420  of    540.    Elapsed: 0:09:15.


In [40]:
import pandas as pd

# Display floats with two decimal places.
pd.set_option('precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('Epoch')

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Validation Loss,Validation Accuracy,Training Time,Validation Time
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.31,0.32,0.87,0:11:55,0:01:07
2,0.24,0.34,0.88,0:12:01,0:01:06
3,0.21,0.34,0.88,0:12:01,0:01:06


In [47]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

test_sentences = test["text_cleansing"]
test_labels = test["label"]

# For every sentence...
for sent in test_sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(test_labels)

# Set the batch size.  
batch_size = 32  

# Create the DataLoader.
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)



In [52]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 2,800 test sentences...
    DONE.


In [53]:
# Calculate accuracy for test dataset
total_accuracy = 0
for batch_num in range(len(predictions)): 
  total_accuracy += flat_accuracy(predictions[batch_num], true_labels[batch_num])

total_accuracy = total_accuracy/len(predictions)
print('Accuracy on test dataset: {}'.format(total_accuracy))

Accuracy on test dataset: 0.8490767045454546


In [46]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = 'drive/My Drive/DSI/model/'

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to drive/My Drive/DSI/model/


('drive/My Drive/DSI/model/tokenizer_config.json',
 'drive/My Drive/DSI/model/special_tokens_map.json',
 'drive/My Drive/DSI/model/vocab.txt',
 'drive/My Drive/DSI/model/added_tokens.json')

In [None]:
# Load a trained model and vocabulary that you have fine-tuned
model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)

# Copy the model to the GPU.
model.to(device)