# BERT

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.3-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.5 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 32.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 34.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.1-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 5.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [None]:
import pandas as pd

# Read the labelled training set from the mounted Drive folder.
df = pd.read_csv("/content/drive/MyDrive/techsoc-analytics-21-22/data/train.csv")

# Report the number of rows read, with a thousands separator.
print(f'Number of training sentences: {df.shape[0]:,}\n')

# Display ten randomly chosen rows as the cell output.
df.sample(10)

Number of training sentences: 35,112



Unnamed: 0,content,title,uid,target_ind
31237,Look no further than the Propet Life Walker sn...,Propet Men's Life Walker Sneaker,B000BVZYPE,302
2292,"Johann and Robin, a young gay couple, travel t...",Light Gradient (2010),B003YMR9IC,455
25512,SPECIAL EDITION FEATURES: - Digitally remaster...,Polymorph (Special Edition),B0007A108Y,477
34403,Based on the chilling bestseller by Stephen Ki...,Misery [VHS] (1990),6305074380,40
10274,The pyramid has always held a strange fascinat...,Secret Power of Pyramids,0449132668,428
22600,Rod and Katherine Wright are enjoying their re...,A Golden Christmas 2: The Second Tail [Blu-ray],B008IG0EWM,486
29024,"This silicone O-ring is red in color, round in...","-028 Silicone O-Ring, 70A Durometer, Red, 1-3/...",B000FMYNXI,348
19516,9 piece set includes hex driver sockets 2mm th...,VIM Hand Tools SHM400 9 Pc. Metric Stubby Hex ...,B0002UNOME,372
353,Malice Spreads Like DiseaseYosuke Kobayashi a ...,MPD Psycho: The Complete Miniseries (2006),B000I0QLSW,478
638,Our finest shirt fabric: Imperial 100s cotton ...,Amazon.com: Imperial 100s European Straight Co...,B00008JP3W,131


In [None]:
# Model input text = title immediately followed by content.
# Missing values are replaced with '' first: adding NaN to a string yields NaN,
# and a NaN entry is not a string the tokenizer can encode.
df['info'] = df['title'].fillna('') + df['content'].fillna('')

# Raw arrays consumed by the tokenization cell below.
sentences = df['info'].values
labels = df['target_ind'].values

In [None]:
from transformers import BertTokenizer

print('Loading BERT tokenizer...')

# Tokenizer matching the 'bert-base-uncased' checkpoint; input text is lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

Loading BERT tokenizer...


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
import torch

# Tokenize all training texts with a single batched tokenizer call rather than
# calling the deprecated `encode_plus` once per sentence. Every sequence gets
# the special tokens added and is truncated / padded to exactly 256 token ids,
# so the returned tensors already have one row per sentence.
encoded = tokenizer(
    list(sentences),
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

# Token ids and the matching attention masks, one row per sentence.
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']

# Class indices as a tensor so they can be bundled into a TensorDataset.
labels = torch.tensor(labels)

# Sanity check: show the first text next to its token ids.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Original:  Amazon.com: Wrangler Men's Rugged Wear Relaxed Fit Jean: ClothingPremium quality five pocket jean from Wrangler Rugged Wear. This Relaxed Fit Jean is made from 100% cotton denim for durability with extra room in the seat and thigh for comfort.	Men's Wrangler Trail Trekker Relaxed Fit Jeans Set out on a long hike, or kick back for an afternoon full of watching college football from the comfort of your own home. These Wrangler Trail Trekker Relaxed Fit Jeans are up for anything you are! Check 'em out: 100% cotton denim construction; Relaxed 5 pocket style; Easy entry, extra deep front pockets; Solid brass YKK zip fly; Leather waistband patch; Fit easily over boots; Machine wash / dry. Imported. State Color and Size! Get yours today! Men's Wrangler 36" Inseam Trail Trekker Relaxed Fit Jeans
Token IDs: tensor([  101,  9733,  1012,  4012,  1024, 23277,  5654,  3917,  2273,  1005,
         1055, 17638,  4929,  8363,  4906,  3744,  1024,  5929, 28139, 27759,
         3737,  2274,  

In [None]:
import torch
from torch.utils.data import TensorDataset, random_split

# Bundle inputs, masks and labels so they are indexed and split together.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 90% of the examples go to training, the remainder to validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Split with a dedicated fixed-seed generator. The global seeds are only set
# later (in the training cell) and training resumes from saved checkpoints, so
# an unseeded split would differ between sessions and could place examples the
# checkpoint was already trained on into the validation set.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=split_generator,
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

31,600 training samples
3,512 validation samples


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Number of examples per batch for both loaders.
batch_size = 16

# Training batches are drawn in random order (shuffle=True makes the DataLoader
# build a RandomSampler over the dataset).
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation batches are read in dataset order (shuffle=False makes the
# DataLoader build a SequentialSampler).
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
import torch
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Pretrained BERT encoder with a classification head on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 500,              # size of the classification head's output
    output_attentions = False,     # do not return attention weights
    output_hidden_states = False,  # do not return per-layer hidden states
)

# Move the model to the GPU when one is available and stay on the CPU otherwise
# (a bare `model.cuda()` raises on machines without CUDA). `.to()` returns the
# model, so the cell still displays the module structure.
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:

from transformers import get_linear_schedule_with_warmup

# Number of passes over the training data.
epochs = 4

# AdamW over every model parameter.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# The scheduler is stepped once per batch, so its horizon is
# (batches per epoch) * (number of epochs).
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
import numpy as np


def flat_accuracy(preds, labels):
    """Return the fraction of predictions that match their labels.

    Args:
        preds: array of scores; the predicted class of each row is the index
            of its largest value along axis 1.
        labels: array of true class indices; it is flattened before comparison.

    Returns:
        Number of matching positions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    n_hits = np.sum(predicted == expected)
    return n_hits / len(expected)

In [None]:
import time
import datetime

def format_time(elapsed):
    """Format a duration in seconds as an ``h:mm:ss`` string.

    The duration is rounded to the nearest whole second and rendered through
    ``datetime.timedelta``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [None]:
import random
import numpy as np

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed Python, NumPy and PyTorch (CPU and all GPUs) with the same value.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One dict of metrics is appended here at the end of every epoch.
training_stats = []

# Wall-clock start of the whole run, used for the final "Total training took" line.
total_t0 = time.time()

# Resume from previously saved weights. map_location remaps the stored tensors
# onto the selected device, so a checkpoint written on a GPU also loads on a
# CPU-only machine.
model.load_state_dict(
    torch.load(
        '/content/drive/MyDrive/techsoc-analytics-21-22/weights/bert/weights_bert_epochs7.pth',
        map_location=device,
    )
)
# Keep the model on the same device the batches are moved to below.
model.to(device)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Start time of this epoch's training phase.
    t0 = time.time()

    # Sum of per-batch losses and list of per-batch accuracies for this epoch.
    total_train_loss = 0
    total_train_accuracy = []

    # Training mode (enables dropout).
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress report every 40 batches (skipping the very first step):
        # elapsed time and the running mean of the per-batch accuracies.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)

            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            print(f'acc = {(np.mean(total_train_accuracy))}')

        # Each batch is (input ids, attention mask, labels); move all to the device.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Forward pass; because labels are supplied the result carries the loss
        # alongside the logits.
        result = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss
        logits = result.logits

        total_train_loss += loss.item()

        # Backward pass.
        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

        # Move predictions and labels to the CPU to score this batch.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_train_accuracy.append(flat_accuracy(logits, label_ids))

    # Mean of the per-batch training losses for this epoch.
    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode (disables dropout).
    model.eval()

    # Validation metrics are accumulated per sample rather than per batch: the
    # last batch can be smaller than the others, and averaging per-batch means
    # would give its samples extra weight.
    total_eval_accuracy = 0
    total_eval_loss = 0
    total_eval_samples = 0

    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            result = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        loss = result.loss
        logits = result.logits

        n_in_batch = b_labels.size(0)

        # Assumes the returned loss is the mean over the batch, so multiplying
        # by the batch size recovers the summed loss.
        total_eval_loss += loss.item() * n_in_batch

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # flat_accuracy is a fraction; scale by batch size to get a hit count.
        total_eval_accuracy += flat_accuracy(logits, label_ids) * n_in_batch
        total_eval_samples += n_in_batch

    # Sample-weighted accuracy over the whole validation set.
    avg_val_accuracy = total_eval_accuracy / total_eval_samples
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Sample-weighted loss over the whole validation set.
    avg_val_loss = total_eval_loss / total_eval_samples

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record this epoch's metrics.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    40  of  1,975.    Elapsed: 0:00:52.
acc = 0.5546875
  Batch    80  of  1,975.    Elapsed: 0:01:44.
acc = 0.55078125
  Batch   120  of  1,975.    Elapsed: 0:02:36.
acc = 0.553125
  Batch   160  of  1,975.    Elapsed: 0:03:28.
acc = 0.5515625
  Batch   200  of  1,975.    Elapsed: 0:04:20.
acc = 0.5515625
  Batch   240  of  1,975.    Elapsed: 0:05:12.
acc = 0.5484375
  Batch   280  of  1,975.    Elapsed: 0:06:04.
acc = 0.553125
  Batch   320  of  1,975.    Elapsed: 0:06:56.
acc = 0.551953125
  Batch   360  of  1,975.    Elapsed: 0:07:48.
acc = 0.5486111111111112
  Batch   400  of  1,975.    Elapsed: 0:08:40.
acc = 0.54703125
  Batch   440  of  1,975.    Elapsed: 0:09:32.
acc = 0.5492897727272728
  Batch   480  of  1,975.    Elapsed: 0:10:24.
acc = 0.5502604166666667
  Batch   520  of  1,975.    Elapsed: 0:11:16.
acc = 0.5501201923076923
  Batch   560  of  1,975.    Elapsed: 0:12:08.
acc = 0.5474330357142857
  Batch   600  of  1,975.    Elapsed: 0:13:01.
acc = 0.548

KeyboardInterrupt: ignored

In [None]:
# Persist the model's state_dict (its parameter tensors) to Drive.
torch.save(
    model.state_dict(),
    '/content/drive/MyDrive/techsoc-analytics-21-22/weights/bert/weights_bert_epochs8.pth',
)

In [None]:
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd

# Read the unlabelled test set from the mounted Drive folder.
df = pd.read_csv("/content/drive/MyDrive/techsoc-analytics-21-22/data/test.csv")

print('Number of test sentences: {:,}\n'.format(df.shape[0]))

# Same text construction as for training: title immediately followed by content.
# Missing values are replaced with '' first: adding NaN to a string yields NaN,
# and a NaN entry is not a string the tokenizer can encode.
df['info'] = df['title'].fillna('') + df['content'].fillna('')

sentences = df['info'].values
# The test set has no targets; placeholder zeros keep the dataset layout
# (ids, mask, labels) identical to the one used for training.
labels = [0]*len(df)

# Tokenize all test texts with a single batched tokenizer call rather than
# calling the deprecated `encode_plus` once per sentence. Every sequence is
# truncated / padded to exactly 256 token ids, as was done for training.
encoded = tokenizer(
    list(sentences),
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']
labels = torch.tensor(labels)

batch_size = 16

prediction_data = TensorDataset(input_ids, attention_masks, labels)

# shuffle=False keeps predictions in the same order as the rows of the CSV.
prediction_dataloader = DataLoader(prediction_data, batch_size=batch_size, shuffle=False)

Number of test sentences: 8,106



In [None]:

import torch.nn.functional as F

# Load the fine-tuned weights. map_location remaps the stored tensors onto the
# selected device, so a checkpoint written on a GPU also loads on a CPU-only
# machine.
model.load_state_dict(
    torch.load(
        '/content/drive/MyDrive/techsoc-analytics-21-22/weights/bert/weights_bert_epochs8.pth',
        map_location=device,
    )
)
print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

# Evaluation mode (disables dropout).
model.eval()

# Per-batch class probabilities and the (placeholder) labels, in dataset order.
predictions , true_labels = [], []

for batch in prediction_dataloader:

    # Each batch is (input ids, attention mask, labels); move all to the device.
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # Inference only: no gradients needed.
    with torch.no_grad():
        result = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       return_dict=True)

    logits = result.logits

    # Turn logits into per-class probabilities and move them to the CPU right
    # away, so the collected results can later be converted to NumPy and do not
    # occupy device memory.
    predictions.append(F.softmax(logits, dim = 1).cpu())

    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 8,106 test sentences...
    DONE.


In [None]:
# Stack the per-batch probability tensors into one (examples x classes) array.
# The tensor is detached and moved to the CPU before the NumPy conversion,
# because a tensor living on a CUDA device cannot be converted directly.
probs_bert = torch.cat(predictions).detach().cpu().numpy()

# Predicted class = index of the highest probability in each row.
preds_bert = np.argmax(probs_bert, axis = 1)

In [None]:
# Re-read the test set to pair every uid with its predicted class index.
df = pd.read_csv("/content/drive/MyDrive/techsoc-analytics-21-22/data/test.csv")

submission_weights = pd.DataFrame(
    {
        'uid': list(df['uid']),
        'target_ind': preds_bert,
    }
)

# NOTE(review): the DataFrame index is written as an extra first column —
# confirm that the submission format expects it.
submission_weights.to_csv('submission_bert_8.csv')