<a href="https://colab.research.google.com/github/abyaadrafid/LDA_Lab_Defence/blob/main/BertClaudette.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
import os
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.optim import Adam

In [None]:
from tqdm import tqdm_notebook

In [None]:
# Mount Google Drive so the dataset folders under /content/drive/MyDrive are readable.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def get_sentences(path):
    """Read every line of every file in ``path`` into one flat list.

    Files are visited in sorted filename order. ``os.listdir`` on its own
    returns entries in arbitrary order, which would make the result (and its
    alignment with any separately loaded, per-file data) depend on the
    filesystem.

    Args:
        path: Directory holding the sentence files, one sentence per line.
            A trailing slash is optional.

    Returns:
        list[str]: All lines, ordered by filename and then by position in the
        file. Lines are returned exactly as read, trailing newline included.
    """
    sentences = []
    for filename in sorted(os.listdir(path)):
        # os.path.join works whether or not ``path`` ends with a separator.
        with open(os.path.join(path, filename), 'r') as f:
            sentences.extend(f)
    return sentences

In [None]:
def get_labels(path):
    """Read one integer label per line from every file in ``path``.

    Files are visited in sorted filename order. ``os.listdir`` on its own
    returns entries in arbitrary order, which would make the result (and its
    alignment with any separately loaded, per-file data) depend on the
    filesystem.

    Args:
        path: Directory holding the label files, one integer per line.
            A trailing slash is optional.

    Returns:
        list[int]: All labels, ordered by filename and then by position in the
        file.

    Raises:
        ValueError: If a line cannot be parsed as an integer. The message
            names the offending file and line number.
    """
    all_labels = []
    for filename in sorted(os.listdir(path)):
        # os.path.join works whether or not ``path`` ends with a separator.
        with open(os.path.join(path, filename), 'r') as f:
            for line_no, label in enumerate(f, start=1):
                try:
                    all_labels.append(int(label))
                except ValueError as exc:
                    raise ValueError(
                        f"{filename}, line {line_no}: invalid label {label!r}"
                    ) from exc
    return all_labels

In [None]:
# Load every line of every file in the Drive "Sentences" folder into one flat list.
all_sentences = get_sentences("/content/drive/MyDrive/Sentences/")

In [None]:
# Load the integer labels from the Drive "Labels" folder.
# NOTE(review): presumably one label per sentence, in the same file/line order
# as the Sentences folder -- confirm against the dataset layout.
all_labels = get_labels("/content/drive/MyDrive/Labels/")

In [None]:
# Recode the -1 class as 0; every other label value is kept unchanged.
all_labels = [label if label != -1 else 0 for label in all_labels]

In [None]:
# Single-column DataFrame view of the labels.
# NOTE(review): not referenced by any later cell visible here -- confirm it is still needed.
labels_df = pd.DataFrame(all_labels, columns=['label'])

In [None]:
# A fixed seed makes the split reproducible across kernel restarts;
# stratifying on the labels keeps the class ratio the same in both splits.
train_sen, valid_sen, train_label, valid_label = train_test_split(
    all_sentences, all_labels, random_state=42, stratify=all_labels
)

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 30.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 6.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 64.3 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Unins

In [None]:
from transformers import BertTokenizer

In [None]:
# Cased WordPiece tokenizer matching the 'bert-base-cased' checkpoint used by the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-tokenised sentences with integer labels.

    Every sentence is tokenised once, in the constructor, with the
    module-level ``tokenizer`` (padded/truncated to 512 tokens, returned as
    PyTorch tensors), so ``__getitem__`` is a plain list lookup.

    Args:
        sentences: Sequence of raw sentence strings.
        labels: Sequence of integer class labels, same length and order as
            ``sentences``.

    Raises:
        ValueError: If ``sentences`` and ``labels`` differ in length.
    """

    def __init__(self, sentences, labels):
        if len(sentences) != len(labels):
            raise ValueError(
                f"sentences and labels must have the same length, "
                f"got {len(sentences)} and {len(labels)}"
            )

        self.labels = labels
        # Tokenise only the sentences handed to this instance so that
        # texts[i] corresponds to labels[i].
        self.texts = [
            tokenizer(
                text,
                padding='max_length',
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            for text in tqdm_notebook(sentences)
        ]

    def classes(self):
        """Return the label sequence this dataset was built with."""
        return self.labels

    def __len__(self):
        """Number of (sentence, label) pairs."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) at ``idx`` as a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output for the sentence at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(tokenised_sentence, label)`` for position ``idx``."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [None]:
# Build the training dataset from the training sentences and labels (tokenisation runs in the constructor).
train_ds = Dataset(train_sen, train_label)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/9414 [00:00<?, ?it/s]

In [None]:
# Build the validation dataset from the held-out sentences and labels (tokenisation runs in the constructor).
valid_ds = Dataset(valid_sen, valid_label)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/9414 [00:00<?, ?it/s]

In [None]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    """Pre-trained BERT encoder with a dropout + linear classification head.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
        num_classes: Number of output classes, i.e. the width of the logit
            vector. Defaults to 2 (binary classification).
    """

    def __init__(self, dropout=0.5, num_classes=2):

        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's config instead of hard-coding 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_id, mask):
        """Compute classification logits for a batch of token sequences.

        Args:
            input_id: Token-id tensor passed to BERT as ``input_ids``.
            mask: Attention-mask tensor passed to BERT as ``attention_mask``.

        Returns:
            Tensor of raw (unnormalised) logits with ``num_classes`` values
            per input sequence.
        """
        # With return_dict=False BERT returns a tuple; the second element is
        # the pooled output.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)

        # No activation on the output: nn.CrossEntropyLoss applies log-softmax
        # itself and expects unbounded logits. A ReLU here would clamp every
        # negative logit to zero and block its gradient.
        return self.linear(dropout_output)

In [None]:
# True when a CUDA-capable GPU is visible to PyTorch.
use_cuda = torch.cuda.is_available()

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if use_cuda else "cpu")

In [None]:
# Instantiate the classifier (loads the pre-trained BERT weights) and move it to the chosen device.
model = BertClassifier().to(device)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Both loaders yield batches of 16. Training batches are reshuffled every
# epoch; validation batches keep the dataset order.
train_dl = torch.utils.data.DataLoader(dataset=train_ds, shuffle=True, batch_size=16)
valid_dl = torch.utils.data.DataLoader(dataset=valid_ds, shuffle=False, batch_size=16)

In [None]:
# Cross-entropy over the classifier's output scores (default 'mean' reduction over the batch).
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters (BERT encoder and linear head) with a learning rate of 1e-6.
optimizer = Adam(model.parameters(), lr= 1e-6)

In [None]:
def _run_epoch(loader, training):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    Uses the module-level ``model``, ``criterion``, ``optimizer`` and
    ``device``. When ``training`` is true the model is put in train mode and
    the optimizer is stepped after every batch; otherwise the model is put in
    eval mode (dropout disabled) and no gradients are computed.

    Both returned values are averaged over the number of samples actually
    seen; ``(0.0, 0.0)`` is returned for an empty loader.
    """
    model.train(training)

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.set_grad_enabled(training):
        for inputs, targets in tqdm_notebook(loader):
            targets = targets.to(device)
            # The tokenizer output carries a singleton dimension at position 1
            # once batched; drop it from both tensors.
            masks = inputs['attention_mask'].squeeze(1).to(device)
            input_ids = inputs['input_ids'].squeeze(1).to(device)
            outputs = model(input_ids, masks)

            loss = criterion(outputs, targets)
            batch_size = targets.size(0)
            # ``loss`` is assumed to be the batch mean (the default reduction),
            # so weight it by the batch size; dividing by the sample count at
            # the end then gives a true per-sample mean.
            loss_sum += loss.item() * batch_size
            n_correct += (outputs.argmax(dim=1) == targets).sum().item()
            n_seen += batch_size

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

    if n_seen == 0:
        return 0.0, 0.0
    return loss_sum / n_seen, n_correct / n_seen


def train_loop(n_epochs):
    """Train the global ``model`` for ``n_epochs``, validating after each one.

    Relies on the module-level ``model``, ``train_dl``, ``valid_dl``,
    ``criterion``, ``optimizer`` and ``device``. One summary line is printed
    per epoch.

    Args:
        n_epochs: Number of passes over ``train_dl``.

    Returns:
        tuple[list, list, list, list]: ``(tr_loss, tr_acc, va_loss, va_acc)``,
        each with one per-sample average per epoch.
    """
    tr_loss = []
    tr_acc = []
    va_loss = []
    va_acc = []

    for epoch in tqdm_notebook(range(n_epochs)):
        train_loss, train_acc = _run_epoch(train_dl, training=True)
        valid_loss, valid_acc = _run_epoch(valid_dl, training=False)

        print(
            f'Epoch : {epoch+1} , Train_loss : {train_loss}, Train_acc : {train_acc} ,Valid_loss : {valid_loss}, Valid_acc : {valid_acc}'
        )

        tr_loss.append(train_loss)
        tr_acc.append(train_acc)
        va_loss.append(valid_loss)
        va_acc.append(valid_acc)

    return tr_loss, tr_acc, va_loss, va_acc

In [None]:
# Train for two epochs; the returned per-epoch metric lists are displayed as the cell output.
train_loop(2)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


  0%|          | 0/442 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/148 [00:00<?, ?it/s]

Epoch : 1 , Train_loss : 0.02191351954389902, Train_acc : 0.8923512747875354 ,Valid_loss : 0.02301172720950442, Valid_acc : 0.8840271877655055


  0%|          | 0/442 [00:00<?, ?it/s]

  0%|          | 0/148 [00:00<?, ?it/s]

Epoch : 2 , Train_loss : 0.021536168496179852, Train_acc : 0.8924929178470254 ,Valid_loss : 0.022853253983727125, Valid_acc : 0.8840271877655055


([0.02191351954389902, 0.021536168496179852],
 [0.8923512747875354, 0.8924929178470254],
 [0.02301172720950442, 0.022853253983727125],
 [0.8840271877655055, 0.8840271877655055])