In [1]:
!pip install transformers
!pip install shap
!pip install transformers_interpret
import transformers_interpret
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import transformers
import torch
from torch.utils.data import Dataset, DataLoader

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in in

In [2]:
df = pd.read_csv("/content/drive/MyDrive/pro/8701/combined_14510_xlnet.csv")
title_only = df[(df['title_polyglot_detect'] == 'en') & (df['title_lang_detect'] == 'en') & (df['title_langid_detect'] == 'en') & (df['title_xl_detect'] == 'en')][['question_title', 'class_index']]

In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

# specify GPU
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
label2id = {
    1: 0,
    5: 1,
    6: 2,
    10: 3
}

id2label = {
    0: 1,
    1: 5,
    2: 6,
    3: 10
}
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [label2id[label] for label in df['class_index']]
        self.texts = [tokenizer(text, add_special_tokens = True,
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['question_title']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
np.random.seed(112)
df_train, df_val, df_test = np.split(title_only.sample(frac=1, random_state=42), 
                                     [int(.8*len(title_only)), int(.9*len(title_only))])

print(len(df_train),len(df_val), len(df_test)) #dataframes

13100 1637 1638


In [6]:
class BertClassifier(nn.Module):

    def __init__(self):
        super(BertClassifier, self).__init__()
        self.bert = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 4, label2id=label2id, id2label=id2label)
        # self.linear = nn.Linear(768, 4)
 
    def forward(self, input_id, mask):
        pooled_output = self.bert(input_ids= input_id, attention_mask=mask)
        return pooled_output

    def save_model(self, path, tokenizer): #'/content/drive/MyDrive/pro/8701/roberta_model_content/'
        self.bert.save_pretrained(path)
        tokenizer.save_pretrained(path)

In [7]:
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=8, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=8)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    train_acc = []
    valid_acc = []

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output.logits, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.logits.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output.logits, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.logits.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
            
            train_acc.append(total_acc_train / len(train_data))
            valid_acc.append(total_acc_val / len(val_data))
            
            # path = '/content/drive/MyDrive/pro/8701/bert_model/saved_weights_' + str(epoch_num) +'.pt'
            # print(path)
            # torch.save(model.state_dict(), path)
            
    return train_acc, valid_acc           
                  
EPOCHS = 4
model = BertClassifier()
LR = 1e-5

train(model, df_train, df_val, LR, EPOCHS)
# train(model, df_train.iloc[:10], df_val.iloc[:8], LR, EPOCHS)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epochs: 1 | Train Loss:  0.056                 | Train Accuracy:  0.842                 | Val Loss:  0.041                 | Val Accuracy:  0.892


100%|██████████| 1638/1638 [19:10<00:00,  1.42it/s]


Epochs: 2 | Train Loss:  0.025                 | Train Accuracy:  0.934                 | Val Loss:  0.042                 | Val Accuracy:  0.896


100%|██████████| 1638/1638 [19:10<00:00,  1.42it/s]


Epochs: 3 | Train Loss:  0.011                 | Train Accuracy:  0.976                 | Val Loss:  0.050                 | Val Accuracy:  0.889


  3%|▎         | 56/1638 [00:40<18:54,  1.39it/s]


KeyboardInterrupt: ignored

In [8]:
model.save_model(path = '/content/drive/MyDrive/pro/8701/bert_model/', tokenizer=tokenizer)

In [None]:
model = BertClassifier()
model = model.to(device)
model

In [37]:
from torch.optim import Adam
from tqdm import tqdm

train, val = Dataset(df_train[:2]), Dataset(df_val[:2])
train_dataloader = torch.utils.data.DataLoader(train, batch_size=8, shuffle=True)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr= 0.05)

total_acc_train = 0
total_loss_train = 0
for train_input, train_label in tqdm(train_dataloader):
  train_label = train_label.to(device)
  mask = train_input['attention_mask'].to(device)
  input_id = train_input['input_ids'].squeeze(1).to(device)
  # print(mask.shape)
  # print(input_id.shape)
  output = model(input_id, mask)
  print(" ")
  print(output)
  print(type(output[0]))
  # print(output.keys())
  print(output)
  output = output.logits
  batch_loss = criterion(output, train_label.long())
  total_loss_train += batch_loss.item()
  acc = (output.argmax(dim=1) == train_label).sum().item()
  total_acc_train += acc
  model.zero_grad()
  batch_loss.backward()
  optimizer.step()

100%|██████████| 1/1 [00:00<00:00,  5.11it/s]

 
SequenceClassifierOutput(loss=None, logits=tensor([[ 0.1575,  0.6411,  0.0626, -0.2961],
        [ 0.1140,  0.7365,  0.3078, -0.4516]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
<class 'torch.Tensor'>
SequenceClassifierOutput(loss=None, logits=tensor([[ 0.1575,  0.6411,  0.0626, -0.2961],
        [ 0.1140,  0.7365,  0.3078, -0.4516]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)





In [11]:
def evaluate(model, test_data):

    test = Dataset(test_data)
    pred_label = []
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=8)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)
              pred = output.logits.argmax(dim=1)
              # print(pred)
              # print(test_label)
              # pred_label.append(pred)
              acc = (output.logits.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    return pred_label


#load weights of best model
model2 = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/pro/8701/bert_model')
tokenizer2 = AutoTokenizer.from_pretrained('/content/drive/MyDrive/pro/8701/bert_model')

evaluate(model2, df_test)
# evaluate(model_test, df_train.iloc[:10])

Test Accuracy:  0.871


[]

In [21]:
# model2 = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/pro/8701/bert_model')
# tokenizer2 = AutoTokenizer.from_pretrained('/content/drive/MyDrive/pro/8701/bert_model')
from transformers_interpret import SequenceClassificationExplainer
multiclass_explainer = SequenceClassificationExplainer(model=model2, tokenizer=tokenizer2)
text=df['question_title'].iloc[10001]
print(text)
word_attributions = multiclass_explainer(text)
print(multiclass_explainer.predicted_class_name)
num_eng = {
    1: 'culture',
    5: 'computers',
    6: 'sports',
    10: 'political'
}
num_eng[multiclass_explainer.predicted_class_name]

who invented the internet?
5


'computers'

In [22]:
multiclass_explainer.visualize()

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,5 (1.00),5.0,0.7,[CLS] who invented the internet ? [SEP]
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,5 (1.00),5.0,0.7,[CLS] who invented the internet ? [SEP]
,,,,
