In [2]:
# Mount Google Drive at /content/drive/ so that the CSV paths read further
# down (which live under /content/drive/MyDrive/...) can be resolved.
# This import only exists inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
! pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m98.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2


In [4]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW


In [5]:
# Read the train and test CSV splits from the mounted Drive folder.
TRAIN_CSV_PATH = "/content/drive/MyDrive/NEEWWWWW/DATA_NEW/SST2/500/train_400.csv"
TEST_CSV_PATH = "/content/drive/MyDrive/NEEWWWWW/DATA_NEW/SST2/500/test_100.csv"
df_train, df_test = map(pd.read_csv, (TRAIN_CSV_PATH, TEST_CSV_PATH))

In [6]:
# Define a custom dataset class
class SentimentDataset(Dataset):
    """Torch dataset that tokenizes one (sentence, label) row per item.

    Args:
        data: DataFrame providing a "sentence" column and a "label" column.
            NOTE(review): other cells of this notebook read a "text" column
            from the same CSVs - confirm which column name the files use.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Length every encoded sequence is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode the row at *position* ``index``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors of
            length ``max_len`` and a scalar ``labels`` tensor (``torch.long``).
        """
        # A DataLoader hands out positions 0..len-1, so the lookup must be
        # positional (.iloc). Label-based .loc breaks on any frame whose index
        # is not exactly 0..n-1 (e.g. after filtering, sampling or a split).
        text = str(self.data["sentence"].iloc[index])
        label = int(self.data["label"].iloc[index])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )

        # encode_plus returns (1, max_len) tensors; drop the batch dimension so
        # the DataLoader's default collate can stack items into (batch, max_len).
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long)
        }

In [7]:
class BertSentimentClassifier(torch.nn.Module):
    """BERT encoder followed by dropout and a 2-way linear classification head.

    Args:
        freeze_bert: When True, every parameter of the BERT encoder has
            ``requires_grad`` disabled so only the linear head is trained.
    """

    def __init__(self, freeze_bert=False):
        super().__init__()
        self.bert_layer = BertModel.from_pretrained("bert-base-uncased")

        if freeze_bert:
            for param in self.bert_layer.parameters():
                param.requires_grad = False

        self.dropout = torch.nn.Dropout(0.2)
        # 768 is the hidden size of bert-base; 2 output classes.
        self.classifier = torch.nn.Linear(768, 2)
        # Kept for backward compatibility; forward() returns raw logits and
        # does not apply this layer.
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits of shape (batch, 2).

        The inputs are moved to whatever device this module's weights live on,
        so the method no longer depends on a notebook-level ``device`` global
        (which is never defined and made the original raise ``NameError``).
        """
        device = self.classifier.weight.device
        bert_outputs = self.bert_layer(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
        )
        # Index 1 of the BertModel output is the pooled [CLS] representation.
        pooled_output = bert_outputs[1]
        dropout_output = self.dropout(pooled_output)
        logits = self.classifier(dropout_output)

        return logits

    def count_parameters(self):
        """Print the parameter count of each BERT encoder layer and their sum.

        Only ``self.bert_layer.encoder.layer`` is traversed, so the printed
        "Total parameters" excludes the embeddings, the pooler and the
        classification head.
        """
        total_params = 0
        for layer_idx, layer in enumerate(self.bert_layer.encoder.layer):
            layer_params = sum(p.numel() for p in layer.parameters())
            print(f"Layer {layer_idx+1}: {layer_params}")
            total_params += layer_params

        print(f"Total parameters: {total_params}")

In [8]:
# Build the classifier with a trainable (unfrozen) BERT encoder and print the
# parameter count of every encoder layer followed by their sum.
model = BertSentimentClassifier(freeze_bert=False)
model.count_parameters()

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Layer 1: 7087872
Layer 2: 7087872
Layer 3: 7087872
Layer 4: 7087872
Layer 5: 7087872
Layer 6: 7087872
Layer 7: 7087872
Layer 8: 7087872
Layer 9: 7087872
Layer 10: 7087872
Layer 11: 7087872
Layer 12: 7087872
Total parameters: 85054464


In [9]:
# Re-create the classifier and record how many parameters each encoder layer holds.
model = BertSentimentClassifier(freeze_bert=False)
layer_params = [
    sum(weight.numel() for weight in encoder_layer.parameters())
    for encoder_layer in model.bert_layer.encoder.layer
]

# Report the summed counts for consecutive groups of 12 layers.
for start_idx in range(0, len(layer_params), 12):
    end_idx = min(start_idx + 12, len(layer_params))
    total_word_repr = sum(layer_params[start_idx:end_idx])
    print(f"Layers {start_idx+1}-{end_idx}: {total_word_repr}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Layers 1-12: 85054464


In [10]:
# Load the bert-base-uncased tokenizer and the bare BERT encoder.
# NOTE: this rebinds `model`, which the cells above bound to a
# BertSentimentClassifier, to a plain BertModel. output_hidden_states=True is
# requested so that the forward pass also returns the per-layer hidden states.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Define a function to extract the contextualized word representations for every word in every layer
def get_contextualized_word_representations(input_text):
    """Return the hidden state of every token at every encoder layer.

    Uses the notebook-level ``tokenizer`` and ``model`` (a ``BertModel``).

    Args:
        input_text: Sentence to encode. It is converted with ``str()`` first,
            mirroring ``SentimentDataset``.

    Returns:
        A list with one entry per encoder layer (the embedding output is
        skipped). Each entry is a list holding one 1-D NumPy array per
        wordpiece token, including the leading [CLS] and trailing [SEP].
    """
    # truncation=True caps the sequence at the tokenizer's model_max_length so
    # over-long sentences cannot overflow BERT's position embeddings.
    input_ids = torch.tensor(
        tokenizer.encode(str(input_text), add_special_tokens=True, truncation=True)
    ).unsqueeze(0)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(input_ids, output_hidden_states=True)

    # `last_hidden_state` is a single (batch, seq, hidden) tensor, so looping
    # over it walks the batch axis, not the layers. `hidden_states` is the
    # per-layer tuple; element 0 is the embedding output, the rest are the
    # encoder layers.
    encoder_layers = outputs.hidden_states[1:]

    # Iterate over the real wordpiece tokens (row 0 is the only batch item)
    # instead of guessing the count from a whitespace split, which is wrong
    # whenever a word is split into several wordpieces.
    return [
        [token_vector.detach().cpu().numpy() for token_vector in layer_output[0]]
        for layer_output in encoder_layers
    ]

# Collect, for each training sentence, the contextualized representations of
# every word in every layer.
train_contextualized_word_representations = [
    get_contextualized_word_representations(sentence)
    for sentence in df_train["text"]
]

# Same extraction for each sentence of the test dataset.
test_contextualized_word_representations = [
    get_contextualized_word_representations(sentence)
    for sentence in df_test["text"]
]

In [None]:
# Define a function to extract the contextualized word representations for every word in every layer
def get_contextualized_word_representations(input_text):
    """Return the hidden state of every token at every encoder layer.

    Uses the notebook-level ``tokenizer`` and ``model`` (a ``BertModel``).

    Args:
        input_text: Sentence to encode. It is converted with ``str()`` first,
            mirroring ``SentimentDataset``.

    Returns:
        A list with one entry per encoder layer (the embedding output is
        skipped). Each entry is a list holding one 1-D NumPy array per
        wordpiece token, including the leading [CLS] and trailing [SEP].
    """
    # truncation=True caps the sequence at the tokenizer's model_max_length so
    # over-long sentences cannot overflow BERT's position embeddings.
    input_ids = torch.tensor(
        tokenizer.encode(str(input_text), add_special_tokens=True, truncation=True)
    ).unsqueeze(0)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(input_ids, output_hidden_states=True)

    # `last_hidden_state` is a single (batch, seq, hidden) tensor, so looping
    # over it walks the batch axis, not the layers. `hidden_states` is the
    # per-layer tuple; element 0 is the embedding output, the rest are the
    # encoder layers.
    encoder_layers = outputs.hidden_states[1:]

    # Iterate over the real wordpiece tokens (row 0 is the only batch item)
    # instead of guessing the count from a whitespace split, which is wrong
    # whenever a word is split into several wordpieces.
    return [
        [token_vector.detach().cpu().numpy() for token_vector in layer_output[0]]
        for layer_output in encoder_layers
    ]

# Collect, for each training sentence, the contextualized representations of
# every word in every layer.
train_contextualized_word_representations = [
    get_contextualized_word_representations(sentence)
    for sentence in df_train["text"]
]

# Same extraction for each sentence of the test dataset.
test_contextualized_word_representations = [
    get_contextualized_word_representations(sentence)
    for sentence in df_test["text"]
]

def _print_token_representations(split_name, sentences, all_representations):
    """Print every token's vector, layer by layer, for each sentence.

    Args:
        split_name: Label used in the sentence header ("Training" or "Test").
        sentences: pandas Series of the raw sentences, in the same order the
            representations were extracted in.
        all_representations: One entry per sentence; each entry is a list of
            layers, each layer a list of per-token vectors.
    """
    for i, sentence_representations in enumerate(all_representations):
        # Positional lookup: `i` is an enumerate position, not an index label.
        sentence = sentences.iloc[i]
        print(f"{split_name} Sentence {i + 1}: {sentence}")

        # Tokenize once per sentence rather than once per token per layer.
        tokens = tokenizer.convert_ids_to_tokens(
            tokenizer.encode(sentence, add_special_tokens=True)
        )

        for layer_index, layer in enumerate(sentence_representations):
            print(f"Layer {layer_index + 1}:")
            for token_index, token_representation in enumerate(layer):
                print(f"Token {token_index}: {tokens[token_index]}")
                print(f"Contextualized Word Representation: {token_representation}")
                print()


# Print the extracted contextualized word representations for every word in every layer for each sentence in the training dataset
_print_token_representations(
    "Training", df_train["text"], train_contextualized_word_representations
)

# Print the extracted contextualized word representations for every word in every layer for each sentence in the test dataset
_print_token_representations(
    "Test", df_test["text"], test_contextualized_word_representations
)