In [1]:
from transformers import (
    BertModel, 
    AutoConfig, 
    AutoTokenizer, 
    Trainer,
    TrainingArguments
    )
import torch.nn as nn
import datasets
import csv

In [2]:
import torch

# Confirm a CUDA device is visible; later cells move the model to 'cuda'.
torch.cuda.is_available()

True

In [3]:

# Single checkpoint identifier shared by the config, tokenizer and encoder.
model_name = "bert-base-uncased"

# NOTE: this config object is loaded on its own and is not handed to
# `BertModel.from_pretrained` below, so `problem_type` is only recorded here.
config = AutoConfig.from_pretrained(model_name)
config.problem_type = 'regression'

# Tokenizer and pretrained encoder come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [39]:

# Width of the final linear layer of the head: one scalar prediction per example.
output_size = 1  


# Combine BERT and the linear layer
class BertWithLinear(nn.Module):
    """BERT encoder followed by a small MLP regression head.

    The head consumes BERT's pooled output and narrows it
    hidden_size -> 512 -> 128 -> 64 -> 16 -> out_features with ReLU in between.

    Args:
        bert: Encoder module whose output exposes ``pooler_output``. Defaults to
            the notebook-level ``bert_model``.
        hidden_size: Width of the pooled output. Defaults to
            ``config.hidden_size`` from the notebook-level ``config``.
        out_features: Number of regression outputs. Defaults to the
            notebook-level ``output_size``.
        device: Device the encoder and head are moved to. Defaults to ``'cuda'``
            so that ``BertWithLinear()`` behaves as before.
    """

    def __init__(self, bert=None, hidden_size=None, out_features=None, device='cuda'):
        super().__init__()
        # Resolve defaults lazily so the notebook globals are only needed when
        # the caller does not supply explicit values.
        if bert is None:
            bert = bert_model
        if hidden_size is None:
            hidden_size = config.hidden_size
        if out_features is None:
            out_features = output_size

        self.bert = bert.to(device)
        self.ft = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 16),
            nn.ReLU(),
            nn.Linear(16, out_features),
        ).to(device)
        self.loss_fn = nn.MSELoss()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        """Run the encoder and the regression head.

        Returns:
            The predictions tensor of shape ``(batch, out_features)`` when
            ``labels`` is None. When ``labels`` is given, a tuple
            ``(loss, predictions)`` with the mean-squared-error loss first,
            which is the layout ``Trainer`` reads the loss from.
        """
        output = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        # Use pooled output for classification/regression
        logits = self.ft(output.pooler_output)
        if labels is None:
            return logits

        # Flatten both sides so (batch, 1) predictions line up with (batch,)
        # targets, and match dtypes so integer targets do not break MSELoss.
        loss = self.loss_fn(logits.reshape(-1), labels.reshape(-1).to(logits.dtype))
        return loss, logits

# Build the combined encoder + regression head and place it on the GPU.
model = BertWithLinear().to('cuda')

In [40]:
# Freeze only BERT's embedding block. The remaining BERT weights and the
# regression head keep requires_grad=True and are still updated by training.
for embedding_weight in model.bert.embeddings.parameters():
    embedding_weight.requires_grad = False

In [43]:
# Load dataset
# The first column of BERT_X.csv supplies the text and the first column of
# BERT_y.csv the numeric label; rows are paired by position.
data = {'text': [], 'label': []}

# Context managers close both CSV handles once the rows are consumed, even if
# parsing a row raises.
with open("BERT_X.csv", "r") as feats_fp, open("BERT_y.csv", "r") as labels_fp:
    feats = csv.reader(feats_fp)
    labels = csv.reader(labels_fp)

    # skip header (the default keeps an empty file from raising StopIteration)
    next(feats, None)
    next(labels, None)

    for row in feats:
        # Flatten embedded newlines so each example is a single line of text.
        data['text'].append(row[0].strip().replace("\n", " "))
    for row in labels:
        data['label'].append(float(row[0].strip().replace("\n", "")))

print(len(data['text']), len(data['label']))
# Every text needs exactly one label, otherwise the pairing by position is wrong.
assert len(data['text']) == len(data['label']), \
    "BERT_X.csv and BERT_y.csv contain different numbers of data rows"
dataset = datasets.Dataset.from_dict(data)


26990 26990


In [7]:
# Display the dataset summary (column names and row count).
dataset


Dataset({
    features: ['text', 'label'],
    num_rows: 26990
})

In [8]:
# Spot-check the first example: its text, then its label.
first_example = dataset[0]
print(first_example['text'])
print(first_example['label'])

Overview  HearingLife is a national hearing care company and part of the Demant Group, a global leader in hearing healthcare built on a heritage of care, health, and innovation since 1904. HearingLife operates more than 600 hearing care centers across 42 states. We follow a scientific, results-oriented approach to hearing healthcare that is provided by highly skilled and caring professionals. Our vision is to help more people hear better through life-changing hearing health delivered by the best personalized care. This Team Member must uphold the HearingLife Core Values:   We create trust  We are team players  We apply a can-do attitude  We create innovative solutions   Responsibilities  You will help more people hear better by providing clinical expertise to diagnose and treat hearing loss while ensuring a positive patient experience. The Hearing Care Provider acts in accordance with required industry and state professional licensing standards and local practice scope and is responsib

In [9]:
# Largest and smallest label values (salaries) in the dataset.
label_values = data['label']
max_salary, min_salary = max(label_values), min(label_values)
print(max_salary, min_salary)


285000.0 22000.0


In [15]:
# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained(model_name)
def preprocess(examples):
    """Tokenize a batch of examples to one fixed sequence length.

    Args:
        examples: Batch from ``Dataset.map(batched=True)``; its ``'text'`` entry
            holds the strings to tokenize.

    Returns:
        The tokenizer output for the batch, truncated and padded to the
        tokenizer's maximum length.
    """
    # `padding=True` pads only to the longest sequence in the batch the tokenizer
    # is called with. `map(batched=True)` calls this function once per chunk of
    # rows, so chunks could end up with different lengths and fail to stack when
    # a shuffling DataLoader mixes them. Padding to a fixed length avoids that.
    # NOTE(review): passing a smaller explicit `max_length` here would also cut
    # GPU memory use if the texts allow it - confirm against the data.
    return tokenizer(examples['text'], truncation=True, padding='max_length')

tokenized_dataset = dataset.map(preprocess, batched=True)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /mnt/home/bhatta70/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncas

  0%|          | 0/27 [00:00<?, ?ba/s]

In [28]:
# Trainer settings, gathered in one mapping so they are easy to scan and tweak.
training_kwargs = {
    'output_dir': './results',           # where Trainer writes its outputs
    'num_train_epochs': 3,
    'per_device_train_batch_size': 16,   # per device, not the total batch size
    'learning_rate': 5e-5,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_dir': './logs',
}
training_args = TrainingArguments(**training_kwargs)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [31]:

# Training and evaluation both use the full tokenized dataset here, so any
# evaluation run on this Trainer is measured on data the model was trained on.
train_split = eval_split = tokenized_dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertWithLinear.forward` and have been ignored: text. If text are not expected by `BertWithLinear.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 26990
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 633


RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/mnt/home/bhatta70/.local/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/mnt/home/bhatta70/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "<ipython-input-26-96aa96028594>", line 27, in forward
    output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds)
  File "/mnt/home/bhatta70/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/opt/software/Python/3.6.4-foss-2018a/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py", line 1006, in forward
    return_dict=return_dict,
  File "/mnt/home/bhatta70/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/opt/software/Python/3.6.4-foss-2018a/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py", line 592, in forward
    output_attentions,
  File "/mnt/home/bhatta70/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/opt/software/Python/3.6.4-foss-2018a/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py", line 477, in forward
    past_key_value=self_attn_past_key_value,
  File "/mnt/home/bhatta70/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/opt/software/Python/3.6.4-foss-2018a/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py", line 409, in forward
    output_attentions,
  File "/mnt/home/bhatta70/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/opt/software/Python/3.6.4-foss-2018a/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py", line 306, in forward
    attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
RuntimeError: CUDA out of memory. Tried to allocate 192.00 MiB (GPU 0; 11.17 GiB total capacity; 10.52 GiB already allocated; 190.88 MiB free; 10.56 GiB reserved in total by PyTorch)


In [15]:
# Display the tokenized dataset summary: the tokenizer's columns sit alongside the original text and label.
tokenized_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 26990
})

In [37]:
from torch.utils.data import Dataset

class RegressionDataset(Dataset):
    """Map-style dataset pairing tokenized inputs with a float regression target.

    Args:
        input_ids: Sequence of per-example token-id sequences.
        attention_mask: Sequence of per-example attention masks, aligned with
            ``input_ids``.
        labels: Sequence of numeric targets, one per example.

    Raises:
        ValueError: If the three sequences do not all have the same length.
    """

    def __init__(self, input_ids, attention_mask, labels):
        # Fail early on misaligned inputs instead of raising an IndexError (or
        # silently dropping examples) part-way through an epoch.
        if not (len(input_ids) == len(attention_mask) == len(labels)):
            raise ValueError(
                "input_ids, attention_mask and labels must have the same length, got "
                "{}, {} and {}".format(len(input_ids), len(attention_mask), len(labels))
            )
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors."""
        # Explicit dtypes: token ids and masks are integers, and the target is
        # always float so integer labels still match float predictions in the loss.
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float),
        }

    def __len__(self):
        """Number of examples (one label per example)."""
        return len(self.labels)

    
    

In [44]:
# Re-display the raw (untokenized) dataset summary.
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 26990
})

In [45]:
# Token ids and masks come from the tokenized dataset; the targets are read
# from the original dataset's 'label' column.
input_ids, attention_mask = (
    tokenized_dataset[column] for column in ('input_ids', 'attention_mask')
)
labels = dataset['label']

# Wrap the columns in a torch dataset and serve shuffled mini-batches of 16.
reg_dataset = RegressionDataset(
    input_ids=input_ids,
    attention_mask=attention_mask,
    labels=labels,
)
dataloader = torch.utils.data.DataLoader(dataset=reg_dataset, batch_size=16, shuffle=True)


In [46]:
loss_fn = nn.MSELoss()  # Mean Squared Error is common for regression
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
num_epochs = 3

# Send batches to whichever device the model already lives on instead of
# hard-coding 'cuda'.
device = next(model.parameters()).device

# `from_pretrained` returns the encoder in evaluation mode (dropout disabled),
# so switch the whole model to training mode before fine-tuning.
model.train()

# NOTE(review): the recorded run of this cell stopped with CUDA out of memory at
# batch_size=16 on an ~11 GiB GPU. A smaller batch size or shorter sequences
# would likely be needed - confirm on the target hardware.
for epoch in range(num_epochs):
    running_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # MSELoss needs float targets matching the model output's dtype.
        labels = batch['labels'].to(device).float()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.squeeze(1), labels) # Ensure outputs are single-dimensional
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Report progress once per epoch; max() guards against an empty dataloader.
    print("epoch {}/{} - mean train loss: {:.4f}".format(
        epoch + 1, num_epochs, running_loss / max(num_batches, 1)))


RuntimeError: CUDA out of memory. Tried to allocate 192.00 MiB (GPU 0; 11.17 GiB total capacity; 10.50 GiB already allocated; 896.00 KiB free; 10.73 GiB reserved in total by PyTorch)