In [1]:
import csv

import accelerate
import datasets
import torch.nn as nn
from transformers import (
    BertModel, 
    AutoConfig, 
    AutoTokenizer, 
    Trainer,
    TrainingArguments
    )
from transformers.modeling_outputs import SequenceClassifierOutput

In [2]:
import torch

# Ask PyTorch whether a CUDA device is usable; as the last expression in the
# cell, the boolean result is shown as the cell output.
torch.cuda.is_available()

True

In [3]:

# Single checkpoint name shared by the config, the tokenizer and the encoder.
model_name = "bert-base-uncased"

# Load the three artefacts from the same checkpoint, in the order
# config -> tokenizer -> encoder weights.
config, tokenizer, bert_model = (
    loader.from_pretrained(model_name)
    for loader in (AutoConfig, AutoTokenizer, BertModel)
)


In [4]:

# Width of the regression head's final layer: one scalar per input sequence.
output_size = 1


class BertWithLinear(nn.Module):
    """BERT encoder followed by an MLP head that regresses `output_size` values.

    The encoder is the module-level `bert_model`; the head maps BERT's pooled
    output (`config.hidden_size` features) down to `output_size` values through
    a stack of Linear + ReLU layers.
    """

    def __init__(self):
        super().__init__()
        self.bert = bert_model
        self.ft = nn.Sequential(
            nn.Linear(config.hidden_size, 512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 16),
            nn.ReLU(),
            nn.Linear(16, output_size),
        )

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        """Run the encoder and the head; compute an MSE loss when labels are given.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids, head_mask,
                inputs_embeds: forwarded unchanged to the BERT encoder.
            labels: optional regression targets with one value per head output
                (assumed numeric, e.g. shape (batch,) when output_size == 1).

        Returns:
            SequenceClassifierOutput with `logits` (the head output) and, only
            when `labels` is provided, `loss` (mean squared error).
        """
        output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds)
        # Use pooled output for classification/regression
        pooled_output = output.pooler_output
        logits = self.ft(pooled_output)

        # Trainer takes the loss from the model output when labels are passed,
        # so it has to be computed here rather than returning bare predictions.
        loss = None
        if labels is not None:
            # Align dtype and shape with the predictions before comparing.
            targets = labels.to(logits.dtype).view_as(logits)
            # NOTE(review): targets are used as-is; confirm whether the labels
            # should be scaled/normalised before training.
            loss = nn.functional.mse_loss(logits, targets)

        return SequenceClassifierOutput(loss=loss, logits=logits)

model = BertWithLinear()

In [5]:
# Freeze only BERT's embedding sub-module; every other parameter of the model
# is left untouched by this cell.
embedding_params = list(model.bert.embeddings.parameters())
for weight in embedding_params:
    weight.requires_grad = False

In [6]:
# Load dataset
# Texts and labels live in two parallel CSV files; only the first column of
# each row is used. The `with` block guarantees both files are closed even if
# parsing fails part-way through.
with open("BERT_X.csv", "r") as feats_fp, open("BERT_y.csv", "r") as labels_fp:
    feats = csv.reader(feats_fp)
    labels = csv.reader(labels_fp)

    # skip header (the None default avoids StopIteration on an empty file)
    next(feats, None)
    next(labels, None)

    data = {
        # Flatten embedded newlines so each example is a single line of text.
        'text': [row[0].strip().replace("\n", " ") for row in feats],
        'label': [float(row[0].strip().replace("\n", "")) for row in labels],
    }

print(len(data['text']), len(data['label']))
# The two files are matched purely by row position, so their lengths must agree.
assert len(data['text']) == len(data['label']), "BERT_X.csv and BERT_y.csv have different row counts"
dataset = datasets.Dataset.from_dict(data)


26990 26990


In [7]:
# Sanity check: show the text and the label of the first example.
first_example = dataset[0]
for field in ('text', 'label'):
    print(first_example[field])

Overview  HearingLife is a national hearing care company and part of the Demant Group, a global leader in hearing healthcare built on a heritage of care, health, and innovation since 1904. HearingLife operates more than 600 hearing care centers across 42 states. We follow a scientific, results-oriented approach to hearing healthcare that is provided by highly skilled and caring professionals. Our vision is to help more people hear better through life-changing hearing health delivered by the best personalized care. This Team Member must uphold the HearingLife Core Values:   We create trust  We are team players  We apply a can-do attitude  We create innovative solutions   Responsibilities  You will help more people hear better by providing clinical expertise to diagnose and treat hearing loss while ensuring a positive patient experience. The Hearing Care Provider acts in accordance with required industry and state professional licensing standards and local practice scope and is responsib

In [8]:
# Range of the label values (largest first, then smallest).
salary_values = data['label']
max_salary, min_salary = max(salary_values), min(salary_values)
print(max_salary, min_salary)


285000.0 22000.0


In [9]:
# Tokenize the dataset
# The tokenizer loaded together with the model is reused here; loading it a
# second time is unnecessary.
def preprocess(examples):
    """Tokenize a batch of examples taken from the 'text' column.

    Sequences are truncated to the tokenizer's maximum length and padded to
    that same fixed length. `padding=True` would only pad to the longest
    sequence inside each `map` batch, so rows coming from different map
    batches could end up with different lengths and fail to stack into one
    tensor when they are later batched together.
    """
    return tokenizer(examples['text'], truncation=True, padding="max_length")

tokenized_dataset = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/26990 [00:00<?, ? examples/s]

In [10]:
# Hyperparameters for the fine-tuning run, gathered in one mapping so they are
# easy to scan and tweak.
training_kwargs = dict(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    gradient_accumulation_steps=4,
    logging_dir='./logs',
)
training_args = TrainingArguments(**training_kwargs)


In [11]:

# Hold out 10% of the examples for evaluation. Evaluating on the very same
# rows the model is trained on only measures training fit, not generalisation.
# The seed is fixed so the split is reproducible across runs.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
)
trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [21]:
# predict
model.eval()
input_text = "I am a software engineer"

# Put the encoded inputs on the same device as the model's parameters; the
# tokenizer returns CPU tensors, which would not match a model on the GPU.
device = next(model.parameters()).device
encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True).to(device)

# Inference only: no gradients are needed.
with torch.no_grad():
    output = model(**encoded_input)
print(output)

# Accept both a bare prediction tensor and an output object exposing `.logits`.
logits = output.logits if hasattr(output, "logits") else output
# NOTE(review): the head is a single-output regressor; confirm that squashing
# its prediction through a sigmoid is really the intended post-processing.
print(torch.sigmoid(logits))


Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 26990
})