In [1]:
from transformers import (
    BertModel, 
    AutoConfig, 
    AutoTokenizer, 
    Trainer,
    TrainingArguments
    )
import torch.nn as nn
import datasets
import csv


In [2]:
import torch
import numpy as np

torch.cuda.is_available()

True

In [3]:

model_name = "bert-base-uncased"
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)

config.problem_type = 'regression'

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:

# Add linear layer
output_size = 1  


# Combine BERT and the linear layer
class BertWithLinear(nn.Module):
    def __init__(self):
        super(BertWithLinear, self).__init__()
        self.bert = bert_model.to('cuda')
        self.ft = nn.Sequential(
            nn.Linear(config.hidden_size, 1),
            #nn.Dropout(0.4),
            #nn.ReLU(),
            #nn.Linear(128, 16),
            #nn.Dropout(0.2),
            #nn.ReLU(),
            #nn.Linear(16, output_size)
        ).to('cuda')
        

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds)
        # Use pooled output for classification/regression
        pooled_output = output.pooler_output
        return self.ft(pooled_output)

model = BertWithLinear().to('cuda')

In [5]:
# freeze BERT pretrained weights
for param in model.bert.embeddings.parameters():
    param.requires_grad = True
model = nn.DataParallel(model)

In [6]:
# Load dataset
feats_fp = open("BERT_X.csv", "r")
labels_fp = open("BERT_y.csv", "r")
feats = csv.reader(feats_fp)
labels = csv.reader(labels_fp)

# skip header
next(feats)
next(labels)

unscaled_data = {'text': [], 'label': []}
for row in feats:
    unscaled_data['text'].append(row[0].strip().replace("\n", " "))
for row in labels:
    unscaled_data['label'].append(float(row[0].strip().replace("\n", "")))
    

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_labels = scaler.fit_transform(np.array(unscaled_data['label']).reshape(-1,1)).flatten()
scaled_data = unscaled_data
scaled_data['label'] = scaled_labels
scaled_labels

array([-0.58786439, -0.94649659,  2.86819603, ..., -1.04441134,
       -0.86218111, -0.69471581])

In [8]:
print(len(scaled_data['text']), len(scaled_data['label']))
assert len(scaled_data['text']) == len(scaled_data['label'])
dataset = datasets.Dataset.from_dict(scaled_data)

26990 26990


In [9]:
print(dataset[0]['text'])
scaler.inverse_transform(np.array([dataset[0]['label']]).reshape(-1,1))

Overview  HearingLife is a national hearing care company and part of the Demant Group, a global leader in hearing healthcare built on a heritage of care, health, and innovation since 1904. HearingLife operates more than 600 hearing care centers across 42 states. We follow a scientific, results-oriented approach to hearing healthcare that is provided by highly skilled and caring professionals. Our vision is to help more people hear better through life-changing hearing health delivered by the best personalized care. This Team Member must uphold the HearingLife Core Values:   We create trust  We are team players  We apply a can-do attitude  We create innovative solutions   Responsibilities  You will help more people hear better by providing clinical expertise to diagnose and treat hearing loss while ensuring a positive patient experience. The Hearing Care Provider acts in accordance with required industry and state professional licensing standards and local practice scope and is responsib

array([[63000.]])

In [10]:
# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained(model_name)
def preprocess(examples):
    return tokenizer(examples['text'], truncation=True, padding=True)

tokenized_dataset = dataset.map(preprocess, batched=True)

  0%|          | 0/27 [00:00<?, ?ba/s]

In [11]:
# training_args = TrainingArguments(
#     output_dir='./results',          
#     num_train_epochs=3,              
#     per_device_train_batch_size=16,  
#     learning_rate=5e-5,               
#     warmup_steps=500,                
#     weight_decay=0.01,              
#     logging_dir='./logs',
# )


In [12]:

# trainer = Trainer(
#     model=model,                         
#     args=training_args,                  
#     train_dataset=tokenized_dataset,    
#     eval_dataset=tokenized_dataset,
# )
# trainer.train()

In [13]:
tokenized_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 26990
})

In [14]:
from torch.utils.data import Dataset

class RegressionDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            'input_ids': torch.tensor(self.input_ids[idx]),
            'attention_mask': torch.tensor(self.attention_mask[idx]),
            'labels': torch.tensor(self.labels[idx])
        }
        return item

    def __len__(self):
        return len(self.labels)  # Assuming all data entries have labels

    
    

In [15]:
input_ids = tokenized_dataset['input_ids']
attention_mask = tokenized_dataset['attention_mask']
labels = dataset['label']  # Assuming your labels are in the original dataset

reg_dataset = RegressionDataset(input_ids, attention_mask, labels)
dataloader = torch.utils.data.DataLoader(reg_dataset, batch_size=32, shuffle=True)



In [16]:
model.parameters()

<generator object Module.parameters at 0x2b74e3b3b938>

In [17]:
loss_fn = nn.MSELoss()  # Mean Squared Error is common for regression
optimizer = torch.optim.Adam([
    #{'params': model.module.bert.parameters(), 'lr': 1e-8}, 
    {'params': model.module.ft.parameters() , 'lr' : 2e-7}   # standard lr for our NN
])
num_epochs = 3
    
    
from torch.utils.data import random_split 

train_size = int(0.8 * len(reg_dataset))  # 80% of the dataset for training
val_size = len(reg_dataset) - train_size 

# Create the train and validation datasets
train_dataset, val_dataset = random_split(reg_dataset, [train_size, val_size])

# Create DataLoaders
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)  # No need to shuffle validation

# Modify your training loop (add validation)
for epoch in range(num_epochs):
    print(epoch,"/",num_epochs)
    b=0
    ## Training Phase
    model.train()  # Set model to training mode
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to('cuda') 
        attention_mask = batch['attention_mask'].to('cuda')
        labels = batch['labels'].to('cuda')

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.squeeze(1), labels) # Ensure outputs are single-dimensional
        loss.backward()
        optimizer.step() 
        print(f"batch {b} complete. Loss: ",loss.item())
        b+=1
    ## Validation Phase
    model.eval()   # Set model to evaluation mode
    val_loss = 0
    with torch.no_grad():  # Disable gradient calculation for validation
        for batch in val_dataloader:
            # ... (Similar to training, but no optimizer updates)
            outputs = model(input_ids, attention_mask=attention_mask)
            val_loss += loss_fn(outputs.squeeze(1), labels).item()

    val_loss /= len(val_dataloader)
    print(f"Epoch {epoch} Validation Loss: {val_loss}")


0 / 3
batch 0 complete. Loss:  1.099075436592102
batch 1 complete. Loss:  1.2900373935699463
batch 2 complete. Loss:  1.4214431047439575
batch 3 complete. Loss:  0.8167529106140137
batch 4 complete. Loss:  0.6171175241470337
batch 5 complete. Loss:  0.4970940053462982
batch 6 complete. Loss:  1.1501946449279785
batch 7 complete. Loss:  0.5854419469833374
batch 8 complete. Loss:  1.1572542190551758
batch 9 complete. Loss:  1.4339679479599
batch 10 complete. Loss:  0.6267854571342468
batch 11 complete. Loss:  1.9051165580749512
batch 12 complete. Loss:  0.71701979637146
batch 13 complete. Loss:  1.8193550109863281
batch 14 complete. Loss:  0.679241418838501
batch 15 complete. Loss:  0.441354364156723
batch 16 complete. Loss:  1.0930038690567017
batch 17 complete. Loss:  0.5830692648887634
batch 18 complete. Loss:  0.6543117761611938
batch 19 complete. Loss:  0.7338196039199829
batch 20 complete. Loss:  1.2226979732513428
batch 21 complete. Loss:  1.7564237117767334
batch 22 complete. Los

batch 182 complete. Loss:  1.5144438743591309
batch 183 complete. Loss:  0.7129267454147339
batch 184 complete. Loss:  0.7090712785720825
batch 185 complete. Loss:  1.6209402084350586
batch 186 complete. Loss:  1.02764892578125
batch 187 complete. Loss:  1.2736485004425049
batch 188 complete. Loss:  1.5377906560897827
batch 189 complete. Loss:  1.570300817489624
batch 190 complete. Loss:  0.854711651802063
batch 191 complete. Loss:  1.7643550634384155
batch 192 complete. Loss:  1.2946674823760986
batch 193 complete. Loss:  0.8018882870674133
batch 194 complete. Loss:  0.7617135047912598
batch 195 complete. Loss:  1.0199899673461914
batch 196 complete. Loss:  1.7390470504760742
batch 197 complete. Loss:  1.3999478816986084
batch 198 complete. Loss:  1.5466840267181396
batch 199 complete. Loss:  1.1274441480636597
batch 200 complete. Loss:  0.711712121963501
batch 201 complete. Loss:  1.167978048324585
batch 202 complete. Loss:  0.6663881540298462
batch 203 complete. Loss:  1.16010117530

batch 361 complete. Loss:  0.8608397245407104
batch 362 complete. Loss:  0.9146748781204224
batch 363 complete. Loss:  0.9634088277816772
batch 364 complete. Loss:  1.0435850620269775
batch 365 complete. Loss:  1.4331648349761963
batch 366 complete. Loss:  0.9518179893493652
batch 367 complete. Loss:  0.5402412414550781
batch 368 complete. Loss:  0.8951656222343445
batch 369 complete. Loss:  1.1583139896392822
batch 370 complete. Loss:  0.962187647819519
batch 371 complete. Loss:  0.6636677980422974
batch 372 complete. Loss:  1.2424463033676147
batch 373 complete. Loss:  0.8389501571655273
batch 374 complete. Loss:  1.2857015132904053
batch 375 complete. Loss:  1.9716109037399292
batch 376 complete. Loss:  0.5916537642478943
batch 377 complete. Loss:  1.169728398323059
batch 378 complete. Loss:  0.8507586121559143
batch 379 complete. Loss:  0.8572081923484802
batch 380 complete. Loss:  1.09492826461792
batch 381 complete. Loss:  0.8908212184906006
batch 382 complete. Loss:  1.858963608

batch 540 complete. Loss:  0.9872867465019226
batch 541 complete. Loss:  1.1071951389312744
batch 542 complete. Loss:  1.5569186210632324
batch 543 complete. Loss:  0.851157009601593
batch 544 complete. Loss:  1.3624147176742554
batch 545 complete. Loss:  1.328855037689209
batch 546 complete. Loss:  0.829879641532898
batch 547 complete. Loss:  2.2442495822906494
batch 548 complete. Loss:  1.0683717727661133
batch 549 complete. Loss:  0.9645459055900574
batch 550 complete. Loss:  0.6464013457298279
batch 551 complete. Loss:  1.5293817520141602
batch 552 complete. Loss:  0.9524233341217041
batch 553 complete. Loss:  0.678788423538208
batch 554 complete. Loss:  1.2689518928527832
batch 555 complete. Loss:  0.7038782835006714
batch 556 complete. Loss:  0.8951864838600159
batch 557 complete. Loss:  1.7068393230438232
batch 558 complete. Loss:  0.8082022070884705
batch 559 complete. Loss:  1.0810822248458862
batch 560 complete. Loss:  0.9240525960922241
batch 561 complete. Loss:  0.993599057

batch 44 complete. Loss:  1.2655953168869019
batch 45 complete. Loss:  1.7167201042175293
batch 46 complete. Loss:  1.434765100479126
batch 47 complete. Loss:  1.089825987815857
batch 48 complete. Loss:  0.9550039172172546
batch 49 complete. Loss:  0.4711337089538574
batch 50 complete. Loss:  0.7758818864822388
batch 51 complete. Loss:  0.7911080121994019
batch 52 complete. Loss:  0.7137401103973389
batch 53 complete. Loss:  1.0664961338043213
batch 54 complete. Loss:  1.7073571681976318
batch 55 complete. Loss:  1.3009711503982544
batch 56 complete. Loss:  1.019487977027893
batch 57 complete. Loss:  1.4750361442565918
batch 58 complete. Loss:  1.3681232929229736
batch 59 complete. Loss:  1.004213809967041
batch 60 complete. Loss:  0.5447919368743896
batch 61 complete. Loss:  0.9392204284667969
batch 62 complete. Loss:  0.6543574333190918
batch 63 complete. Loss:  0.46773090958595276
batch 64 complete. Loss:  0.9691888093948364
batch 65 complete. Loss:  1.4743127822875977
batch 66 comp

batch 224 complete. Loss:  0.6160695552825928
batch 225 complete. Loss:  1.387287974357605
batch 226 complete. Loss:  1.1748745441436768
batch 227 complete. Loss:  1.7972233295440674
batch 228 complete. Loss:  0.7216970324516296
batch 229 complete. Loss:  0.5496119260787964
batch 230 complete. Loss:  0.6585575342178345
batch 231 complete. Loss:  0.4826538562774658
batch 232 complete. Loss:  0.9802253246307373
batch 233 complete. Loss:  1.080472707748413
batch 234 complete. Loss:  0.8506709933280945
batch 235 complete. Loss:  0.9108262658119202
batch 236 complete. Loss:  1.2720527648925781
batch 237 complete. Loss:  1.0509010553359985
batch 238 complete. Loss:  1.188075304031372
batch 239 complete. Loss:  0.7434227466583252
batch 240 complete. Loss:  0.9727303981781006
batch 241 complete. Loss:  1.0213758945465088
batch 242 complete. Loss:  0.9787735939025879
batch 243 complete. Loss:  0.7645453214645386
batch 244 complete. Loss:  1.6425782442092896
batch 245 complete. Loss:  1.24001622

batch 403 complete. Loss:  1.2452256679534912
batch 404 complete. Loss:  0.9295735359191895
batch 405 complete. Loss:  0.8503320217132568
batch 406 complete. Loss:  1.2752034664154053
batch 407 complete. Loss:  1.3056433200836182
batch 408 complete. Loss:  0.6918566226959229
batch 409 complete. Loss:  1.1722981929779053
batch 410 complete. Loss:  0.6323802471160889
batch 411 complete. Loss:  1.4468084573745728
batch 412 complete. Loss:  1.754927396774292
batch 413 complete. Loss:  0.6009908318519592
batch 414 complete. Loss:  1.1526063680648804
batch 415 complete. Loss:  0.6669111847877502
batch 416 complete. Loss:  0.6954566240310669
batch 417 complete. Loss:  0.8105722665786743
batch 418 complete. Loss:  1.5805554389953613
batch 419 complete. Loss:  0.6080617904663086
batch 420 complete. Loss:  1.5812839269638062
batch 421 complete. Loss:  0.8391757011413574
batch 422 complete. Loss:  0.7165395021438599
batch 423 complete. Loss:  1.3576202392578125
batch 424 complete. Loss:  1.381992

batch 582 complete. Loss:  0.4509051442146301
batch 583 complete. Loss:  1.485410213470459
batch 584 complete. Loss:  0.8081980347633362
batch 585 complete. Loss:  1.1745946407318115
batch 586 complete. Loss:  1.0398496389389038
batch 587 complete. Loss:  0.6726033687591553
batch 588 complete. Loss:  0.9815458655357361
batch 589 complete. Loss:  1.0018056631088257
batch 590 complete. Loss:  1.4164412021636963
batch 591 complete. Loss:  1.1478009223937988
batch 592 complete. Loss:  0.939282238483429
batch 593 complete. Loss:  0.8280042409896851
batch 594 complete. Loss:  1.9016644954681396
batch 595 complete. Loss:  0.9635010361671448
batch 596 complete. Loss:  0.689430296421051
batch 597 complete. Loss:  1.0519752502441406
batch 598 complete. Loss:  0.8465758562088013
batch 599 complete. Loss:  0.8490771651268005
batch 600 complete. Loss:  1.0352298021316528
batch 601 complete. Loss:  0.7231394648551941
batch 602 complete. Loss:  0.8858510255813599
batch 603 complete. Loss:  0.75611579

KeyboardInterrupt: 

In [None]:
# inference
text = "The Janitor/Maintenance Attendant plays an integral part in helping us keep our clubs looking shiny and new. Working with Club Leadership, the Janitor works to ensure that the club facility is kept clean, safe and sanitary."
inputs = tokenizer(text, return_tensors='pt').to('cuda')
outputs = model(**inputs)
scaler.inverse_transform(np.array([outputs.item()]))

In [18]:
# del model
# torch.cuda.empty_cache()