In [19]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments
from torch.utils.data import Dataset
from torch import nn

# Define a custom BERT model for regression


class BertForRegression(nn.Module):
    def __init__(self, pretrained_model_name='bert-base-uncased'):
        super(BertForRegression, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        # Single output for regression
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.bert(
            input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]  # Pooled output from BERT
        regression_output = self.regressor(pooled_output)
        # Ensure the output is of shape [batch_size]
        return regression_output.squeeze(-1)

# Define the custom dataset class


class ExplanationDataset(Dataset):
    def __init__(self, dataframe):
        self.labels = dataframe['confidence_score'].values
        self.texts = dataframe['explanation'].values
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx], truncation=True, padding='max_length', max_length=512)
        item = {key: torch.tensor(val) for key, val in encoding.items()}
        # Regression requires float labels
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Function to train the model


def train_model(df, model_name):
    # Split the dataset into training and validation sets
    train_size = int(0.8 * len(df))
    val_size = len(df) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(
        ExplanationDataset(df), [train_size, val_size])

    # Load the custom BERT model for regression
    model = BertForRegression()

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f'./results_{model_name}',
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f'./logs_{model_name}',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=50
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    # Train the model
    trainer.train()

    # Save the model and tokenizer
    torch.save(model.state_dict(), f'bert-finetuned-{model_name}.pt')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenizer.save_pretrained(f'bert-finetuned-{model_name}')


# Load the dataset from the provided CSV file
df = pd.read_csv('fuck_you.csv')

# Train the model for expert explanations with high confidence
train_model(df, 'expert_high_confidence')



KeyboardInterrupt: 

In [3]:
# Example of loading the trained model
model = BertForRegression()
model.load_state_dict(torch.load('bert-finetuned-expert_high_confidence.pt'))

  model.load_state_dict(torch.load('bert-finetuned-expert_high_confidence.pt'))


<All keys matched successfully>

In [18]:
import torch
from transformers import BertTokenizer
from sklearn.preprocessing import MinMaxScaler
from torch import nn
import pandas as pd

# Load the dataset to fit the scaler
df = pd.read_csv('fuck_you.csv')

# Fit the scaler with the training confidence scores
scaler = MinMaxScaler()
scaler.fit(df[['confidence_score']])

# Define the custom BERT model for regression (same as used during training)
class BertForRegression(nn.Module):
    def __init__(self, pretrained_model_name='bert-base-uncased'):
        super(BertForRegression, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        # Single output for regression
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.bert(
            input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs.pooler_output
        regression_output = self.regressor(pooled_output)
        return regression_output.squeeze(-1)  # Output is [batch_size]

# Load the fine-tuned BERT model and tokenizer
model = BertForRegression()
model.load_state_dict(torch.load('bert-finetuned-expert_high_confidence.pt'))
tokenizer = BertTokenizer.from_pretrained('bert-finetuned-expert_high_confidence')

def get_confidence_score(sentence):
    model.eval()
    inputs = tokenizer(sentence, return_tensors="pt",
                       truncation=True, padding='max_length', max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    confidence_score = outputs.item()
    # Inverse transform to get the original scale
    confidence_score = scaler.inverse_transform([[confidence_score]])[0][0]
    return confidence_score

# Example sentence
sentence = "Strategically, the utility function \( U_u(\omega_t^o) \) must be evaluated against the predefined quantile thresholds to ensure the offer remains within acceptable limits."
confidence_score = get_confidence_score(sentence)
print(f"Confidence Score: {confidence_score}")


  model.load_state_dict(torch.load('bert-finetuned-expert_high_confidence.pt'))


OSError: bert-finetuned-expert_high_confidence is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [14]:
confidence_score = scaler.inverse_transform([[confidence_score]])[0][0]


In [15]:
confidence_score

0.7715410828590393