In [None]:
import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
os.environ["WANDB_DISABLED"] = "true"

def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        dict: ``{"accuracy": <score>}`` where the score comes from
        ``accuracy_score`` applied to the labels and the arg-max of the
        logits along the last axis.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit.
    guessed = np.argmax(scores, axis=-1)
    return {"accuracy": accuracy_score(gold, guessed)}

class Model:
    """BERT sequence classifier that is fine-tuned as soon as it is built.

    Construction loads the pretrained ``bert-base-uncased`` tokenizer and
    classification model, fine-tunes it on the labelled DataFrame for
    ``epoch1`` epochs, and then switches the epoch budget to ``epoch2`` for
    every later call to :meth:`train`.
    """

    def __init__(self, num_labels, label, epoch1, epoch2, batch_size, drop):
        """Load the pretrained model and run the initial fine-tuning.

        Args:
            num_labels: Number of output classes of the classifier head.
            label: Labelled DataFrame with 'Text' and 'Label' columns.
            epoch1: Number of epochs for the initial training run.
            epoch2: Number of epochs for each later call to ``train``.
            batch_size: Per-device batch size for training and evaluation.
            drop: ``test_size`` used to hold out evaluation data.
        """
        # Load the pretrained BERT tokenizer and model for sequence classification
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)  # Set num_labels based on your dataset
        self.drop = drop

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Split the labelled frame passed to the constructor. The previous code
        # referenced an undefined name `df` here, which raised NameError.
        train, test = self.split_df(label, self.drop)
        self.training_args = TrainingArguments(
                                output_dir='./results',          # output directory
                                eval_strategy="epoch",     # evaluate at each epoch
                                per_device_train_batch_size=batch_size,   # batch size for training
                                per_device_eval_batch_size=batch_size,    # batch size for evaluation
                                num_train_epochs=epoch1,              # number of training epochs
                                logging_dir='./logs',            # directory for storing logs
                                logging_steps=10,
                                save_strategy="epoch",
                                save_total_limit=1,
                                report_to=[],
                                push_to_hub=False,               # Avoid pushing the model to Hugging Face Hub
                                load_best_model_at_end=True,
                                weight_decay=0.01,
                            )
        self.trainer = Trainer(
                            model=self.model,                         # the instantiated 🤗 Transformers model
                            args=self.training_args,                  # training arguments
                            train_dataset=train,         # training dataset
                            eval_dataset=test,            # evaluation dataset
                            compute_metrics=compute_metrics,  # Pass the compute_metrics function
                            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
                        )

        # Initial fine-tuning on the split made above; going through
        # self.train(label) would split and tokenize the same data a second time.
        self.model.train()
        self.trainer.train()

        # Later train() calls (one per auto-labelling batch) use epoch2.
        self.training_args.num_train_epochs = epoch2

    def split_df(self, df, drop):
        """Turn a labelled DataFrame into tokenized train/eval datasets.

        Args:
            df: DataFrame with 'Text' and 'Label' columns.
            drop: ``test_size`` forwarded to ``train_test_split``.

        Returns:
            tuple: ``(train_dataset, test_dataset)`` in PyTorch format. The
            'Text' column is removed and 'Label' is exposed as 'labels'.
        """
        # preserve_index=False keeps a non-default pandas index (e.g. after
        # concatenation) from being carried along as an extra dataset column.
        dataset = Dataset.from_pandas(df, preserve_index=False)

        # Tokenize the dataset
        def tokenize_function(examples):
            return self.tokenizer(examples['Text'], padding='max_length', truncation=True)

        # Tokenize once on the full dataset instead of once per split.
        dataset = dataset.map(tokenize_function, batched=True)

        # Rename 'Label' to 'labels' for the trainer to recognize it
        dataset = dataset.rename_column('Label', 'labels')

        # Remove unnecessary columns
        dataset = dataset.remove_columns(['Text'])

        # Split into train/test
        train_test = dataset.train_test_split(test_size=drop)
        train_dataset = train_test['train']
        test_dataset = train_test['test']

        # Set format for PyTorch
        train_dataset.set_format('torch')
        test_dataset.set_format('torch')

        return train_dataset, test_dataset

    def train(self, df):
        """Re-split ``df`` and continue training the model on it.

        Args:
            df: Labelled DataFrame with 'Text' and 'Label' columns.
        """
        self.model.train()
        train, test = self.split_df(df, self.drop)
        self.trainer.train_dataset = train
        self.trainer.eval_dataset = test

        self.trainer.train()

    def predict_labels(self, df: pd.DataFrame, batch_size: int = 32) -> pd.DataFrame:
        """
        Predict labels for the given DataFrame containing a 'Text' column.

        Inference runs in chunks of ``batch_size`` rows so that a large
        DataFrame is not pushed through the model in a single forward pass.

        Args:
            df (pd.DataFrame): Input DataFrame with a 'Text' column.
            batch_size (int): Number of rows per forward pass.

        Returns:
            pd.DataFrame: DataFrame containing 'Text' and predicted 'Label'.
            It is empty (with both columns) when ``df`` has no rows.

        Raises:
            ValueError: If 'Text' is missing or ``batch_size`` is below 1.
        """
        # Check if 'Text' column exists
        if 'Text' not in df.columns:
            raise ValueError("Input DataFrame must contain a 'Text' column.")
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer.")

        texts = df['Text'].tolist()

        # Set the model to evaluation mode
        self.model.eval()

        chunk_labels = []
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                # Tokenize one chunk, padding only to the longest text in it
                inputs = self.tokenizer(texts[start:start + batch_size], padding=True, truncation=True, return_tensors="pt")

                # Move input tensors to the same device as the model
                inputs = {key: value.to(self.device) for key, value in inputs.items()}

                logits = self.model(**inputs).logits

                # Move predicted labels back to CPU
                chunk_labels.append(torch.argmax(logits, dim=1).cpu().numpy())

        if chunk_labels:
            predicted_labels = np.concatenate(chunk_labels)
        else:
            # No rows: skip the tokenizer/model entirely and return no labels.
            predicted_labels = np.empty(0, dtype=np.int64)

        # Create a new DataFrame with Text and predicted Label
        result_df = pd.DataFrame({
            'Text': df['Text'],
            'Label': predicted_labels
        })

        return result_df

In [None]:
def split_dfs(dfs):
    """Separate rows on which every prediction frame agrees from the rest.

    Args:
        dfs: List of DataFrames, each with 'Text' and 'Label' columns. The
            'Text' column is taken from the first frame; every frame's
            'Label' is aligned to it by index.

    Returns:
        tuple: ``(agreed, disputed)``. ``agreed`` has 'Text' and 'Label'
        columns for rows where all frames gave the same label; ``disputed``
        has only 'Text' for the remaining rows. Both have a fresh 0..n-1
        index.
    """
    combined = pd.DataFrame({'Text': dfs[0]['Text']})

    # One 'Label_<i>' column per prediction frame.
    for position, frame in enumerate(dfs):
        combined[f'Label_{position}'] = frame['Label']

    vote_columns = [name for name in combined.columns if name.startswith('Label_')]

    # A row is unanimous when its votes contain exactly one distinct value.
    unanimous = combined[vote_columns].nunique(axis=1).eq(1)

    agreed = combined.loc[unanimous].copy()
    agreed["Label"] = agreed["Label_0"]

    disputed = combined.loc[~unanimous]

    agreed_out = agreed[['Text', 'Label']].reset_index(drop=True)
    disputed_out = disputed[["Text"]].reset_index(drop=True)
    return agreed_out, disputed_out

In [None]:
import pandas as pd
import time

def autoLabel(label, unlabel, batch_size=0, max_time = 1, epoch1 = 10, epoch2 = 2, num_labels = 2, model_batch_size = 32, num_models = 1, drop=0.2):
    '''
    Iteratively label `unlabel` using models trained on `label`.

    Each batch of unlabelled rows is predicted by every model. Rows on which
    all models agree are appended to the labelled set, the rest are kept as
    unlabelled, and every model is then retrained on the grown labelled set.

    Input format:
    label -> dataframe [columns: Text (sentence), Label (0, 1)]
    unlabel -> dataframe [columns: Text (sentence)]
    batch_size -> (int) number of rows in one batch; a value <= 0 means len(label)
    max_time -> (int in hours) After max_time has passed function returns the label and unlabel for checkpoint (function terminates)
    epoch1 -> (int) number of epoch model should train initially
    epoch2 -> (int) number of epoch model should train for each batch
    num_labels -> (int) number of classes passed to each Model
    model_batch_size -> (int) batch size passed to each Model
    num_models -> (int) number of Model instances that must agree on a label
    drop -> (float) evaluation split size passed to each Model

    Returns:
    (label, unlabel) -> the grown labelled dataframe and a dataframe holding
    every row that is still unlabelled (unprocessed batches plus rows the
    models disagreed on), both with a fresh index
    '''

    models = [Model(num_labels=num_labels,
                    label=label,
                    epoch1=epoch1,
                    epoch2=epoch2,
                    batch_size=model_batch_size,
                    drop=drop)
              for _ in range(num_models)]

    print("Model initialized")

    if batch_size <= 0:
        # Default to the labelled-set size; never below 1 so the range step
        # used for slicing stays valid.
        batch_size = max(len(label), 1)

    # monotonic() is unaffected by wall-clock adjustments during a long run.
    start = time.monotonic()
    max_time *= 3600

    unlabel = unlabel.sample(frac = 1).reset_index(drop=True)
    batches = [unlabel[i:i+batch_size] for i in range(0, len(unlabel), batch_size)]

    # Rows that stay unlabelled; seeded with an empty frame so the final
    # concat always has at least one element.
    still_unlabelled = [pd.DataFrame(columns=['Text', 'Label'])]

    while batches and (time.monotonic() - start) < max_time:
        batch = batches.pop()
        predicted_label = [model.predict_labels(batch) for model in models]
        correct_prediction, incorrect_prediction = split_dfs(predicted_label)

        label = pd.concat([label, correct_prediction], ignore_index=True)
        still_unlabelled.append(incorrect_prediction)

        # Retrain on the grown labelled set. The previous code passed an
        # undefined name `df` here, which raised NameError.
        for model in models:
            model.train(label)

        print(f"Remaining batches: {len(batches)}, Labeled size: {len(label)}, New labels added: {len(correct_prediction)}")
        print(f'Time left: {max_time - (time.monotonic() - start)}')

    # Batches left over after a timeout are returned together with the rows
    # the models disagreed on; when none are left this is just the latter.
    return label.reset_index(drop=True), pd.concat(batches + still_unlabelled).reset_index(drop = True)