# Install and load packages

In [None]:
# Shell magic: install the third-party packages used by this notebook.
!pip install -q transformers datasets wandb accelerate kaleido psutil gputil

In [None]:
import time
from datetime import datetime
import re, os
from pathlib import Path
import torch, wandb
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import Dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification,Trainer,TrainingArguments, AutoConfig, EarlyStoppingCallback, IntervalStrategy
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, classification_report, f1_score, precision_recall_fscore_support
from google.colab import drive
import plotly.express as px
import psutil
import GPUtil

# Create variables and functions

In [None]:
# Checkpoint name passed to every tokenizer / model load in this notebook.
pretrainedmodel = "neuralmind/bert-large-portuguese-cased"

# Fraction of each CSV to keep when loading (1 keeps the entire dataset).
NSAMPLE = 1

# Seed shared by the sampling calls and used as the default training seed.
RANDOM_SEED = 123

# Minimum class probability for an unlabelled row to become a pseudolabel.
TRESHOLD_PSEUDOLABELS = 0.6

# Weights & Biases credential; left empty here (do not commit a real key).
os.environ["WANDB_API_KEY"] = ''

In [None]:
# Root of the mounted drive, relative to the current working directory.
mydrive = Path('drive/MyDrive')
# Folder holding the train / test / unlabelled CSV files.
data_folder = mydrive.joinpath('thesis-data')
# Folder where checkpoints and the exported models are written.
bert_folder = mydrive.joinpath('bertimbau-selftrain/')

In [None]:
def set_random_seed(seed=RANDOM_SEED):
    """
    Function to set the random seed for Python, numpy and PyTorch to ensure reproducibility.

    Parameters:
    seed (int): The seed value. Default is RANDOM_SEED.

    Returns:
    None
    """
    # Imported locally because the notebook's import cell does not load `random`.
    import random

    # Set the seed for Python's built-in generator
    random.seed(seed)

    # Set the seed for Numpy
    np.random.seed(seed)

    # Set the seed for PyTorch
    torch.manual_seed(seed)

    # If you're running on GPU, you also need to set the seed for the GPU:
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def prepare_test_dataset(test, cols, tokenize_function):
    """
    Build a tokenized Dataset from a test DataFrame.

    Parameters:
    test (DataFrame): Source DataFrame; it is not modified.
    cols (list): Columns to keep. Must include 'text'.
    tokenize_function (function): Batched tokenizer callback handed to Dataset.map.

    Returns:
    Dataset: The selected columns plus whatever fields the tokenizer adds.
    """

    # Select the columns and force 'text' to str in a fresh frame
    frame = test[cols].assign(text=lambda selected: selected['text'].astype(str))

    # Wrap in a Dataset and tokenize in batches
    return Dataset.from_pandas(frame).map(tokenize_function, batched=True)


def tokenize_function(examples):
    """
    Tokenize the 'text' field of a batch with the module-level tokenizer.

    Sequences are truncated and padded to the module-level TOKEN_MAX_LEN.
    """
    return tokenizer(
        examples['text'],
        max_length=TOKEN_MAX_LEN,
        padding='max_length',
        truncation=True,
    )

def compute_metrics(p):
    """
    Compute accuracy, precision, recall and F1 for one evaluation pass.

    Parameters:
    p (tuple): Pair of (per-class scores, true labels).

    Returns:
    dict: Keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    scores, labels = p

    # The predicted class is the column with the highest score
    predicted = np.argmax(scores, axis=1)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(y_true=labels, y_pred=predicted) for name, scorer in scorers.items()}

def predict(model,sentence, return_probabilities=False):
    """
    Function to predict class label or probabilities for a given sentence.

    The model is switched to evaluation mode for the forward pass (so dropout
    is disabled) and put back in its previous mode afterwards.

    Parameters:
    model (torch.nn.Module): Sequence-classification model whose output exposes `.logits`.
    sentence (str): Input sentence for prediction.
    return_probabilities (bool): Flag to return probabilities. If False, the function
                                 returns the label of the class with the highest score.
                                 If True, the function returns the probabilities for each class.
                                 Default is False.

    Returns:
    int or numpy.ndarray: Predicted class label (if return_probabilities=False) or a flat
                 array of probabilities for each class (if return_probabilities=True).
    """

    # Tokenize the sentence and prepare it for input to the model
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=TOKEN_MAX_LEN)

    # Use the device the model's weights are on instead of relying on a
    # module-level `device` variable staying in sync with the model.
    model_device = next(model.parameters()).device
    inputs = {k: v.to(model_device) for k, v in inputs.items()}

    # Remember the current mode so the caller's model is left as it was found
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Feed the inputs to the model and read the logits from its output
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    if return_probabilities:
        # Calculate the probabilities for each class
        probabilities = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy().flatten()
        return probabilities

    # Get the label of the class with the highest prediction score
    predicted_label = torch.argmax(logits, dim=-1).item()
    return predicted_label

def get_samples(df, column, label, N=10,threshold = TRESHOLD_PSEUDOLABELS):
    """
    Extracts a specified number of samples (rows) from each quantile of a DataFrame based on a provided column and label.

    Rows whose score in `column` is not above `threshold` are dropped first, so only rows that are
    confident for this class can be sampled. The remaining rows are split into up to four quantiles
    of that score, and up to N rows are drawn at random from each quantile. The input DataFrame is
    not modified.

    Parameters:
    df (pd.DataFrame): The input DataFrame from which to extract samples. Must contain a 'text' column.
    column (str): The column in df that contains the prediction confidence scores.
    label (str): The label to assign to each of the extracted samples in the returned DataFrame.
    N (int): The number of samples to extract from each quantile. If a quantile contains N samples
             or fewer, all samples from that quantile are included. Default is 10.
    threshold (float): Minimum confidence (exclusive) needed to sample. Default is TRESHOLD_PSEUDOLABELS.

    Returns:
    samples_df (pd.DataFrame): A new DataFrame containing the extracted samples. This DataFrame has two columns:
                               'text' - The original text from the input DataFrame
                               'label' - The provided label
                               It is empty when no row passes the threshold.

    Notes:
    This function uses pandas' qcut function with q=4 and duplicates='drop', so tied scores can
    produce fewer than four quantiles instead of raising. Sampling is not seeded inside this function.

    """

    # Keep only confident rows; boolean indexing returns a new frame, so the
    # caller's DataFrame never receives a helper column.
    eligible = df[df[column] > threshold]
    scores = eligible[column]

    if scores.nunique() < 2:
        # Zero or one distinct score: no quantile boundaries exist, so treat
        # everything that passed the threshold as a single group.
        groups = [eligible]
    else:
        # Tied scores can collapse bin edges; drop the duplicates rather than fail
        quantile_bins = pd.qcut(scores, q=4, duplicates='drop')
        groups = [group for _, group in eligible.groupby(quantile_bins, observed=True)]

    # Randomly sample N rows from each quantile (or keep the whole quantile if small)
    picked = [group.sample(N) if len(group) > N else group for group in groups]

    # Create a new DataFrame from the samples and assign the provided label to each sample
    samples_df = pd.concat(picked)[['text']].copy()
    samples_df['label'] = label

    return samples_df


def print_system_info():
    """
    Print CPU core counts, total memory and one line per GPU reported by GPUtil.

    Returns:
    None
    """
    physical_cores = psutil.cpu_count(logical=False)
    print(f'Physical cores: {physical_cores}')

    logical_cores = psutil.cpu_count(logical=True)
    print(f'Total cores: {logical_cores}')

    # Total memory in bytes divided by 1024^3
    total_bytes = psutil.virtual_memory().total
    print(f'Memory: {total_bytes / (1024.0 **3)}GB')

    for index, gpu in enumerate(GPUtil.getGPUs()):
        print(f'GPU {index}: {gpu.name} with {gpu.memoryTotal}MB memory')



In [None]:
def train_model(
    project_name,
    run_name,
    train_dataset=None,
    eval_dataset=None,
    pretrained_model=pretrainedmodel,
    num_labels=2,
    output_dir=bert_folder,
    eval_steps=1_500,
    warmup_steps=1_000,
    learning_rate=2e-5,
    dropout_rate=0.1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    seed=RANDOM_SEED,
    compute_metrics=None,
    patience=3):
    """
    Fine-tune a sequence-classification model and log the run to Weights & Biases.

    Parameters:
    project_name (str): W&B project the run is logged under.
    run_name (str): Name of the W&B run.
    train_dataset (Dataset): Tokenized training data.
    eval_dataset (Dataset): Tokenized validation data, evaluated every `eval_steps`.
    pretrained_model (str): Checkpoint passed to from_pretrained.
    num_labels (int): Number of output classes.
    output_dir (path): Where checkpoints are written.
    eval_steps (int): Step interval for both evaluation and checkpoint saving.
    warmup_steps (int): Learning-rate warmup steps.
    learning_rate (float): Learning rate passed to TrainingArguments.
    dropout_rate (float): Used for both hidden and attention dropout.
    per_device_train_batch_size (int): Training batch size per device.
    per_device_eval_batch_size (int): Evaluation batch size per device.
    num_train_epochs (int): Maximum number of epochs.
    weight_decay (float): Weight decay passed to TrainingArguments.
    seed (int): Seed used before model creation and passed to TrainingArguments.
    compute_metrics (function): Metric callback; it must report 'f1', which selects the best checkpoint.
    patience (int): Early-stopping patience, in evaluations.

    Returns:
    Trainer: The trainer after training, with the best checkpoint loaded.
    """

    start_time = time.time()

    wandb.init(project=project_name, name=run_name, settings=wandb.Settings(start_method="thread"))

    try:
        # Seed before the model is built: weights that are not in the
        # checkpoint are randomly initialised at load time, and the Trainer
        # only applies `seed` after the model already exists.
        torch.manual_seed(seed)

        model = AutoModelForSequenceClassification.from_pretrained(pretrained_model, num_labels=num_labels,
                                                                    hidden_dropout_prob=dropout_rate,
                                                                    attention_probs_dropout_prob=dropout_rate)

        training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="steps",
            eval_steps=eval_steps,
            # Save on the same schedule as evaluation so the best checkpoint can be reloaded
            save_steps=eval_steps,
            warmup_steps=warmup_steps,
            logging_steps=10,
            learning_rate=learning_rate,
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=per_device_eval_batch_size,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            seed=seed,
            save_total_limit=2,
            report_to="wandb",
            load_best_model_at_end=True,
            metric_for_best_model='f1',
            optim="adamw_torch"
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)]
        )

        trainer.train()

        end_time = time.time()
        print(f'Training took {end_time - start_time} seconds')
    finally:
        # Close the W&B run even when loading or training fails, so a failed
        # attempt does not leave a run open for the next call.
        wandb.finish()

    return trainer


In [None]:
def evaluate_model(trainer, test_dataset):
    """
    Print a classification report and summary scores for a trained model.

    Parameters:
    trainer (Trainer): Trainer wrapping the model to evaluate.
    test_dataset (Dataset): Tokenized dataset with a 'label' column.

    Returns:
    None.
    """

    # Raw per-class scores from the model, and the reference labels
    scores = trainer.predict(test_dataset).predictions
    y_true = test_dataset['label']

    # The predicted class is the highest-scoring column
    y_pred = np.argmax(scores, axis=-1)

    # Per-class precision / recall / F1 table
    report = classification_report(y_true, y_pred, target_names=["Class 0", "Class 1"])
    print(report)

    # Only the per-class recall and F1 arrays are needed below
    _, recall_by_class, f1_by_class, _ = precision_recall_fscore_support(y_true, y_pred)

    # Index 1 corresponds to Class 1
    print("F1-score for Class 1:", f1_by_class[1])

    # Unweighted means over the classes
    print("General F1:", np.mean(f1_by_class))
    print("Recall:", np.mean(recall_by_class))



# Setup

In [None]:
drive.mount('/content/drive')

# Register DataFrame.progress_apply with a progress-bar label
tqdm.pandas(desc="Running...")

# Seed with the notebook-wide constant rather than a separate literal, so the
# global RNG state agrees with the seed used for sampling and training.
set_random_seed(RANDOM_SEED)

# Hardware diagnostic

In [None]:
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Shell magic: print the CPU details reported by lscpu.
!lscpu

In [None]:
# Print core counts, total memory and the GPUs found by GPUtil.
print_system_info()

# Load data

In [None]:
# Labelled and unlabelled training data
trainfile = data_folder.joinpath('train', 'train.csv')
unlabel_file = data_folder.joinpath('train', 'unlabel.csv')

# The two test files
testfile = data_folder.joinpath('test', 'test_label.csv')
testrawfile = data_folder.joinpath('test', 'test_raw.csv')

# Columns kept when building the model datasets
cols = ['text', 'label']

In [None]:
# Load each CSV, keep a seeded NSAMPLE fraction of its rows and index it by 'id'.
train, test, testraw = (
    pd.read_csv(csv_path).sample(frac=NSAMPLE, random_state=RANDOM_SEED).set_index('id')
    for csv_path in (trainfile, testfile, testrawfile)
)

# One shape per line, in the order train / test / testraw
print(train.shape, test.shape, testraw.shape, sep='\n')


# Define token length

In [None]:
# Tokenize with a 512-token limit; the result feeds the token-length histogram.
TOKEN_MAX_LEN = 512
tokenizer = AutoTokenizer.from_pretrained(pretrainedmodel, do_lower_case=False)
train_full_token = Dataset.from_pandas(train)
train_full_token = train_full_token.map(tokenize_function, batched=True)


In [None]:
# Count the real (non-padding) tokens of each example from the attention mask,
# which does not depend on the padding token having id 0.
hist_series = pd.Series([sum(mask) for mask in train_full_token['attention_mask']])

fig = px.histogram(hist_series, template='presentation')
fig.update_layout(title='',
                  xaxis_title='Token length',
                  yaxis_title='Frequency',
                  showlegend=False
                  )

# Draw a black frame around the plot area on both axes
fig.update_xaxes(showline=True,
         linewidth=1,
         linecolor='black',
         mirror=True)

fig.update_yaxes(showline=True,
         linewidth=1,
         linecolor='black',
         mirror=True)
fig.show()
fig.write_image("tokenlen.png", width=1200, height=800,scale=2)

# Prepare dataset

In [None]:
# Sequence length used from here on for training and inference.
TOKEN_MAX_LEN = 128
# NOTE(review): the tokenizer is reloaded with the same arguments as before, and
# train_full_token is not read again in the visible part of the file.
tokenizer = AutoTokenizer.from_pretrained(pretrainedmodel, do_lower_case=False)
train_full_token = Dataset.from_pandas(train)
train_full_token = train_full_token.map(tokenize_function, batched=True)


In [None]:
# From here on `train` is a tokenized Dataset instead of a DataFrame
train = Dataset.from_pandas(train).map(tokenize_function, batched=True)

test = prepare_test_dataset(test,cols,tokenize_function)
testraw = prepare_test_dataset(testraw,cols,tokenize_function)

# Hold out 10% for validation; the split is seeded so reruns produce the same
# train / validation partition.
train_split = train.train_test_split(test_size=0.1, seed=RANDOM_SEED)

train_dataset = train_split['train']
val_dataset = train_split['test']

# Run model

In [None]:
# First fine-tuning pass, on the labelled data only.
trainer = train_model(
    "myproject",
    "first-train",
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
# Usage log kept from a past run (presumably start and end of training):
#   323 compute units at 15h10
#   307 compute units at 18h16 (end)

# Evaluate model

In [None]:
# Report metrics on the dataset loaded from test_raw.csv.
evaluate_model(trainer, testraw)

In [None]:
# Report metrics on the dataset loaded from test_label.csv.
evaluate_model(trainer, test)

In [None]:
# Check the trained model on a single hand-written input.
sample_sentence = "favela"

predicted_label = predict(trainer.model, sample_sentence, return_probabilities=False)
print(f"Predicted label: {predicted_label}")


# Save first model

In [None]:
# Write the first model's weights and the tokenizer files into bert_folder,
# which is also the Trainer's default checkpoint directory.
trainer.model.save_pretrained(bert_folder)
tokenizer.save_pretrained(bert_folder)

# Generate pseudolabels


In [None]:
unlabel = pd.read_csv(unlabel_file).set_index('id')

# Use a fixed random_state, as the labelled CSVs do, so the subsample and row
# order are reproducible.
unlabel = unlabel.sample(frac=NSAMPLE, random_state=RANDOM_SEED)

print(unlabel.shape)

In [None]:
# Score every unlabelled text with the first model, one predict() call per row;
# each 'probas' cell holds the array of class probabilities.
unlabel['probas'] = unlabel['text'].progress_apply(lambda x: predict(trainer.model, x,return_probabilities=True))

In [None]:
# Expand 'probas' into one column per class, then keep only the rows where
# either class probability exceeds the pseudolabel threshold.
pseudolabels = (
    unlabel
    .join(pd.DataFrame(unlabel['probas'].to_list(), index=unlabel.index, columns=['class_0', 'class_1']))
    .loc[lambda scored: (scored.class_0 > TRESHOLD_PSEUDOLABELS) | (scored.class_1 > TRESHOLD_PSEUDOLABELS)]
    .copy()
)

# The pseudolabel is the index of the most probable class
pseudolabels['label'] = pseudolabels['probas'].apply(lambda probs: np.argmax(probs))


In [None]:
# Distribution of the class-1 probability among the retained pseudolabels.
pseudolabels.class_1.hist()

In [None]:
# Draw rows per confidence quantile for each class
sample_neg = get_samples(pseudolabels, column='class_0', label=0)
sample_pos = get_samples(pseudolabels, column='class_1', label=1)

# Stack positives first, then negatives, under a fresh 0..n-1 index
sample = pd.concat([sample_pos, sample_neg], ignore_index=True)

sample


In [None]:
# Save the sampled pseudolabels as a CSV in the data folder.
sample.to_csv(data_folder / 'pseudolabels_to_validate.csv')

# Merge dataset

In [None]:
# Column dtypes and non-null counts of the pseudolabel table.
pseudolabels.info()

In [None]:
# True for pseudolabel rows whose text does not already appear in train['text']
mask = ~pseudolabels['text'].isin(train['text'])

# Keep those rows, and only the first occurrence of each distinct text
pseudolabels_filtered = pseudolabels.loc[mask].drop_duplicates(subset='text')


In [None]:
# Keep only the model columns of the filtered pseudolabels and tokenize them
# with the same function as the labelled data.
pseudo_df = Dataset.from_pandas(pseudolabels_filtered[cols]).map(tokenize_function, batched=True)

In [None]:
# Labelled data plus pseudolabels, with 10% held out for validation. The split
# is seeded so the saved dataset and the retraining run are reproducible.
augmented_dataset = concatenate_datasets([train, pseudo_df]).train_test_split(test_size=0.1, seed=RANDOM_SEED)
train_dataset_2 = augmented_dataset['train']
val_dataset_2 = augmented_dataset['test']

train_dataset_2

In [None]:
# Save both splits of the augmented dataset in the data folder.
augmented_dataset.save_to_disk(data_folder/'augmented_training')

# Retrain and evaluate

In [None]:
from numba import cuda

# Reset the current CUDA device before retraining. The handle gets its own
# name so the torch `device` read by predict() is not overwritten with a
# numba device object.
cuda_device = cuda.get_current_device()
cuda_device.reset()

In [None]:
# Second pass, on labelled data plus pseudolabels. The result is bound to
# `self_trainer`, the name the evaluation and export cells below read.
self_trainer = train_model(
    project_name="myproject",
    run_name="selftrain-retrain",
    train_dataset=train_dataset_2,
    eval_dataset=val_dataset_2,
    compute_metrics=compute_metrics
)


# Evaluate final model

In [None]:
# Report the self-trained model's metrics on the dataset from test_raw.csv.
evaluate_model(self_trainer, testraw)

In [None]:
# Report the self-trained model's metrics on the dataset from test_label.csv.
evaluate_model(self_trainer, test)

# Export final model and dataset

In [None]:
# Export the self-trained model, the tokenizer and the augmented dataset
# together under bert_folder/'final'.
self_trainer.model.save_pretrained(bert_folder/'final')
tokenizer.save_pretrained(bert_folder/'final')
augmented_dataset.save_to_disk(bert_folder/'final'/'data')