In [19]:
import pandas as pd
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Read the dataset from its CSV file
data = pd.read_csv('/content/bot_detection_data.csv')

# Tokenizer matching the 'bert-base-uncased' checkpoint used for the model below
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Every tweet is truncated or padded to exactly this many tokens
max_length = 128

# Tokenize the tweets; the encoder is asked for TensorFlow tensors
X = data['Tweet'].tolist()
encode_options = dict(
    padding='max_length',
    max_length=max_length,
    truncation=True,
    return_tensors='tf',
)
X_tokenized = tokenizer.batch_encode_plus(X, **encode_options)

# Pull the token ids and attention masks out as NumPy arrays so they can be split
input_ids, attention_masks = (
    X_tokenized[field].numpy() for field in ('input_ids', 'attention_mask')
)

# Labels come from the 'Bot Label' column
y = data['Bot Label'].values

# 80% training / 20% validation split with a fixed seed; ids, labels and masks
# are split together so their rows stay aligned
split_parts = train_test_split(
    input_ids,
    y,
    attention_masks,
    test_size=0.2,
    random_state=42,
)
X_train_ids, X_val_ids, y_train, y_val, mask_train, mask_val = split_parts

# BERT with a sequence-classification head producing two logits
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Optimizer and loss; the loss consumes raw logits and integer labels
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


def _batched_dataset(ids, masks, labels, batch_size=16):
    """Wrap ids/masks/labels into a batched ``tf.data.Dataset`` of (features, label) pairs."""
    features = {'input_ids': ids, 'attention_mask': masks}
    return tf.data.Dataset.from_tensor_slices((features, labels)).batch(batch_size)


# Training and validation pipelines, both in batches of 16
train_dataset = _batched_dataset(X_train_ids, mask_train, y_train)
val_dataset = _batched_dataset(X_val_ids, mask_val, y_val)

# Single optimization step
@tf.function
def train_step(batch_inputs, batch_labels):
    """Run one forward/backward pass on a batch and apply the gradient update.

    Args:
        batch_inputs: Inputs for one batch, passed directly to ``model``.
        batch_labels: Targets for the batch, passed to ``loss_fn``.

    Returns:
        The loss computed by ``loss_fn`` for this batch.
    """
    with tf.GradientTape() as grad_tape:
        outputs = model(batch_inputs, training=True)
        batch_loss = loss_fn(batch_labels, outputs.logits)

    # Differentiate the loss w.r.t. every trainable weight and let the
    # optimizer update them in place.
    trainable = model.trainable_variables
    grads = grad_tape.gradient(batch_loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))
    return batch_loss

# Manual training loop
epochs = 3  # Number of passes over the training set
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    total_loss = 0.0
    num_batches = 0
    for step, (batch_inputs, batch_labels) in enumerate(train_dataset):
        loss = train_step(batch_inputs, batch_labels)
        total_loss += loss
        num_batches += 1
        # Log every 10th step so progress is visible without flooding the output
        if step % 10 == 0:
            print(f'Step {step}, Loss: {loss.numpy()}')
    # Report the mean batch loss rather than the raw sum: the sum grows with the
    # number of batches and is not comparable to the per-step losses printed above.
    # float() accepts both the accumulated tensor and the initial 0.0, and
    # max(..., 1) avoids a division by zero, so an empty dataset no longer crashes.
    mean_loss = float(total_loss) / max(num_batches, 1)
    print(f'Epoch {epoch + 1} Loss: {mean_loss}')

# Evaluation
def evaluate(model, val_dataset):
    """Compute the per-example mean loss and the accuracy of ``model``.

    Args:
        model: Callable model whose output exposes ``.logits``; it is invoked
            with ``training=False``.
        val_dataset: Iterable yielding ``(batch_inputs, batch_labels)`` pairs.

    Returns:
        A ``(avg_loss, accuracy)`` tuple of scalar tensors.

    Raises:
        ValueError: If ``val_dataset`` yields no examples.
    """
    total_loss = 0.0
    correct_predictions = 0.0
    total_examples = 0

    for batch_inputs, batch_labels in val_dataset:
        logits = model(batch_inputs, training=False).logits
        batch_size = int(tf.shape(batch_labels)[0])

        # loss_fn is expected to return the mean over the batch (the Keras
        # default reduction), so weight it by the batch size; dividing the sum
        # by the example count below then gives a true per-example average,
        # even when the last batch is smaller.
        total_loss += loss_fn(batch_labels, logits) * batch_size

        predictions = tf.argmax(logits, axis=-1)
        # Align the label dtype with the argmax output so the comparison does
        # not depend on how the labels were stored.
        labels = tf.cast(batch_labels, predictions.dtype)
        correct_predictions += tf.reduce_sum(tf.cast(predictions == labels, tf.float32))
        total_examples += batch_size

    if total_examples == 0:
        raise ValueError('val_dataset yielded no examples; cannot compute metrics')

    accuracy = correct_predictions / total_examples
    avg_loss = total_loss / total_examples
    return avg_loss, accuracy

# Score the model on the validation split and print the resulting metrics
val_loss, val_accuracy = evaluate(model=model, val_dataset=val_dataset)
validation_report = f'Validation Loss: {val_loss.numpy()}, Validation Accuracy: {val_accuracy.numpy()}'
print(validation_report)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Step 0, Loss: 0.5944465398788452
Step 10, Loss: 0.662421464920044
Step 20, Loss: 0.7089549899101257
Step 30, Loss: 0.7178081274032593
Step 40, Loss: 0.6985259056091309
Step 50, Loss: 0.6946982145309448
Step 60, Loss: 0.6829687356948853
Step 70, Loss: 0.7216099500656128
Step 80, Loss: 0.7028659582138062
Step 90, Loss: 0.6961725354194641
Step 100, Loss: 0.6712589263916016
Step 110, Loss: 0.6987200975418091
Step 120, Loss: 0.6617199182510376
Step 130, Loss: 0.6693432927131653


KeyboardInterrupt: 