In [7]:
%pip install transformers torch pandas scikit-learn tf-keras accelerate>=0.26.0

Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd

# Location of the raw CSV file
file_path = "training.1600000.processed.noemoticon.csv"

# Names attached to the six columns once the file has been read
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Read the file without treating any row as a header; rows that pandas cannot
# parse are skipped instead of raising.
df = pd.read_csv(
    file_path,
    sep=',',
    header=None,
    encoding='latin-1',
    on_bad_lines='skip',
)
df.columns = column_names

# Quick look at the first rows
print(df.head())


   target         ids                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  


In [9]:
import html
import re

# Re-encode the raw `target` codes as contiguous class ids (0 -> 0, 2 -> 1, 4 -> 2).
# Series.map yields NaN for any target value that is not a key of this dict.
target_to_label = {0: 0, 2: 1, 4: 2}
df['label'] = df['target'].map(target_to_label)


def clean_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+|\#', '', text)  
    text = re.sub(r'[^\w\s]', '', text)  
    return text.strip().lower()  


# Store a cleaned copy of every text next to the raw column
df['text_clean'] = df['text'].map(clean_text)

# Report how many rows carry each label
label_counts = df['label'].value_counts()
print("\nDistribuição das classes:")
print(label_counts)


Distribuição das classes:
label
0    800000
2    800000
Name: count, dtype: int64


In [10]:
from sklearn.model_selection import train_test_split
import pandas as pd


# Draw a label-stratified sample: 5000 rows for training and 500 for testing
train_df, test_df = train_test_split(
    df,
    train_size=5000,
    test_size=500,
    stratify=df['label'],
    random_state=42,
)

# Write both splits to disk as UTF-8 CSV files without the index column
for split_frame, csv_path in ((train_df, 'train_dataset.csv'), (test_df, 'test_dataset.csv')):
    split_frame.to_csv(csv_path, index=False, encoding='utf-8')

print(f"Tamanho do conjunto de treino: {len(train_df)}")
print(f"Tamanho do conjunto de teste: {len(test_df)}")
print("\nArquivos salvos com sucesso:")


Tamanho do conjunto de treino: 5000
Tamanho do conjunto de teste: 500

Arquivos salvos com sucesso:


In [11]:
# Reload the persisted splits.
# keep_default_na=False stops pandas from converting empty fields (and tokens
# such as "na", "null" or "nan") into NaN. Without it, a text that was cleaned
# down to an empty string comes back as a float NaN, which str() later renders
# as the literal word "nan" when it is tokenized.
train_df = pd.read_csv('train_dataset.csv', encoding='utf-8', keep_default_na=False)
test_df = pd.read_csv('test_dataset.csv', encoding='utf-8', keep_default_na=False)


print("Exemplo do conjunto de treino:")
print(train_df.head())
print("\nExemplo do conjunto de teste:")
print(test_df.head())

Exemplo do conjunto de treino:
   target         ids                          date      flag        user  \
0       4  1970090650  Sat May 30 02:24:58 PDT 2009  NO_QUERY        9330   
1       0  2251505002  Sat Jun 20 03:43:20 PDT 2009  NO_QUERY    GeemaPee   
2       4  2045755369  Fri Jun 05 11:11:19 PDT 2009  NO_QUERY    bustyb73   
3       0  2185330476  Mon Jun 15 17:13:15 PDT 2009  NO_QUERY  kimberliea   
4       0  2186502804  Mon Jun 15 18:56:01 PDT 2009  NO_QUERY    anggelai   

                                                text  label  \
0  I got shutter shades yesterday and I'm still i...      2   
1          @ChelseaGA Oh aye, i mind you saying now       0   
2  @harmony341 yeah went today and had blood done...      2   
3  Had a customer tell me that I was &quot;unplea...      0   
4    : my stomach aches..  http://plurk.com/p/117ner      0   

                                          text_clean  
0  i got shutter shades yesterday and im still in...  
1                

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW 
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Settings
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 16
epochs = 3
learning_rate = 5e-5
seed = 42  # same value as the random_state used for the train/test split

# Seed PyTorch's RNG before the model is built: the classification head is
# randomly initialized and the training DataLoader shuffles, so without a fixed
# seed every run produces different numbers.
torch.manual_seed(seed)

# Initialize the tokenizer and the model
# num_labels=3 matches the 0/1/2 label encoding created earlier in the notebook
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to(device)

# Custom dataset
class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: positionally indexable sequence of texts (e.g. a NumPy array).
        labels: positionally indexable sequence of integer class ids, same
            length as ``texts``.
        tokenizer: callable accepting ``(text, truncation=, max_length=,
            padding=, return_tensors=)`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors whose first
            dimension is 1.
        max_len (int): length every encoded sequence is truncated/padded to.

    Raises:
        ValueError: if ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded example at position ``idx``.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` with the leading
            batch dimension removed, plus ``'labels'`` as a ``torch.long``
            scalar tensor.
        """
        raw_text = self.texts[idx]
        # A missing text (None, or the float NaN pandas produces for an empty
        # CSV field) must become an empty string; str() would otherwise feed
        # the literal words "None"/"nan" to the tokenizer. NaN is the only
        # float that compares unequal to itself.
        is_missing = raw_text is None or (isinstance(raw_text, float) and raw_text != raw_text)
        text = '' if is_missing else str(raw_text)
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt'
        )

        return {
            # squeeze(0) drops the batch dimension added by return_tensors='pt'
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Build the datasets and their data loaders
train_dataset = TweetDataset(
    texts=train_df['text_clean'].values,
    labels=train_df['label'].values,
    tokenizer=tokenizer,
)
test_dataset = TweetDataset(
    texts=test_df['text_clean'].values,
    labels=test_df['label'].values,
    tokenizer=tokenizer,
)

# Only the training loader shuffles its examples
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)

# Optimizer
optimizer = AdamW(params=model.parameters(), lr=learning_rate)

# Training routine
def train_epoch(model, data_loader, optimizer, device):
    """Run one optimization pass over ``data_loader``.

    Every batch must be a mapping with 'input_ids', 'attention_mask' and
    'labels' tensors. The model is called with the labels and must return an
    object exposing ``.loss`` and ``.logits``.

    Args:
        model: model to train; switched to training mode here.
        data_loader: iterable of batches.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.

    Returns:
        tuple: (mean loss per batch, fraction of examples predicted correctly).
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for batch in tqdm(data_loader, desc="Treinando"):
        labels = batch['labels'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        input_ids = batch['input_ids'].to(device)

        # Clear gradients left over from the previous step before the new pass
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        outputs.loss.backward()
        optimizer.step()

        running_loss += outputs.loss.item()
        predicted = outputs.logits.argmax(dim=1)
        n_correct += (predicted == labels).sum().item()
        n_seen += labels.size(0)

    accuracy = n_correct / n_seen
    mean_loss = running_loss / len(data_loader)
    return mean_loss, accuracy

# Evaluation routine
def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without computing gradients.

    Every batch must be a mapping with 'input_ids', 'attention_mask' and
    'labels' tensors; the model output must expose ``.logits``.

    Args:
        model: model to score; switched to evaluation mode here.
        data_loader: iterable of batches.
        device: device the batch tensors are moved to.

    Returns:
        tuple: (accuracy, weighted-average F1 score).
    """
    model.eval()
    hits, seen = 0, 0
    predicted_labels, true_labels = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Avaliando"):
            labels = batch['labels'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            input_ids = batch['input_ids'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            preds = logits.argmax(dim=1)

            hits += (preds == labels).sum().item()
            seen += labels.size(0)

            # Move results to the CPU so they can be handed to scikit-learn
            predicted_labels.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = hits / seen
    weighted_f1 = f1_score(true_labels, predicted_labels, average='weighted')
    return accuracy, weighted_f1

# Training loop: one training pass followed by one evaluation pass per epoch
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")

    train_metrics = train_epoch(model, train_loader, optimizer, device)
    train_loss, train_acc = train_metrics
    print(f"Loss de Treinamento: {train_loss:.4f}, Acurácia de Treinamento: {train_acc:.4f}")

    eval_metrics = evaluate(model, test_loader, device)
    val_acc, val_f1 = eval_metrics
    print(f"Acurácia de Validação: {val_acc:.4f}, F1-Score de Validação: {val_f1:.4f}")

# Save the model and the tokenizer side by side in the same directory
output_dir = './bert_sentiment_model'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


  return forward_call(*args, **kwargs)
Treinando: 100%|██████████| 313/313 [38:08<00:00,  7.31s/it]


Loss de Treinamento: 0.5526, Acurácia de Treinamento: 0.7280


Avaliando: 100%|██████████| 32/32 [01:06<00:00,  2.07s/it]


Acurácia de Validação: 0.7740, F1-Score de Validação: 0.7704

Epoch 2/3


  return forward_call(*args, **kwargs)
Treinando: 100%|██████████| 313/313 [38:45<00:00,  7.43s/it]


Loss de Treinamento: 0.3340, Acurácia de Treinamento: 0.8624


Avaliando: 100%|██████████| 32/32 [01:06<00:00,  2.07s/it]


Acurácia de Validação: 0.7800, F1-Score de Validação: 0.7800

Epoch 3/3


  return forward_call(*args, **kwargs)
Treinando: 100%|██████████| 313/313 [36:42<00:00,  7.04s/it]


Loss de Treinamento: 0.1624, Acurácia de Treinamento: 0.9406


Avaliando: 100%|██████████| 32/32 [00:58<00:00,  1.82s/it]


Acurácia de Validação: 0.7720, F1-Score de Validação: 0.7714


('./bert_sentiment_model/tokenizer_config.json',
 './bert_sentiment_model/special_tokens_map.json',
 './bert_sentiment_model/vocab.txt',
 './bert_sentiment_model/added_tokens.json')

: 