In [55]:
#!pip install kagglehub
#!pip install pandas
#!pip install nltk
#!pip install transformers[torch] 
#!pip install emoji
#!pip install datasets 
#!pip install scikit-learn 
#!pip install evaluate
#!pip install py-cpuinfo gputil psutil
#!pip install svgling
#!pip install benepar

In [2]:
# Imports Dataset Kaggle
import kagglehub
# Download latest version
path = kagglehub.dataset_download("mariumfaheem666/spam-sms-classification-using-nlp")+"/"+"Spam_SMS.csv"

In [3]:
import nltk
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import emoji
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW


nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


# Definição de Classes
class Mensagem:
    def __init__(self, mensagem_original, classificacao):
        self.MensagemOriginal = mensagem_original
        self.Classificacao = classificacao
        self.ClassificacaoInt = 1 if classificacao == 'spam' else 0

        #variável temporária para tratamentos
        text = self.MensagemOriginal
        text = emoji.demojize(text, language='en')
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        text = text.encode("utf-8", "ignore").decode()  
        text = re.sub(r'<.*?>', '', text)
        text = re.sub(r'\s+', ' ', text)
        sentences = sent_tokenize(text)
        cleaned_sentences = [s.strip() for s in sentences]
        
        # Msg Tratada (limpa)
        self.MensagemTratada = ' '.join(cleaned_sentences)
        
        self.MensagemTokenizada = word_tokenize(self.MensagemTratada)
        self.QuantidadeCaracteres = len(self.MensagemTratada)
        self.QuantidadePalavras = len(self.MensagemTokenizada)

        tagged = pos_tag(self.MensagemTokenizada)
        verbs = [word for word, tag in tagged if tag.startswith('VB')]
        self.QuantidadeVerbos = len(verbs)
        nouns = [word for word, tag in tagged if tag.startswith('NN')]
        self.QuantidadeSubstantivos = len(nouns)

        
class BaseMensagens:
    def __init__(self, listamsgs):
        self.BaseMensagens = []
        for index, row in listamsgs.iterrows():
            self.BaseMensagens.append(Mensagem(row['Message'],row['Class']))
        self.BaseMensagensDataFrame = pd.DataFrame([{'text': p.MensagemTratada, 'target': p.ClassificacaoInt} for p in self.BaseMensagens])

    def Exec(self, percent):
        train_X, test_X, train_Y, test_Y = train_test_split(self.BaseMensagensDataFrame['text'], self.BaseMensagensDataFrame['target'], train_size = percent, shuffle = True)
        train_tokens = tokenizer(list(train_X), padding = True, truncation=True)
        test_tokens = tokenizer(list(test_X), padding = True, truncation=True)
        return BaseTreinamento(train_X, train_tokens, train_Y), BaseTeste(test_X, test_tokens, test_Y)

class BaseTreinamento(Dataset):
    def __init__(self, X, tokens, Y):
        self.text_data = X
        self.tokens = tokens
        self.labels = list(Y)

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        sample = {}
        for k, v in self.tokens.items():
            sample[k] = torch.tensor(v[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

class BaseTeste(Dataset):
    def __init__(self, X, tokens, Y):
        self.text_data = X
        self.tokens = tokens
        self.labels = list(Y)

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        sample = {}
        for k, v in self.tokens.items():
            sample[k] = torch.tensor(v[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

class BertHandler:
    def __init__(self, train, test, p_batch_size = 40, bert_pretrained_model = 'bert-base-cased'):
        self.batch_size = p_batch_size
        self.train_loader = DataLoader(train, shuffle=True, batch_size=self.batch_size)
        self.test_loader = DataLoader(test, shuffle=True, batch_size=self.batch_size)
        self.bert_model = BertForSequenceClassification.from_pretrained(bert_pretrained_model) # Pre-trained model
        self.optimizer = AdamW(self.bert_model.parameters(), lr=1e-5) # Optimization function
        self.loss_fn = torch.nn.CrossEntropyLoss() # Loss function
        
    def PrintMachineSpecs(self):
        import platform
        import psutil
        import cpuinfo
        import GPUtil
        import subprocess
        
        # Informações do sistema operacional
        print("===== SISTEMA OPERACIONAL =====")
        print(f"Sistema: {platform.system()} {platform.release()}")
        print(f"Arquitetura: {platform.architecture()[0]}")
        print(f"Nome do computador: {platform.node()}")
        
        # Informações da CPU
        print("\n===== CPU =====")
        cpu_info = cpuinfo.get_cpu_info()
        print(f"Nome da CPU: {cpu_info['brand_raw']}")
        print(f"Núcleos físicos: {psutil.cpu_count(logical=False)}")
        print(f"Núcleos lógicos: {psutil.cpu_count(logical=True)}")
        print(f"Frequência atual: {psutil.cpu_freq().current / 1000:.2f} GHz")
        
        # Informações de memória RAM
        print("\n===== MEMÓRIA RAM =====")
        mem = psutil.virtual_memory()
        print(f"Total: {mem.total / (1024 ** 3):.2f} GB")
        result = subprocess.run(
            ['powershell.exe', '-Command',
             "Get-CimInstance Win32_PhysicalMemory | Measure-Object -Property Capacity -Sum | % { '{0:N2}' -f ($_.Sum / 1GB) }"],
            capture_output=True,
            text=True
        )
        print(f"RAM Física Total (via PowerShell): {result.stdout.strip()} GB")
        
        # Informações da GPU
        print("\n===== GPU =====")
        gpus = GPUtil.getGPUs()
        if gpus:
            for gpu in gpus:
                print(f"Nome: {gpu.name}")
                print(f"Memória Total: {gpu.memoryTotal} MB")
                print(f"Driver: {gpu.driver}")
                print(f"ID: {gpu.id}")
        else:
            print("Nenhuma GPU dedicada detectada.")

    def RunTrainAndTest(self, num_epochs = 3):
        from datetime import datetime
        device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
        self.bert_model.to(device) # Transfer model to GPU if available
        print('##############################################################')
        print('############### BERT TRAINING AND TEST PROCESS ###############')
        print('##############################################################')
        print('')
        self.PrintMachineSpecs()
        print('')
        begin = datetime.now()
        print('############### BEGIN: '+begin.strftime("%d/%m/%Y %H:%M:%S")+' ###############')
        for epoch in range(num_epochs):
            print('')
            print("############### EPOCH: ",(epoch + 1))
            # TRAINING BLOCK STARTS
            self.bert_model.train()
            for i,batch in enumerate(self.train_loader):    
                batch = {k: v.to(device) for k, v in batch.items()}
                
                # Setting the gradients to zero
                self.optimizer.zero_grad()
                
                # Passing the data to the model
                outputs = self.bert_model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])
                
                # The logits will be used for measuring the loss
                pred = outputs.logits
                loss = self.loss_fn(pred, batch['labels'])
        
                # Calculating the gradient for the loss function
                loss.backward()
                
                # Optimizing the parameters of the bert model
                self.optimizer.step()
        
                # Calculating the running loss for logging purposes
                train_batch_loss = loss.item()
                train_last_loss = train_batch_loss / self.batch_size
        
                print('Training batch {} last loss: {}'.format(i + 1, train_last_loss))
            # Logging epoch-wise training loss
            print(f"\nTraining epoch {epoch + 1} loss: ",train_last_loss)
            # TRAINING BLOCK ENDS 
            
            # TESTING BLOCK STARTS
            self.bert_model.eval()
            correct = 0
            test_pred = []
            for i, batch in enumerate(self.test_loader):
                batch = {k: v.to(device) for k, v in batch.items()}
                
                # We don't need gradients for testing
                with torch.no_grad():
                    outputs = self.bert_model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])
                
                # Logits act as predictions
                logits = outputs.logits
                
                # Calculating total batch loss using the logits and labels
                loss = self.loss_fn(logits, batch['labels'])
                test_batch_loss = loss.item()
                
                # Calculating the mean batch loss
                test_last_loss = test_batch_loss / self.batch_size
                print('Testing batch {} loss: {}'.format(i + 1, test_last_loss))
                
                # Comparing the predicted target with the labels in the batch
                correct += (logits.argmax(1) == batch['labels']).sum().item()
                print("Testing accuracy: ",correct/((i + 1) * self.batch_size))
            
            total_test_samples = len(self.test_loader.dataset)
            final_accuracy = correct / total_test_samples
            print(f"\nTesting epoch {epoch + 1} last loss: ",test_last_loss)
            print(f"Final Testing Accuracy (Epoch {epoch + 1}): {final_accuracy:.4f}")
            # TESTING BLOCK ENDS
        end = datetime.now()
        print('############### END: '+begin.strftime("%d/%m/%Y %H:%M:%S")+' ###############')
        elapsed_minutes = (end - begin).total_seconds() / 60
        print(f"############### TOTAL ELAPSED TIME: {elapsed_minutes:.2f} minutes ###############")


[nltk_data] Downloading package punkt to /home/aletyska/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/aletyska/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [4]:
spam_sms = pd.read_csv(path)

In [5]:
base = BaseMensagens(spam_sms)

In [6]:
train, test = base.Exec(0.7)

In [7]:
bert = BertHandler(train, test)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
bert.PrintMachineSpecs()

===== SISTEMA OPERACIONAL =====
Sistema: Linux 6.6.87.1-microsoft-standard-WSL2
Arquitetura: 64bit
Nome do computador: GAMING-SERVER

===== CPU =====
Nome da CPU: AMD Ryzen 7 2700 Eight-Core Processor
Núcleos físicos: 8
Núcleos lógicos: 16
Frequência atual: 3.39 GHz

===== MEMÓRIA RAM =====
Total: 15.58 GB
RAM Física Total (via PowerShell): 32,00 GB

===== GPU =====
Nome: NVIDIA GeForce RTX 3060
Memória Total: 12288.0 MB
Driver: 572.83
ID: 0


In [9]:
bert.RunTrainAndTest(num_epochs=1)

##############################################################
############### BERT TRAINING AND TEST PROCESS ###############
##############################################################

===== SISTEMA OPERACIONAL =====
Sistema: Linux 6.6.87.1-microsoft-standard-WSL2
Arquitetura: 64bit
Nome do computador: GAMING-SERVER

===== CPU =====
Nome da CPU: AMD Ryzen 7 2700 Eight-Core Processor
Núcleos físicos: 8
Núcleos lógicos: 16
Frequência atual: 3.39 GHz

===== MEMÓRIA RAM =====
Total: 15.58 GB
RAM Física Total (via PowerShell): 32,00 GB

===== GPU =====
Nome: NVIDIA GeForce RTX 3060
Memória Total: 12288.0 MB
Driver: 572.83
ID: 0

############### BEGIN: 14/06/2025 17:07:31 ###############

############### EPOCH:  1
Training batch 1 last loss: 0.024144372344017027
Training batch 2 last loss: 0.02541147470474243
Training batch 3 last loss: 0.024397845566272735
Training batch 4 last loss: 0.02365110218524933
Training batch 5 last loss: 0.021578845381736756
Training batch 6 last loss: 0.0208