<a href="https://colab.research.google.com/github/yogasgm/prototype_finetuning_pytorch/blob/main/Prototype_Multilabel_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Importing libraries

In [None]:
!pip install transformers

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import random
import shutil
import sys
from sklearn.model_selection import train_test_split



In [None]:
import pandas as pd
df = pd.read_excel('/content/labeled_data_fix20250424_172710.xlsx')

# Assuming 'df' from previous cells is your data:
train_df = df.copy()  # Create a copy of 'df' and name it 'train_df'

In [None]:
# Assuming your CSV file is named 'All Data_labeled.csv'
file_path = '/content/labeled_data_fix20250424_172710.xlsx'  # Assign the path to a variable
train_df = pd.read_excel(file_path) # Use the variable in pd.read_csv

# Setting seed for reproducibility

In [None]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
set_seed(43)

# Downloading dataset

In [None]:
# Load the dataset directly from the Colab file system
file_path = '/content/labeled_data_fix20250424_172710.xlsx'  # Adjust the filename as needed
train_df = pd.read_excel(file_path)

In [None]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16849 entries, 0 to 16848
Data columns (total 6 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   text                            16849 non-null  object
 1   Excessive Resource Consumption  16849 non-null  int64 
 2   Waste Mismanagement             16849 non-null  int64 
 3   Plastic Pollution               16849 non-null  int64 
 4   Fossil Fuel Dependence          16849 non-null  int64 
 5   Food Waste                      16849 non-null  int64 
dtypes: int64(5), object(1)
memory usage: 789.9+ KB


In [None]:
train_df.columns

Index(['text', 'Excessive Resource Consumption', 'Waste Mismanagement',
       'Plastic Pollution', 'Fossil Fuel Dependence', 'Food Waste'],
      dtype='object')

# Selecting required columns

In [None]:
train_df = train_df[['text', 'Excessive Resource Consumption', 'Waste Mismanagement',
       'Plastic Pollution', 'Fossil Fuel Dependence', 'Food Waste']]

In [None]:
target_list = ['Excessive Resource Consumption', 'Waste Mismanagement',
       'Plastic Pollution', 'Fossil Fuel Dependence', 'Food Waste']

# Preparing the tokenizer

In [None]:
#Set Max Lenght, maksimal 512 (BERT)
MAX_LEN = 512

In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
#download the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['text']
        self.targets = self.df[target_list].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index])
        }

# Splitting & Tokenizing Dataset

In [None]:
# Adjusting the train/validation/test split
train_df, temp_df = train_test_split(train_df, test_size=0.2, random_state=43)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=43)

# Reset the indices
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
# Label distribution in the training set
train_counts = train_df[target_list].sum(axis=0)
print("Label distribution in the training set:\n", train_counts)

# Label distribution in the validation set
val_counts = val_df[target_list].sum(axis=0)
print("\nLabel distribution in the validation set:\n", val_counts)

# Label distribution in the test set
test_counts = test_df[target_list].sum(axis=0)
print("\nLabel distribution in the test set:\n", test_counts)

Label distribution in the training set:
 Excessive Resource Consumption    2004
Waste Mismanagement               7332
Plastic Pollution                 3881
Fossil Fuel Dependence            2922
Food Waste                         395
dtype: int64

Label distribution in the validation set:
 Excessive Resource Consumption    240
Waste Mismanagement               943
Plastic Pollution                 482
Fossil Fuel Dependence            339
Food Waste                         49
dtype: int64

Label distribution in the test set:
 Excessive Resource Consumption    264
Waste Mismanagement               936
Plastic Pollution                 487
Fossil Fuel Dependence            371
Food Waste                         50
dtype: int64


In [None]:
# Label distribution in the training set
train_counts_percentage = (train_df[target_list].sum(axis=0) / len(train_df)) * 100
print("Label distribution in the training set:\n", train_counts_percentage)

# Label distribution in the validation set
val_counts_percentage = (val_df[target_list].sum(axis=0) / len(val_df)) * 100
print("\nLabel distribution in the validation set:\n", val_counts_percentage)

# Label distribution in the test set
test_counts_percentage = (test_df[target_list].sum(axis=0) / len(test_df)) * 100
print("\nLabel distribution in the test set:\n", test_counts_percentage)

Label distribution in the training set:
 Excessive Resource Consumption    14.867572
Waste Mismanagement               54.395727
Plastic Pollution                 28.792937
Fossil Fuel Dependence            21.678166
Food Waste                         2.930484
dtype: float64

Label distribution in the validation set:
 Excessive Resource Consumption    14.243323
Waste Mismanagement               55.964392
Plastic Pollution                 28.605341
Fossil Fuel Dependence            20.118694
Food Waste                         2.908012
dtype: float64

Label distribution in the test set:
 Excessive Resource Consumption    15.667656
Waste Mismanagement               55.548961
Plastic Pollution                 28.902077
Fossil Fuel Dependence            22.017804
Food Waste                         2.967359
dtype: float64


In [None]:
train_df.shape

(13479, 6)

In [None]:
val_df.shape

(1685, 6)

In [None]:
val_df

Unnamed: 0,text,Excessive Resource Consumption,Waste Mismanagement,Plastic Pollution,Fossil Fuel Dependence,Food Waste
0,as soon as the bacteria make their way into a ...,0,1,1,0,0
1,",what happened to the trash they vote for demo...",0,0,0,0,0
2,",why is the trash going to other states pollut...",0,1,0,0,0
3,",""world",0,0,0,0,0
4,",speaking of tradeoffs why not discuss the dam...",0,0,0,1,0
...,...,...,...,...,...,...
1680,",you can also own stuff in a circular economy ...",1,1,0,0,0
1681,",i might lose my sleep when i seriously start ...",0,1,1,0,0
1682,",when a zombie apocalypse occur i know who to ...",0,0,0,0,0
1683,",they need to build about more trash to power...",0,1,1,0,0


In [None]:
test_df

Unnamed: 0,text,Excessive Resource Consumption,Waste Mismanagement,Plastic Pollution,Fossil Fuel Dependence,Food Waste
0,",fyi tony soprano is the waste management con...",0,1,0,0,0
1,",use beforeexpiry dates are bs and people have...",0,0,0,0,1
2,",so plans of more energy recovery incinerators...",0,1,0,1,0
3,us trash is a huge issue in isolated asian cou...,0,1,1,0,0
4,and hats off to this recycling plant for all o...,0,1,0,0,0
...,...,...,...,...,...,...
1680,",why am i reminded of a scene from a popular s...",0,1,0,0,0
1681,i think the biggest reason why zero waste fail...,0,1,1,0,0
1682,",warning this video contains a false talking p...",0,0,0,1,0
1683,",aliens will visit the earth after we all kill...",0,1,1,0,0


In [None]:
# Create the CustomDataset for each set
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)

In [None]:
len(train_dataset)

13479

# Setting hyperparameters

In [None]:
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 5e-5

In [None]:
# Preparing the DataLoaders
train_data_loader = torch.utils.data.DataLoader(train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [None]:
# Checking for available device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
device

device(type='cuda')

# Additional functions for loading and saving checkpoints

In [None]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss
    return model, optimizer, checkpoint['epoch'], valid_loss_min

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

# Training the Model

Defining and Initializing the BERT Classification Model

In [None]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

model = BERTClass()
model.to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

Setting Up the Loss Function and Optimizer

In [None]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE)

Initialization of Validation Target and Output Lists

In [None]:
val_targets=[]
val_outputs=[]

Training and Validation Loop with Early Stopping

In [None]:
def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path, patience):

  # initialize tracker for minimum validation loss
  valid_loss_min = np.inf # Change np.Inf to np.inf
  no_improve = 0


  for epoch in range(1, n_epochs+1):
    train_loss = 0
    valid_loss = 0

    model.train()
    print('############# Epoch {}: Training Start   #############'.format(epoch))
    for batch_idx, data in enumerate(training_loader):
        #print('yyy epoch', batch_idx)
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        #if batch_idx%5000==0:
         #   print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #print('before loss data in training', loss.item(), train_loss)
        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        #print('after loss data in training', loss.item(), train_loss)

    print('############# Epoch {}: Training End     #############'.format(epoch))

    print('############# Epoch {}: Validation Start   #############'.format(epoch))
    ######################
    # validate the model #
    ######################

    model.eval()

    with torch.no_grad():
      for batch_idx, data in enumerate(validation_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
            val_targets.extend(targets.cpu().detach().numpy().tolist())
            val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

      print('############# Epoch {}: Validation End     #############'.format(epoch))
      # calculate average losses
      #print('before cal avg train loss', train_loss)
      train_loss = train_loss/len(training_loader)
      valid_loss = valid_loss/len(validation_loader)
      # print training/validation statistics
      print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))

      # create checkpoint variable and add important data
      checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
      }


      ## TODO: save the model if validation loss has decreased
      if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
              valid_loss_min,
              valid_loss
              ))
        save_ckp(checkpoint, True, checkpoint_path, best_model_path)
        valid_loss_min = valid_loss
        no_improve = 0
      else:
        no_improve += 1
        if no_improve >= patience:
          print("Early stopping due to no improvement in validation loss")
          break

  return model

In [None]:
# Save checkpoint

ckpt_path = '/content/ckpt.pth'
best_model_path = '/content/best_model.pth'

# Start Train

In [None]:
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path, patience=2)

############# Epoch 1: Training Start   #############
############# Epoch 1: Training End     #############
############# Epoch 1: Validation Start   #############
############# Epoch 1: Validation End     #############
Epoch: 1 	Avgerage Training Loss: 0.000296 	Average Validation Loss: 0.001947
Validation loss decreased (inf --> 0.001947).  Saving model ...
############# Epoch 2: Training Start   #############
############# Epoch 2: Training End     #############
############# Epoch 2: Validation Start   #############
############# Epoch 2: Validation End     #############
Epoch: 2 	Avgerage Training Loss: 0.000169 	Average Validation Loss: 0.001823
Validation loss decreased (0.001947 --> 0.001823).  Saving model ...
############# Epoch 3: Training Start   #############
############# Epoch 3: Training End     #############
############# Epoch 3: Validation Start   #############
############# Epoch 3: Validation End     #############
Epoch: 3 	Avgerage Training Loss: 0.000101 	Average

In [None]:
# Load the saved checkpoint
model, optimizer, start_epoch, valid_loss_min = load_ckp(best_model_path, model, optimizer)

print(f'The validation loss of the best saved model is: {valid_loss_min}')

The validation loss of the best saved model is: 0.001823003637916404


# Test

In [None]:
# Process new dataset
#new_dataset = CustomDataset(new_df, tokenizer, MAX_LEN)
new_dataset = test_dataset

# Create DataLoader
new_data_loader = torch.utils.data.DataLoader(new_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

# Load the model
model, optimizer, start_epoch, valid_loss_min = load_ckp(best_model_path, model, optimizer)

# Switch model to the evaluation mode
model.eval()

new_outputs = []
new_targets = []
test_loss = 0.0

# Define loss function
loss_fn = torch.nn.BCEWithLogitsLoss()

# Pass new data through the model
with torch.no_grad():
    for batch_idx, data in enumerate(new_data_loader):
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        # Calculate loss
        loss = loss_fn(outputs, targets)
        test_loss += loss.item() * data['input_ids'].size(0)

        new_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
        new_targets.extend(targets.cpu().detach().numpy().tolist())

# Average the test loss over all batches
test_loss = test_loss / len(new_data_loader.dataset)

print(f'Test Loss: {test_loss:.6f}')

Test Loss: 0.203400


In [None]:
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

# Convert the outputs and targets to numpy arrays
new_outputs_np = np.array(new_outputs)
new_targets_np = np.array(new_targets)

# Threshold the outputs (This depends on your requirements, 0.5 is used as an example)
new_outputs_bin = (new_outputs_np > 0.5)

# Calculate metrics
print(classification_report(new_targets_np, new_outputs_bin))

# Calculate macro and micro metrics
precision_macro = precision_score(new_targets_np, new_outputs_bin, average='macro')
recall_macro = recall_score(new_targets_np, new_outputs_bin, average='macro')
f1_macro = f1_score(new_targets_np, new_outputs_bin, average='macro')

precision_micro = precision_score(new_targets_np, new_outputs_bin, average='micro')
recall_micro = recall_score(new_targets_np, new_outputs_bin, average='micro')
f1_micro = f1_score(new_targets_np, new_outputs_bin, average='micro')

print(f'Macro Precision: {precision_macro} Macro Recall: {recall_macro} Macro F1: {f1_macro}')
print(f'Micro Precision: {precision_micro} Micro Recall: {recall_micro} Micro F1: {f1_micro}')

              precision    recall  f1-score   support

           0       0.78      0.50      0.61       264
           1       0.91      0.88      0.90       936
           2       0.84      0.87      0.85       487
           3       0.93      0.77      0.84       371
           4       0.68      0.60      0.64        50

   micro avg       0.88      0.80      0.84      2108
   macro avg       0.83      0.72      0.77      2108
weighted avg       0.88      0.80      0.83      2108
 samples avg       0.68      0.64      0.65      2108

Macro Precision: 0.8281528860397763 Macro Recall: 0.7236374977417556 Macro F1: 0.7676529882834838
Micro Precision: 0.8786936236391913 Micro Recall: 0.8040796963946869 Micro F1: 0.8397324746098588


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.metrics import accuracy_score

# Calculate accuracy
accuracy = accuracy_score(new_targets_np, new_outputs_bin)

print(f'Accuracy: {accuracy}')

Accuracy: 0.7204747774480712


# **PREDICTION**

In [None]:
pip install transformers torch


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification


In [None]:
# Replace with your actual model path and labels count
model_path = '/content/best_model.pth'  # Path to your saved checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # Same tokenizer as used in training
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)  # Set num_labels to your class count


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch

# Path to your checkpoint file
checkpoint_path = '/content/ckpt.pth'

# Try loading the checkpoint
try:
    state_dict = torch.load(checkpoint_path, map_location=torch.device('cuda'))
    print("Checkpoint loaded successfully.")
    print("Keys:", state_dict.keys())
except Exception as e:
    print(f"Failed to load the checkpoint: {e}")


Checkpoint loaded successfully.
Keys: dict_keys(['epoch', 'valid_loss_min', 'state_dict', 'optimizer'])


In [None]:
import torch

# Load the checkpoint and inspect its contents
checkpoint_path = '/content/ckpt.pth'  # Replace with your checkpoint file path
state_dict = torch.load(checkpoint_path, map_location=torch.device('cuda'))  # Or 'cuda' if using GPU
print(state_dict.keys())


dict_keys(['epoch', 'valid_loss_min', 'state_dict', 'optimizer'])


In [None]:
print(state_dict['state_dict'].keys())


odict_keys(['bert_model.embeddings.word_embeddings.weight', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.embeddings.token_type_embeddings.weight', 'bert_model.embeddings.LayerNorm.weight', 'bert_model.embeddings.LayerNorm.bias', 'bert_model.encoder.layer.0.attention.self.query.weight', 'bert_model.encoder.layer.0.attention.self.query.bias', 'bert_model.encoder.layer.0.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.self.value.weight', 'bert_model.encoder.layer.0.attention.self.value.bias', 'bert_model.encoder.layer.0.attention.output.dense.weight', 'bert_model.encoder.layer.0.attention.output.dense.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.0.intermediate.dense.weight', 'bert_model.encoder.layer.0.intermediate.dense.bias', 'bert_model.encoder.layer.0.output.dense.weight', 'bert_mode

In [None]:
import torch
from transformers import BertForSequenceClassification

# Path to your checkpoint file
checkpoint_path = '/content/ckpt.pth'

# Initialize the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Load the checkpoint
state_dict = torch.load(checkpoint_path, map_location=torch.device('cuda'))  # Use 'cuda' if using GPU

# Access the nested 'state_dict'
# This assumes your checkpoint saved the model's state_dict under the key 'state_dict'
state_dict = state_dict['state_dict']

# Load the model weights directly
model.load_state_dict(state_dict, strict=False) # strict=False to ignore unexpected keys

# Set the model to evaluation mode
model.eval()

print("Model successfully loaded and ready for evaluation.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model successfully loaded and ready for evaluation.


In [None]:
import numpy as np
def predict_in_batches(texts, batch_size=8):
    # Initialize lists to store predictions
    all_predictions = []

    # Process texts in batches
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        # Tokenize the batch
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )

        # Move to device (GPU/CPU)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Make predictions
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            batch_predictions = torch.sigmoid(logits)

        # Move predictions to CPU and convert to numpy
        batch_predictions = batch_predictions.cpu().numpy()
        all_predictions.extend(batch_predictions)

        # Clear GPU cache if using CUDA
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Optional: Add a progress indicator
        if i % (batch_size * 10) == 0:
            print(f"Processed {i}/{len(texts)} texts")

    return np.array(all_predictions)



In [None]:
texts = [
    "i would love to see s britghtmark recycling in every state in the usa",
    "this is very refreshing to see even though i do not care to eat meat anymore because it helped my chronic pains to go away and just improved my overall health even if i feel like eating something that tastes like meat i turn to plant based products i hate all this pollution in our environment"
]


In [None]:
predictions = predict_in_batches(texts, 8)


Processed 0/2 texts


In [None]:
for i, text in enumerate(texts):
    print(f"Text: {text}")
    print(f"Predictions: {predictions[i].tolist()}")  # Convert tensor to list for easier readability


Text: i would love to see s britghtmark recycling in every state in the usa
Predictions: [0.4493041932582855, 0.40787267684936523, 0.4879288673400879, 0.5299351215362549, 0.610774040222168]
Text: this is very refreshing to see even though i do not care to eat meat anymore because it helped my chronic pains to go away and just improved my overall health even if i feel like eating something that tastes like meat i turn to plant based products i hate all this pollution in our environment
Predictions: [0.5004604458808899, 0.33624985814094543, 0.49180617928504944, 0.5406094193458557, 0.571693480014801]


In [None]:
import pandas as pd

# Load the dataset
dataset_path = '/content/2_data_unlabeled_prediction.xlsx'  # Update with the path to your dataset
df = pd.read_excel(dataset_path)

# Display the first few rows of your dataset
print(df.head())

texts = df['text'].tolist()

                                                text
0  ,i would love to see s britghtmark recycling i...
1  ,this is very refreshing to see even though i ...
2  ,"that citizen bread to beer to bread is it re...
3  cause it first theyre taking wheat bread waste...
4  then they use this beer waste to make a second...


In [None]:
# Use the function with your data
batch_size = 8  # Adjust this based on your available RAM
predictions = predict_in_batches(texts, batch_size=batch_size)
threshold = 0.55
binary_predictions = (predictions > threshold).astype(int)

Processed 0/16849 texts
Processed 80/16849 texts
Processed 160/16849 texts
Processed 240/16849 texts
Processed 320/16849 texts
Processed 400/16849 texts
Processed 480/16849 texts
Processed 560/16849 texts
Processed 640/16849 texts
Processed 720/16849 texts
Processed 800/16849 texts
Processed 880/16849 texts
Processed 960/16849 texts
Processed 1040/16849 texts
Processed 1120/16849 texts
Processed 1200/16849 texts
Processed 1280/16849 texts
Processed 1360/16849 texts
Processed 1440/16849 texts
Processed 1520/16849 texts
Processed 1600/16849 texts
Processed 1680/16849 texts
Processed 1760/16849 texts
Processed 1840/16849 texts
Processed 1920/16849 texts
Processed 2000/16849 texts
Processed 2080/16849 texts
Processed 2160/16849 texts
Processed 2240/16849 texts
Processed 2320/16849 texts
Processed 2400/16849 texts
Processed 2480/16849 texts
Processed 2560/16849 texts
Processed 2640/16849 texts
Processed 2720/16849 texts
Processed 2800/16849 texts
Processed 2880/16849 texts
Processed 2960/16

In [None]:
# Convert predictions to a list of lists
df['predictions'] = [pred.tolist() for pred in predictions]


In [None]:
# Add predictions to the dataframe and save
df['predicted_labels'] = binary_predictions.tolist()
df.to_csv('/content/predicted_dataset.csv', index=False)
print("Predictions saved to '/content/predicted_dataset.csv'")

Predictions saved to '/content/predicted_dataset.csv'


# Test with New Input Text

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import pandas as pd


In [None]:
model_name = "bert-base-uncased"  # Replace with your model
model = BertForSequenceClassification.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Load model onto GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
class TestDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }


In [None]:
# Load test dataset
file_path = "/content/2_data_unlabeled_prediction.xlsx"  # Path to your test dataset
test_data = pd.read_excel(file_path)
texts = test_data["text"].tolist()  # Replace 'text' with the column name in your dataset

# Create DataLoader
MAX_LEN = 512
BATCH_SIZE = 16
test_dataset = TestDataset(texts, tokenizer, MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
test_texts = df['text'].astype(str).tolist()


In [None]:
# Example: Preprocessing test data
import pandas as pd

# Load your dataset
df = pd.read_excel('/content/2_data_unlabeled_prediction.xlsx')

# Select the text column and ensure it's in string format
texts = df['text'].astype(str).tolist()  # Replace 'text_column' with the actual column name containing text

# Create DataLoader or preprocess texts directly
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
MAX_LEN = 128

# Tokenize the texts
def tokenize_function(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors='pt')

tokenized_texts = [tokenize_function(text) for text in texts]

# Create a DataLoader if needed
test_loader = DataLoader(tokenized_texts, batch_size=16)

In [None]:
def predict(model, data_loader):
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in data_loader:
            # Move 'input_ids' and 'attention_mask' to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Provide token_type_ids if they are present in the batch
            # otherwise, create a tensor of zeros with the same shape as input_ids
            if 'token_type_ids' in batch:
                token_type_ids = batch['token_type_ids'].to(device)
            else:
                token_type_ids = torch.zeros_like(input_ids, device=device)

            # Pass 'input_ids', 'attention_mask', and 'token_type_ids' to the model
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

            logits = outputs.logits
            probs = torch.nn.functional.softmax(logits, dim=-1)
            predicted_classes = torch.argmax(probs, dim=-1)
            predictions.extend(predicted_classes.cpu().numpy())

    return predictions

In [None]:
def predict(model, data_loader):
    for batch in data_loader:
        if isinstance(batch, dict):
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
        else:
            input_ids, attention_mask = batch[:2]  # if tuple or list
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)


In [None]:
# ipython-input-85-a0dc5aae3232
predicted_labels = predict(model, test_loader)

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
# Add predictions to the original dataframe
test_data["predicted_label"] = predicted_labels

# Save to CSV
test_data.to_csv("/content/test_predictions.csv", index=False)
print("Predictions saved to 'test_predictions.csv'")


NameError: name 'predicted_labels' is not defined

In [None]:
def classify_text(model, text, tokenizer, max_len, threshold=0.5):
    # Prepare the text
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_token_type_ids=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    token_type_ids = inputs["token_type_ids"].to(device)

    # Get the model outputs
    with torch.no_grad():
        outputs = model(input_ids, attention_mask, token_type_ids)

    # Convert to probabilities
    probabilities = torch.sigmoid(outputs).cpu().detach().numpy().tolist()

    # Define the class labels in the same order that the model was trained on
    class_labels = ['Excessive Resource Consumption', 'Waste Mismanagement', 'Plastic Pollution', 'Fossil Fuel Dependence', 'Food Waste']


    # Convert the probabilities to labels
    predicted_labels = [class_labels[i] for i, prob in enumerate(probabilities[0]) if prob > threshold]

    return probabilities, predicted_labels


In [None]:
text = ""
probabilities, predicted_labels = classify_text(model, text, tokenizer, MAX_LEN)
print("Probabilities:", probabilities)
print("Predicted labels:", predicted_labels)

Probabilities: [[0.013511438854038715, 0.012029381468892097, 0.003705111565068364, 0.006967440247535706, 0.004753862041980028]]
Predicted labels: []


In [None]:
text = ""
probabilities, predicted_labels = classify_text(model, text, tokenizer, MAX_LEN)
print("Probabilities:", probabilities)
print("Predicted labels:", predicted_labels)

Probabilities: [[0.012559311464428902, 0.007281941827386618, 0.001910100574605167, 0.007181008346378803, 0.0025175847113132477]]
Predicted labels: []


In [None]:
text = ""
probabilities, predicted_labels = classify_text(model, text, tokenizer, MAX_LEN)
print("Probabilities:", probabilities)
print("Predicted labels:", predicted_labels)

Probabilities: [[0.010650143958628178, 0.009747701697051525, 0.01013413816690445, 0.35011836886405945, 0.04104405641555786]]
Predicted labels: []


# **PREDICTION**

In [None]:
! pip install torch transformers pandas matplotlib openpyxl




In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
import matplotlib.pyplot as plt

# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [None]:
import torch
from transformers import BertForSequenceClassification

# Define device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model architecture
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Load model weights
model_path = "/content/best_model.pth"  # Ensure the path and file are correct
try:
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    print("Model loaded successfully!")
except Exception as e:
    print(f"Failed to load model: {e}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Failed to load model: PytorchStreamReader failed reading zip archive: failed finding central directory


  state_dict = torch.load(model_path, map_location=device)


In [None]:
def predict(texts, model, tokenizer, batch_size=16):
    predictions = []

    # Process the data in batches
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        # Tokenize the input texts
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=128,  # Adjust based on your training setup
            return_tensors="pt"
        ).to(device)

        # Predict probabilities
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            probs = torch.sigmoid(logits).cpu().numpy()  # Use sigmoid for probabilities

        predictions.extend(probs)

    return np.array(predictions)


In [None]:
# Load the unlabelled dataset
dataset_path = "/content/2_data_unlabeled_prediction.xlsx"  # Replace with your dataset path
df = pd.read_excel(dataset_path)
texts = df['text'].tolist()  # Replace 'text' with the name of your text column


In [None]:
# Predict on the dataset
predictions = predict(texts, model, tokenizer)


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
# Define your labels
labels = ['Excessive Resource Consumption','Waste Mismanagement','Plastic Pollution','Fossil Fuel Dependence','Food Waste',]  # Replace with your labels

# Add predictions as new columns
for i, label in enumerate(labels):
    df[label] = predictions[:, i]

# Save the updated dataset
output_path = "predicted_dataset.xlsx"
df.to_excel(output_path, index=False)
print(f"Predictions saved to: {output_path}")


In [None]:
def plot_spider_chart(predictions, labels, title="Prediction Visualization"):
    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
    angles += angles[:1]  # Complete the circle

    # Prepare the data for plotting
    values = predictions.tolist()
    values += values[:1]  # Complete the circle

    # Plot the spider chart
    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    ax.fill(angles, values, color='blue', alpha=0.25)
    ax.plot(angles, values, color='blue', linewidth=2)
    ax.set_yticks([])
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels)
    ax.set_title(title, size=16, pad=20)
    plt.show()

# Visualize the first prediction
plot_spider_chart(predictions[0], labels, title="Prediction for First Text")
