# **Required Libraries**

In [15]:
pip install transformers datasets torch scikit-learn




# **Loading Data**

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DebertaForSequenceClassification, DebertaTokenizer
from datasets import Dataset

# Load the data again and check the structure
data = pd.read_excel('/content/Reviews Classification_output.xlsx')

# Mapping labels to integers
label_map = {
    "Positive Feedback": 0,
    "Product Recommendations": 0,
    "Product Quality Issue": 1,
    "Authenticity and Originality Concerns": 1,
    "Delivery and Logistics Issues": 1,
    "Customer Service Experience": 1,
    "Returns, Refunds, and Exchange Issues": 1,
    "Scam Complaint": 1,
    "Pricing and Billing Issues": 1,
    "Warranty and Support Complaints": 1,
}

# Create a new column for numerical labels
data['label'] = data['Review_CAT'].map(label_map)

# Split data into train and validation sets
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

# Convert to Hugging Face dataset
train_dataset = Dataset.from_pandas(train_data[['description', 'label']])
val_dataset = Dataset.from_pandas(val_data[['description', 'label']])


# **Tokenizing Data**

In [3]:
from transformers import DebertaTokenizer

# Initialize the tokenizer
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['description'], padding=True, truncation=True)

# Apply tokenization to datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Ensure the correct format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

Map:   0%|          | 0/1080 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/120 [00:00<?, ? examples/s]

# **Model Architectures with Initialization**

In [8]:
from transformers import TrainingArguments, Trainer
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import DebertaForSequenceClassification, DebertaTokenizer

# Initialize the DeBERTa model for sequence classification with 2 labels
model = DebertaForSequenceClassification.from_pretrained(
    'microsoft/deberta-base',  # Load pre-trained DeBERTa model architecture
    num_labels=2,              # Set the number of labels for classification task
    hidden_dropout_prob=0.2,   # Dropout rate for hidden layers
    attention_probs_dropout_prob=0.2  # Dropout rate for attention layers
)


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Length Recognition**

---



In [9]:
print(len(train_dataset))  # Should be 1080


1080


# **Creation Of Results Folder In Drive For Checkpoints Saving (Only Once)**

In [None]:
import os
# Create a 'results' folder inside your Google Drive (if not already created)
os.makedirs('/content/drive/MyDrive/results', exist_ok=True)


# **Existing Results Checkpoint Sending To Drive (Only Once If Needed)**

In [None]:
import shutil

# Define the local path of the 'checkpoint_50' folder
checkpoint_local_path = '/content/results/checkpoint-50'  # Update this path with your actual path

# Define the destination path in Google Drive
destination_path = '/content/drive/MyDrive/results/checkpoints/checkpoint-50'

# Copy the 'checkpoint_50' folder to Google Drive
shutil.copytree(checkpoint_local_path, destination_path)
print(f'Checkpoint folder "checkpoint-50" has been copied to Google Drive.')


Checkpoint folder "checkpoint-50" has been copied to Google Drive.


# **Training Arguments (Used with Trainer)**

In [None]:
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/results/checkpoints',  # Save model in the same folder
    evaluation_strategy="steps",  # Evaluate every N steps
    save_strategy="steps",  # Save model every N steps
    learning_rate=1e-5,  # Lower learning rate
    per_device_train_batch_size=4,  # Use a smaller batch size of 1
    per_device_eval_batch_size=4,  # Same for eval
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate a larger batch size
    num_train_epochs=3,  # Training for 3 epochs
    weight_decay=0.2,# Increased weight decay for regularization
    max_grad_norm=1.0,# Clip gradients to avoid explosion
    logging_dir='./logs',
    eval_steps=50,# Match the frequency of evaluation to logging
    logging_steps=50,  # Log every 50 steps
    save_steps=50,  # Save model every 50 steps
    load_best_model_at_end=True,  # Load the best model based on validation loss
    no_cuda=True,  # Force CPU training
    fp16=False,  # Disable mixed precision
    bf16=False,
    report_to="none",  # Disable reporting to W&B
)

# Early Stopping
early_stopping = EarlyStoppingCallback(early_stopping_patience=10)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping]  # Add early stopping
)

# Path to the checkpoint folder
checkpoint_path = '/content/drive/MyDrive/results/checkpoints/checkpoint-450'

# Resume training from checkpoint
trainer.train(resume_from_checkpoint=checkpoint_path)


  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss
