# **Required Libraries**

In [1]:
pip install transformers datasets torch scikit-learn


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

# **Loading Data**

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DebertaForSequenceClassification, DebertaTokenizer
from datasets import Dataset

# Load the data again and check the structure
data = pd.read_excel('/content/Reviews Classification_output.xlsx')

# Mapping labels to integers
label_map = {
    "Positive Feedback": 0,
    "Product Recommendations": 0,
    "Product Quality Issue": 1,
    "Authenticity and Originality Concerns": 1,
    "Delivery and Logistics Issues": 1,
    "Customer Service Experience": 1,
    "Returns, Refunds, and Exchange Issues": 1,
    "Scam Complaint": 1,
    "Pricing and Billing Issues": 1,
    "Warranty and Support Complaints": 1,
}

# Create a new column for numerical labels
data['label'] = data['Review_CAT'].map(label_map)

# Split data into train and validation sets
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

# Convert to Hugging Face dataset
train_dataset = Dataset.from_pandas(train_data[['description', 'label']])
val_dataset = Dataset.from_pandas(val_data[['description', 'label']])


# **Tokenizing Data**

In [3]:
from transformers import DebertaTokenizer

# Initialize the tokenizer
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['description'], padding=True, truncation=True)

# Apply tokenization to datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Ensure the correct format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


Map:   0%|          | 0/1080 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/120 [00:00<?, ? examples/s]

# **Initialize the Model**

In [None]:
import torch
from transformers import DebertaForSequenceClassification, Trainer, TrainingArguments

# Force the default tensor type to float32
torch.set_default_dtype(torch.float32)

# Load DeBERTa-base model with num_labels argument
model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=2)

# Set up training arguments for CPU with explicit FP32
training_args = TrainingArguments(
    output_dir='./results',  # Output directory for model and checkpoints
    evaluation_strategy="steps",  # Evaluate after every N steps
    save_strategy="steps",  # Save model every N steps
    learning_rate=2e-5,  # Learning rate
    per_device_train_batch_size=1,  # Batch size for training
    per_device_eval_batch_size=1,  # Batch size for evaluation
    num_train_epochs=3,  # Number of epochs
    weight_decay=0.01,  # Regularization
    logging_dir='./logs',  # Directory for logs
    logging_steps=100,  # Log every 100 steps
    save_steps=200,  # Save checkpoint every 200 steps
    load_best_model_at_end=True,  # Load the best model based on evaluation
    no_cuda=True,  # Force training on CPU
    fp16=False,  # Disable mixed precision
    bf16=False,  # Disable BFloat16 precision
    report_to="none"  # Disable WANDB if you don't want it
)

# Ensure model's data type is float32
model.to(torch.float32)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the model on CPU
trainer.train()


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
100,0.9965,1.335884


Step,Training Loss,Validation Loss
100,0.9965,1.335884
200,1.3564,1.186346
300,1.4755,1.152564
