In [1]:
import transformers

# Show which transformers release is installed in this kernel, so the recorded
# outputs in the rest of the notebook can be tied to a library version.
installed_version = transformers.__version__
print(installed_version)

4.52.4


Loading Libraries

In [2]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

Data Preprocessing

In [16]:
# Load the labelled corpus and keep only rows with usable text and a usable label.
raw_df = pd.read_csv("merged_output.csv")

# .copy() yields an independent frame, so the column assignments below never
# write into a slice of another frame (avoids pandas' SettingWithCopyWarning).
df = raw_df.dropna(subset=["text", "label"]).copy()
df["text"] = df["text"].astype(str)

# Drop rows whose text is blank / whitespace-only, or the literal string "nan".
has_text = (df["text"].str.strip() != "") & (df["text"] != "nan")
df = df[has_text].copy()

# Infinite labels can only appear in a floating-point column; drop them there.
if pd.api.types.is_float_dtype(df["label"]):
    df = df[~df["label"].isin([np.inf, -np.inf])].copy()

if df.empty:
    raise ValueError("No usable rows left in merged_output.csv after cleaning.")

# Class names are the string form of each distinct label. Sorting them makes
# the name -> id assignment deterministic from one run to the next.
df["label"] = df["label"].astype(str)
label_list = sorted(df["label"].unique())
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

# Integer class ids. Every value of df["label"] is a key of label2id by
# construction, so the mapping cannot produce missing values.
df["labels"] = df["label"].map(label2id).astype(int)


Loading Model

In [17]:
# Load the pretrained DistilBERT tokenizer and a sequence-classification model
# whose number of output classes matches the label set built from the data.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Derive the name -> id mapping by inverting id2label, so the two mappings
# handed to the model config cannot disagree.
name_to_id = dict(zip(id2label.values(), id2label.keys()))
classifier_settings = dict(
    num_labels=len(label_list),
    id2label=id2label,
    label2id=name_to_id,
)
model = AutoModelForSequenceClassification.from_pretrained(model_name, **classifier_settings)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preprocessing The Dataset

In [18]:
def preprocess(examples):
    """Tokenize one batch of examples.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(batched=True)``.
            Only its ``"text"`` entry (a list of texts) is read.

    Returns:
        The tokenizer output for the batch, truncated to at most 128 tokens
        and padded to the longest sequence in the batch.
    """
    # Missing texts become empty strings so the tokenizer always receives str.
    texts = [str(text) if text is not None else "" for text in examples["text"]]
    return tokenizer(texts, truncation=True, padding=True, max_length=128)


# Build the train/test split here so the notebook runs from a fresh kernel.
# Only "text" and the integer "labels" column are carried over: the string
# "label" column must not reach the Trainer's data collator.
# NOTE(review): test_size=0.2 reproduces the 2400/600 row counts in the
# recorded output; the seed and the lack of stratification are assumptions.
dataset = Dataset.from_pandas(df[["text", "labels"]], preserve_index=False).train_test_split(
    test_size=0.2, seed=42
)

tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=["text"])

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Training Arguments

In [19]:
# Where checkpoints and logs are written.
output_settings = {
    "output_dir": "./sentiment_model",
    "logging_dir": "./logs",
}

# Save, evaluate and log once per epoch.
schedule_settings = {
    "save_strategy": "epoch",
    "eval_strategy": "epoch",
    "logging_strategy": "epoch",
}

# Optimisation hyperparameters.
optimisation_settings = {
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
}

# After training, reload the checkpoint with the lowest evaluation loss.
model_selection_settings = {
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": False,
}

training_args = TrainingArguments(
    **output_settings,
    **schedule_settings,
    **optimisation_settings,
    **model_selection_settings,
)

Model Training

In [20]:
# Fine-tuning driver: trains on the "train" split and evaluates on the "test"
# split of the tokenized dataset, using the arguments configured above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated on Trainer.__init__ in the transformers
    # release this notebook runs on (4.52.4); `processing_class=` replaces it.
    processing_class=tokenizer,
)

  trainer = Trainer(


Training and Inference

In [21]:
# Fine-tune, then write the model and its tokenizer into one directory so both
# can later be reloaded from that single path.
export_dir = "finetuned-sentiment-model"

trainer.train()
trainer.save_model(export_dir)
# Kept as the cell's last expression so the written tokenizer files are displayed.
tokenizer.save_pretrained(export_dir)

Epoch,Training Loss,Validation Loss
1,0.3779,0.215253
2,0.1366,0.231724
3,0.0745,0.231725


('finetuned-sentiment-model\\tokenizer_config.json',
 'finetuned-sentiment-model\\special_tokens_map.json',
 'finetuned-sentiment-model\\vocab.txt',
 'finetuned-sentiment-model\\added_tokens.json',
 'finetuned-sentiment-model\\tokenizer.json')

In [3]:
# Reload the fine-tuned model and tokenizer from disk, so this cell works in a
# fresh kernel without re-running training.
model_path = "finetuned-sentiment-model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Class id -> label name, as stored in the saved model config.
id2label = model.config.id2label


def predict_sentiment(text):
    """Return the predicted class id for a single piece of text.

    Args:
        text: The text to classify (one string).

    Returns:
        int: Index of the highest-scoring class. ``id2label`` maps it to the
        label name stored with the model.
    """
    # max_length=128 mirrors the truncation length used when the training data
    # was tokenized, so inference sees inputs of the same maximum length.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Take the argmax over the class axis of the first (only) row; a bare
    # argmax() would index into the flattened logits tensor instead.
    return logits[0].argmax(dim=-1).item()


output_class = predict_sentiment(input("Enter the Text to analyze the sentiment:"))

# NOTE(review): this assumes class id 1 is the positive class. The name stored
# with the model is id2label[output_class] -- confirm the two agree.
print("Positive" if output_class == 1 else "Negative")


Positive
