In [1]:
!pip install simpletransformers pandas scikit-learn torch

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.43.1-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (

In [3]:
import pandas as pd
import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [4]:
# Load dataset
df = pd.read_csv("/content/filtered_sentiment_dataset.csv")

In [5]:
# Map sentiment labels to numerical values
label_map = {"Negative": 0, "Neutral": 1, "Positive": 2}
reverse_label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
df["sentiment"] = df["sentiment"].map(label_map)


In [6]:
# Drop NaN values
df.dropna(inplace=True)

In [7]:
# Split dataset into train (80%), validation (10%), and test (10%)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["sentiment"])
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["sentiment"])

In [11]:
# Rename columns to match simpletransformers expectations
train_df = train_df.rename(columns={"text": "text", "sentiment": "labels"})
valid_df = valid_df.rename(columns={"text": "text", "sentiment": "labels"})
test_df = test_df.rename(columns={"text": "text", "sentiment": "labels"})

In [12]:
# Convert labels to integer type
train_df["labels"] = train_df["labels"].astype(int)
valid_df["labels"] = valid_df["labels"].astype(int)
test_df["labels"] = test_df["labels"].astype(int)


In [13]:
# Define model arguments
model_args = ClassificationArgs()
model_args.num_train_epochs = 7
model_args.train_batch_size = 16
model_args.eval_batch_size = 16
model_args.learning_rate = 5e-6
model_args.weight_decay = 0.01
model_args.overwrite_output_dir = True
model_args.save_best_model = True
model_args.evaluate_during_training = True
model_args.use_early_stopping = True
model_args.early_stopping_patience = 2
model_args.output_dir = "xlm-roberta-sentiment-model"
model_args.best_model_dir = "xlm-roberta-best-model"

In [14]:
# Initialize the model
model = ClassificationModel(
    "xlmroberta", "xlm-roberta-base",
    num_labels=3,
    args=model_args,
    use_cuda=torch.cuda.is_available()
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Train the model
model.train_model(train_df, eval_df=valid_df)

Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 7:   0%|          | 0/630 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 7:   0%|          | 0/630 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 3 of 7:   0%|          | 0/630 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 4 of 7:   0%|          | 0/630 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 5 of 7:   0%|          | 0/630 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 6 of 7:   0%|          | 0/630 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 7 of 7:   0%|          | 0/630 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():


(4410,
 defaultdict(list,
             {'global_step': [630,
               1260,
               1890,
               2000,
               2520,
               3150,
               3780,
               4000,
               4410],
              'train_loss': [0.8716583251953125,
               0.617340087890625,
               0.38429513573646545,
               0.23548507690429688,
               0.18587493896484375,
               0.4386393129825592,
               0.113037109375,
               0.1146087646484375,
               0.2571563720703125],
              'mcc': [0.0,
               0.24793396104372475,
               0.330903457826062,
               0.35852521594309406,
               0.32216612778657283,
               0.2884870811392113,
               0.36945968597239875,
               0.3434913024955744,
               0.3683016346274413],
              'eval_loss': [0.43737960352173333,
               0.3975611257402203,
               0.38539276432387437,
           

In [16]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(valid_df, acc=accuracy_score)
print(f"Model Accuracy: {result['acc']:.4f}")

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/79 [00:00<?, ?it/s]

  with amp.autocast():


Model Accuracy: 0.8460


In [17]:
# Function to predict sentiment
def predict_sentiment(text, model):
    predictions, raw_outputs = model.predict([text])
    confidence = torch.softmax(torch.tensor(raw_outputs[0]), dim=0).tolist()
    sentiment = reverse_label_map[predictions[0]]
    print(f"Text: {text}\nPredicted Sentiment: {sentiment}\nConfidence Scores: {confidence}\n")
    return sentiment

In [18]:
# Predict on test set
for text in test_df["text"].tolist()[:10]:
    predict_sentiment(text, model)

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Text:  Thala  a sami  I am wit
Predicted Sentiment: Positive
Confidence Scores: [0.012395127434668616, 0.0002900147467142518, 0.987314857818617]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  Indha bgm kekumpodhu viswasam bgm mind la vandhutu pogudhu..
Predicted Sentiment: Negative
Confidence Scores: [0.7437238357615265, 0.00125226138823547, 0.25502390285023796]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  Idhan da Acting ! Kola Mass ! Saar!
Predicted Sentiment: Positive
Confidence Scores: [0.027041129415944068, 0.00021317143891421954, 0.9727456991451419]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  Yenna da last seen ha phone la potrite mode la yedutha maari irruku
Predicted Sentiment: Negative
Confidence Scores: [0.8233380623062633, 0.0017051948481216685, 0.17495674284561508]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  oruthar mela nenga viswasath katarathuga enoruthar Yan asingapaduthiringa.
Predicted Sentiment: Positive
Confidence Scores: [0.020131801744766732, 0.0002249858396632309, 0.9796432124155701]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  thala en singama vera level  h.vinoth yuvanshankarraja boney kapoor  ajith kumar pandey on level. let's see nerkonda paarvai
Predicted Sentiment: Positive
Confidence Scores: [0.04549120400623864, 0.000210768958252341, 0.9542980270355089]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  ennada panni vachu irukinga siva va avaruku ethuku intha pomma dressla pottu comedy pandringa
Predicted Sentiment: Negative
Confidence Scores: [0.8345046113912985, 0.0016524001941229738, 0.1638429884145786]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  Semma thala oruther meala neega viswasam katrathukkaga innoruthara yen neega asingam paduthuringa.... Semma thala .
Predicted Sentiment: Positive
Confidence Scores: [0.016344300265583853, 0.00022497515563057736, 0.9834307245787856]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  Ivan comedian aatchay epde hero Anaaan comedy hero subject tan sari Sivakarthigeyn kku
Predicted Sentiment: Negative
Confidence Scores: [0.7473439783996997, 0.0013473856831231154, 0.2513086359171771]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  Thalaivernna naama super Strdahan pinnittaru trailere keddha irukkunna padam vandha vera leval
Predicted Sentiment: Positive
Confidence Scores: [0.2219405685939567, 0.00037884154893590063, 0.7776805898571074]

