In [1]:
!pip install simpletransformers pandas scikit-learn torch

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.43.1-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (

In [2]:
import pandas as pd
import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Load dataset
df = pd.read_csv("/content/filtered_sentiment_dataset.csv")

In [4]:
# Map sentiment labels to numerical values
label_map = {"Negative": 0, "Neutral": 1, "Positive": 2}
reverse_label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
df["sentiment"] = df["sentiment"].map(label_map)

In [5]:
# Drop NaN values
df.dropna(inplace=True)

In [6]:
# Split dataset into train (80%), validation (10%), and test (10%)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["sentiment"])
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["sentiment"])

In [7]:
# Rename columns to match simpletransformers expectations
train_df = train_df.rename(columns={"text": "text", "sentiment": "labels"})
valid_df = valid_df.rename(columns={"text": "text", "sentiment": "labels"})
test_df = test_df.rename(columns={"text": "text", "sentiment": "labels"})

In [8]:
# Convert labels to integer type
train_df["labels"] = train_df["labels"].astype(int)
valid_df["labels"] = valid_df["labels"].astype(int)
test_df["labels"] = test_df["labels"].astype(int)


In [9]:
# Define model arguments
model_args = ClassificationArgs()
model_args.num_train_epochs = 7
model_args.train_batch_size = 16
model_args.eval_batch_size = 16
model_args.learning_rate = 5e-6
model_args.weight_decay = 0.01
model_args.overwrite_output_dir = True
model_args.save_best_model = True
model_args.evaluate_during_training = True
model_args.use_early_stopping = True
model_args.early_stopping_patience = 2
model_args.output_dir = "muril-sentiment-model"
model_args.best_model_dir = "muril-best-model"

In [10]:
# Initialize the MuRIL model
model = ClassificationModel(
    "bert", "google/muril-base-cased",
    num_labels=3,
    args=model_args,
    use_cuda=torch.cuda.is_available()
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

In [11]:
# Train the model
model.train_model(train_df, eval_df=valid_df)

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 7:   0%|          | 0/630 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 7:   0%|          | 0/630 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 3 of 7:   0%|          | 0/630 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 4 of 7:   0%|          | 0/630 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 5 of 7:   0%|          | 0/630 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 6 of 7:   0%|          | 0/630 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 7 of 7:   0%|          | 0/630 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():


(4410,
 defaultdict(list,
             {'global_step': [630,
               1260,
               1890,
               2000,
               2520,
               3150,
               3780,
               4000,
               4410],
              'train_loss': [0.5902099609375,
               0.364105224609375,
               0.6353657841682434,
               0.4076690673828125,
               0.5882466435432434,
               0.466461181640625,
               0.448486328125,
               0.5507392883300781,
               0.2102152556180954],
              'mcc': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
              'eval_loss': [0.6549929848200158,
               0.49705183468287506,
               0.47252323469029195,
               0.4703284154964399,
               0.4416508089892472,
               0.43336416007597234,
               0.4313452998294106,
               0.4306125561647777,
               0.42935838993591596]}))

In [12]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(valid_df, acc=accuracy_score)
print(f"Model Accuracy: {result['acc']:.4f}")


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/79 [00:00<?, ?it/s]

  with amp.autocast():


Model Accuracy: 0.8381


In [13]:
# Function to predict sentiment
def predict_sentiment(text, model):
    predictions, raw_outputs = model.predict([text])
    confidence = torch.softmax(torch.tensor(raw_outputs[0]), dim=0).tolist()
    sentiment = reverse_label_map[predictions[0]]
    print(f"Text: {text}\nPredicted Sentiment: {sentiment}\nConfidence Scores: {confidence}\n")
    return sentiment

In [14]:
# Predict on test set
for text in test_df["text"].tolist()[:10]:
    predict_sentiment(text, model)

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Text:  Thala  a sami  I am wit
Predicted Sentiment: Positive
Confidence Scores: [0.10203440184017111, 0.012640818269233554, 0.8853247798905953]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  Indha bgm kekumpodhu viswasam bgm mind la vandhutu pogudhu..
Predicted Sentiment: Positive
Confidence Scores: [0.34358192501914053, 0.19620369253463898, 0.4602143824462206]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  Idhan da Acting ! Kola Mass ! Saar!
Predicted Sentiment: Positive
Confidence Scores: [0.11203164996584278, 0.013842131760000887, 0.8741262182741564]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  Yenna da last seen ha phone la potrite mode la yedutha maari irruku
Predicted Sentiment: Positive
Confidence Scores: [0.3427780340519141, 0.1890286858393391, 0.4681932801087467]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  oruthar mela nenga viswasath katarathuga enoruthar Yan asingapaduthiringa.
Predicted Sentiment: Positive
Confidence Scores: [0.10143327389947347, 0.012637107309347915, 0.8859296187911787]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  thala en singama vera level  h.vinoth yuvanshankarraja boney kapoor  ajith kumar pandey on level. let's see nerkonda paarvai
Predicted Sentiment: Positive
Confidence Scores: [0.1037280015003806, 0.01269007198719116, 0.8835819265124282]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  ennada panni vachu irukinga siva va avaruku ethuku intha pomma dressla pottu comedy pandringa
Predicted Sentiment: Positive
Confidence Scores: [0.3436851046886123, 0.19349718299006619, 0.46281771232132146]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  Semma thala oruther meala neega viswasam katrathukkaga innoruthara yen neega asingam paduthuringa.... Semma thala .
Predicted Sentiment: Positive
Confidence Scores: [0.10122440406423307, 0.012725519743837406, 0.8860500761919295]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  Ivan comedian aatchay epde hero Anaaan comedy hero subject tan sari Sivakarthigeyn kku
Predicted Sentiment: Positive
Confidence Scores: [0.30087164455201576, 0.10519449194625981, 0.5939338635017245]



0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Text:  Thalaivernna naama super Strdahan pinnittaru trailere keddha irukkunna padam vandha vera leval
Predicted Sentiment: Positive
Confidence Scores: [0.11043818963570312, 0.013550636680356356, 0.8760111736839404]



In [15]:
# Save the trained model
model.save_model("my_muril_sentiment_model")