In [2]:
!pip install simpletransformers pandas scikit-learn torch


Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.42.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (

In [3]:
import pandas as pd
import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

Using device: cuda


In [7]:
sentence_df = pd.read_csv("/content/nlp/codemix_sentiment_data.csv")

In [8]:
print("Dataset Preview:")
display(sentence_df.head())

Dataset Preview:


Unnamed: 0,Sentence,Sentiment
0,We need Mr chari 's review on master,NTL
1,worst government . #YSRCP chala chethha ga par...,NEG
2,bayya nuvvu emina cheppu kani bagoledu ani che...,NEG
3,Dube gadini vadilesi manchhi Pani chesaru @RCB...,POS
4,I came to watch thyview 's review crying after...,POS


In [9]:
sentence_df = sentence_df[['Sentence', 'Sentiment']]
sentence_df.dropna(inplace=True)

In [10]:
sentence_df.columns = sentence_df.columns.str.strip()

In [11]:
sentence_df["Sentiment"] = sentence_df["Sentiment"].astype(str).str.strip()

In [12]:
label_map = {"NEG": 0, "NTL": 1, "POS": 2}
sentence_df["Sentiment"] = sentence_df["Sentiment"].map(label_map)

In [13]:
sentence_df = sentence_df.dropna()

In [14]:
sentence_df["Sentiment"] = sentence_df["Sentiment"].astype(int)

In [15]:
print("\nDataset After Preprocessing:")
display(sentence_df.head())


Dataset After Preprocessing:


Unnamed: 0,Sentence,Sentiment
0,We need Mr chari 's review on master,1
1,worst government . #YSRCP chala chethha ga par...,0
2,bayya nuvvu emina cheppu kani bagoledu ani che...,0
3,Dube gadini vadilesi manchhi Pani chesaru @RCB...,2
4,I came to watch thyview 's review crying after...,2


In [16]:
print("\nData Types:")
print(sentence_df.dtypes)


Data Types:
Sentence     object
Sentiment     int64
dtype: object


In [17]:
train_df, eval_df = train_test_split(sentence_df, test_size=0.2, random_state=42)

In [18]:
print(f"Training Samples: {len(train_df)}")
print(f"Testing Samples: {len(eval_df)}")

Training Samples: 15894
Testing Samples: 3974


In [19]:
model_args = ClassificationArgs()
model_args.num_train_epochs = 3
model_args.train_batch_size = 16
model_args.eval_batch_size = 16
model_args.learning_rate = 2e-5
model_args.overwrite_output_dir = True
model_args.save_best_model = True
model_args.evaluate_during_training = True
model_args.save_eval_checkpoints = False
model_args.save_model_every_epoch = False
model_args.output_dir = "muril-sentiment-model"
model_args.best_model_dir = "muril-best-model"


In [20]:
model = ClassificationModel(
    "bert", "google/muril-base-cased",
    num_labels=len(label_map),
    args=model_args,
    use_cuda=torch.cuda.is_available()
)

print("✅ MURIL Model Loaded Successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

✅ MURIL Model Loaded Successfully!


In [21]:
print("\n🚀 Training Started...")
model.train_model(train_df, eval_df=eval_df)




🚀 Training Started...




  0%|          | 0/31 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 3:   0%|          | 0/994 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/7 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 3:   0%|          | 0/994 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/7 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 3 of 3:   0%|          | 0/994 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/7 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/7 [00:00<?, ?it/s]

  with amp.autocast():


(2982,
 defaultdict(list,
             {'global_step': [994, 1988, 2000, 2982],
              'train_loss': [0.5352376103401184,
               0.253387451171875,
               0.7332191467285156,
               0.5455830693244934],
              'mcc': [0.606973300308224,
               0.6806034489459505,
               0.6848220505491199,
               0.6985815731353369],
              'eval_loss': [0.7255941081238559,
               0.5327300565788545,
               0.5303617064732624,
               0.516906503213936]}))

In [22]:
print("\n🔍 Evaluating Model...")
result, model_outputs, wrong_predictions = model.eval_model(eval_df, acc=accuracy_score)


🔍 Evaluating Model...




  0%|          | 0/7 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/249 [00:00<?, ?it/s]

  with amp.autocast():


In [23]:
print(f"\nModel Accuracy: {result['acc']:.4f}")


Model Accuracy: 0.8037


In [24]:
predictions, raw_outputs = model.predict(eval_df["Sentence"].tolist())
print("\n📊 Classification Report:\n")
print(classification_report(eval_df["Sentiment"], predictions))

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/249 [00:00<?, ?it/s]

  with amp.autocast():



📊 Classification Report:

              precision    recall  f1-score   support

           0       0.87      0.84      0.85      1546
           1       0.61      0.73      0.67       800
           2       0.86      0.80      0.83      1628

    accuracy                           0.80      3974
   macro avg       0.78      0.79      0.78      3974
weighted avg       0.81      0.80      0.81      3974



In [25]:
sample_texts = ["@cheeks4042 Jimmy valla owner paapa antey naakishtam kaani aa ammayi ki rocky valla owner antey istam . So ala okasari pakka veedhi maxy valla owner tho ee paapa gurinchi aara teesthuntey telsindi lekapothey andari vishayaalu telusukodaniki naakem avasaram ?"]
predictions, raw_outputs = model.predict(sample_texts)

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


In [26]:
reverse_label_map = {0: "NEG", 1: "NTL", 2: "POS"}
predictions = [reverse_label_map[p] for p in predictions]

In [27]:
print("\n💡 Sample Predictions:")
for text, pred in zip(sample_texts, predictions):
    print(f"📝 Text: {text} --> Predicted Sentiment: {pred}")


💡 Sample Predictions:
📝 Text: @cheeks4042 Jimmy valla owner paapa antey naakishtam kaani aa ammayi ki rocky valla owner antey istam . So ala okasari pakka veedhi maxy valla owner tho ee paapa gurinchi aara teesthuntey telsindi lekapothey andari vishayaalu telusukodaniki naakem avasaram ? --> Predicted Sentiment: POS


In [28]:
model.save_model("my_simpletransformers_model")