In [1]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from torch.utils.data import Dataset
import torch
from nltk.corpus import wordnet
import re
import random

In [4]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [5]:
def synonym_replacement(text, n=2):
    words = text.split()
    new_words = words.copy()
    for _ in range(n):
        word_to_replace = random.choice(words)
        synonyms = wordnet.synsets(word_to_replace)
        if synonyms:
            synonym = random.choice(synonyms).lemmas()[0].name()
            new_words = [synonym if w == word_to_replace else w for w in new_words]
    return " ".join(new_words)

In [6]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train.csv')
df["text"] = df["text"].apply(clean_text)

In [7]:
df= pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train.csv')
y_df=df[["complaint","demands","praise","questions"]]
y_array=np.asarray(y_df)
df["output"]=np.argmax(y_array,axis=1)
df.head()

Unnamed: 0,id,text,complaint,demands,praise,questions,output
0,500796286320,Wow! From what I've observed from this documen...,0.0,0.0,1.0,0.0,2
1,838906157157,काय रे dungnat मेंदु असणाऱ्या आंधभक्ता तुझा आई...,1.0,0.0,0.0,0.0,0
2,1011026626743,अजित दादा आणि प्रफुल्ल पटेल यांनी केलेल्या काम...,0.0,0.0,1.0,0.0,2
3,1068853499446,"She's saying that ""doing her own research"" led...",1.0,0.0,0.0,0.0,0
4,502772748919,"That is not Karen, that is perfectly reasonabl...",1.0,0.0,0.0,0.0,0


In [8]:
augmented_texts = df["text"].apply(lambda x: synonym_replacement(x))
augmented_labels = df["output"]
df_augmented = pd.DataFrame({"text": augmented_texts, "output": augmented_labels})
df = pd.concat([df, df_augmented], ignore_index=True)

In [9]:
texts = df["text"].values
labels = df["output"].values
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.33, random_state=42, stratify=labels
)

In [10]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long),
        }

In [11]:

model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [12]:
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_length=128)
test_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_length=128)

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(set(labels)))

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    probs = torch.nn.functional.softmax(torch.tensor(pred.predictions), dim=1).numpy()
    roc_auc = roc_auc_score(labels, probs, multi_class="ovr", average="weighted")
    f1 = f1_score(labels, preds, average="weighted")
    accuracy = accuracy_score(labels, preds)
    return {"roc_auc": roc_auc, "f1": f1, "accuracy": accuracy}

In [15]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="/content/drive/MyDrive/Colab Notebooks/logs",
    logging_steps=100,
    gradient_accumulation_steps=2,
    load_best_model_at_end=True,
    metric_for_best_model="roc_auc",
)

In [16]:
import os
os.environ["WANDB_DISABLED"] = "false"

In [17]:
!pip install wandb



In [18]:
print("Training model...")
import wandb
wandb.init(mode="disabled")

Training model...


In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [20]:
trainer.train()



Epoch,Training Loss,Validation Loss,Roc Auc,F1,Accuracy
1,1.3427,0.952663,0.840818,0.556991,0.641667
2,0.8025,0.613891,0.921563,0.771773,0.773864
3,0.5292,0.529005,0.944923,0.825155,0.823106
4,0.3816,0.403624,0.966394,0.864827,0.864015
5,0.1927,0.368071,0.97345,0.890131,0.889773


TrainOutput(global_step=840, training_loss=0.6290546116374788, metrics={'train_runtime': 922.8257, 'train_samples_per_second': 29.041, 'train_steps_per_second': 0.91, 'total_flos': 1762875726643200.0, 'train_loss': 0.6290546116374788, 'epoch': 5.0})

In [21]:
trainer.evaluate()

{'eval_loss': 0.3680707514286041,
 'eval_roc_auc': 0.9734499380235065,
 'eval_f1': 0.8901306101902835,
 'eval_accuracy': 0.8897727272727273,
 'eval_runtime': 18.9769,
 'eval_samples_per_second': 139.116,
 'eval_steps_per_second': 8.695,
 'epoch': 5.0}

In [22]:
test_data=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/test.csv")

In [23]:
test_input=list(test_data["text"])
test_input_tokenized=tokenizer(test_input,padding="max_length",truncation=True,max_length=128, return_tensors="pt").to("cuda")

In [24]:
import torch
from torch.utils.data import DataLoader


test_dataset = torch.utils.data.TensorDataset(*[test_input_tokenized[key] for key in test_input_tokenized])
test_loader = DataLoader(test_dataset, batch_size=4)  # Adjust batch size as needed


all_preds = []
for batch in test_loader:
    batch = {k: v.to("cuda") for k, v in zip(test_input_tokenized, batch)}

    with torch.no_grad():
        outputs = model(**batch)
        test_preds = torch.nn.functional.softmax(outputs.logits, dim=-1)
        test_preds = test_preds.cpu().detach().numpy()
        all_preds.append(test_preds)

test_preds = np.concatenate(all_preds, axis=0)

In [25]:
submission=pd.DataFrame(columns=df.columns)
submission=submission.drop(["text","output"],axis=1)
test_ids=[id for id in test_data["id"]]
submission["id"]=test_ids
submission[["complaint","demands","praise","questions"]]=test_preds
submission.head()

Unnamed: 0,id,complaint,demands,praise,questions
0,1041016773991,0.012869,0.97348,0.011226,0.002425
1,109362481297,0.002809,0.001382,0.99496,0.000849
2,985019053532,0.968169,0.002602,0.026857,0.002372
3,436629695381,0.989994,0.001107,0.004213,0.004686
4,585196067684,0.008569,0.978621,0.008709,0.004101


In [26]:
submission.to_csv("/content/drive/MyDrive/Colab Notebooks/result.csv",index=False)