In [None]:
import pandas as pd

# Read the CSV located at "data" into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="data")

In [None]:
# Show the loaded DataFrame as this cell's output.
df

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Bar chart of the number of rows per label_name, smallest class first.
(
    df['label_name']
    .value_counts(ascending=True)
    .plot(kind='bar')
)

In [None]:
# Whitespace-separated token count per row. `.str.len()` is the vectorized
# accessor and yields NaN for missing text, whereas `.apply(len)` raised
# TypeError as soon as one `text` value was missing.
df['word_count'] = df['text'].str.split().str.len()
# One box per label_name showing the distribution of word_count.
df.boxplot(column='word_count', by='label_name')

In [None]:
import os

# huggingface_hub treats only 1/ON/YES/TRUE (case-insensitive) as "enabled" for
# its boolean environment flags, so the previous value 'False' left the symlink
# warning switched on. Keep this cell ahead of the first transformers import so
# the flag is already set when huggingface_hub reads it.
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

In [None]:
from transformers import AutoTokenizer

In [None]:
# Checkpoint identifier/path reused by the tokenizer and the model loads below.
model_ckpt = "data"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_ckpt)

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the stratified splits (and every metric derived from them) are
# identical after a kernel restart.
SPLIT_SEED = 42

# 70% train / 30% hold-out; the hold-out is then divided 2:1 into test and
# validation (20% / 10% of the full frame). Both splits are stratified on
# label_name so class proportions are preserved.
train, holdout = train_test_split(
    df, test_size=0.3, stratify=df['label_name'], random_state=SPLIT_SEED
)
test, validation = train_test_split(
    holdout, test_size=1/3, stratify=holdout['label_name'], random_state=SPLIT_SEED
)

train.shape, test.shape, validation.shape

In [None]:
from datasets import Dataset, DatasetDict

# Wrap each pandas split in a Dataset (without carrying over the pandas index)
# and collect them under their split names.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(split_frame, preserve_index=False)
    for split_name, split_frame in (
        ('train', train),
        ('test', test),
        ('validation', validation),
    )
})

dataset

In [None]:
def tokenize(batch):
    """Tokenize the ``text`` field of a batch with the notebook-level tokenizer.

    Args:
        batch: Mapping with a ``'text'`` entry holding the strings to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts when called with
        ``padding=True`` and ``truncation=True``.
    """
    return tokenizer(batch['text'], truncation=True, padding=True)

In [None]:
# Apply `tokenize` to every split in batched mode; batch_size=None is passed
# through exactly as before.
dataset_encoded = dataset.map(function=tokenize, batched=True, batch_size=None)

In [None]:
# Show the tokenized DatasetDict as this cell's output.
dataset_encoded

In [None]:
# Build the name -> id mapping straight from the two label columns instead of
# materializing every training example as a dict just to read two fields.
# Pairs are ordered by id, so iterating label2id / id2label always follows
# class-id order rather than the row order of the shuffled training split.
label2id = dict(sorted(
    set(zip(dataset['train']['label_name'], dataset['train']['label'])),
    key=lambda pair: pair[1],
))
# Inverse mapping: class id -> label name.
id2label = {v: k for k, v in label2id.items()}

In [None]:
# Show the label-name -> id mapping as this cell's output.
label2id

In [None]:
from transformers import AutoModel 
import torch

In [None]:
# Load the checkpoint through the generic AutoModel class (used only for the
# config inspection in the next cell).
model0 = AutoModel.from_pretrained(model_ckpt)

In [None]:
# Show the label2id mapping stored in model0's config.
model0.config.label2id

In [None]:
from transformers import AutoModelForSequenceClassification, AutoConfig

# Number of distinct classes; kept as a notebook variable (it is not passed to
# the config or model calls below).
num_labels = len(label2id)

# Prefer the GPU when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the checkpoint's config with our label mappings attached, build the
# sequence-classification model from it, then move the model onto `device`.
config = AutoConfig.from_pretrained(model_ckpt, label2id=label2id, id2label=id2label)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)
model = model.to(device)

In [None]:
# Show the classification model's config as this cell's output.
model.config

In [None]:
from transformers import TrainingArguments

# Per-device batch size, shared by training and evaluation.
batch_size = 64
# Directory handed to TrainingArguments as output_dir.
training_dir = "data"

training_args = TrainingArguments(
    output_dir=training_dir,
    overwrite_output_dir=True,
    eval_strategy='epoch',
    num_train_epochs=2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=2e-5,
    weight_decay=0.01,
    disable_tqdm=False,
)

In [None]:
import evaluate
import numpy as np

# Load the "accuracy" metric through the `evaluate` library; it is used by
# compute_metrics_evaluate below.
accuracy = evaluate.load("accuracy")

def compute_metrics_evaluate(eval_pred):
    """Score a batch of predictions with the notebook-level ``accuracy`` metric.

    Args:
        eval_pred: A two-item iterable ``(predictions, labels)``; predictions
            hold one row of per-class scores per example.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class ids versus
        ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (array whose last axis holds per-class scores).

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    # Predicted class = index of the largest score along the last axis.
    y_hat = pred.predictions.argmax(-1)

    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }


In [None]:
from transformers import Trainer

# Configure the Trainer: train on the encoded train split, evaluate on the
# validation split, and report metrics via compute_metrics_evaluate.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded['train'],
    eval_dataset=dataset_encoded['validation'],
    processing_class=tokenizer,
    compute_metrics=compute_metrics_evaluate,
)

In [None]:
# Run the training loop configured above; the returned value is the cell output.
trainer.train()

In [None]:
# Run inference over the encoded test split and show the resulting metrics.
preds_output = trainer.predict(test_dataset=dataset_encoded['test'])
preds_output.metrics

In [None]:
# Predicted class id = index of the largest score in each row of predictions.
y_pred = np.argmax(preds_output.predictions, axis=1)
# Take the labels returned by predict(): they come from the same pass as
# `predictions`, and this avoids `dataset[:]`, which materialized every column
# of the test split in memory just to read the label column.
y_true = preds_output.label_ids

In [None]:
from sklearn.metrics import classification_report
# Print the text report comparing test-split labels with the predictions.
print(classification_report(y_true=y_true, y_pred=y_pred))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [None]:
# Fix the class order explicitly. The matrix rows/columns follow `labels`, and
# the tick names are derived from the same id list, so a name can never be
# attached to the wrong row. (Previously the names came from the insertion
# order of label2id, which is unrelated to the sorted class ids the matrix
# uses.) Passing `labels` also keeps classes that are absent from the test
# predictions in the matrix.
class_ids = sorted(id2label)
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(cm, display_labels=[id2label[i] for i in class_ids])
disp.plot(cmap='GnBu')

In [None]:
# Sample sentence used for the single-text prediction and pipeline demos below.
text = "I am super happy today. I got it done. Finally!!"

def get_prediction(text):
    """Return the predicted label name for a single piece of text.

    Uses the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``id2label``.

    Args:
        text: The string to classify.

    Returns:
        The ``id2label`` entry for the highest-scoring class.
    """
    # Switch off dropout and other train-only behaviour so the answer does not
    # depend on whether the model was last left in training mode.
    model.eval()

    # Truncate so inputs longer than the model's maximum length do not fail.
    input_encoded = tokenizer(text, return_tensors='pt', truncation=True).to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**input_encoded)

    pred = torch.argmax(outputs.logits, dim=1).item()
    return id2label[pred]

# Classify the sample sentence; the returned label name is the cell output.
get_prediction(text)

In [None]:
# Write the trained model to the "data" directory.
trainer.save_model(output_dir="data")

In [None]:
from transformers import pipeline

# Load what was saved under "data" as a text-classification pipeline.
classifier = pipeline(task='text-classification', model='data')

# Classify the earlier sample sentence plus a few extra inputs.
classifier([
    text,
    'hello, how are you?',
    "love you",
    "i am feeling low",
])