In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip uninstall clr
!pip install pythonnet
!pip install sentencepiece

In [2]:
import pandas as pd
import numpy as np
df = pd.read_csv('corrected_labelled.csv')
df = df.dropna()

df['nps'] = df['nps'].replace('10 (Extremely likely)',10)
df['nps'] = df['nps'].replace('0 (Not at all likely)',0)
df['nps'] = df['nps'].astype(int)

#target variable will nps split into demoters, passives and promoters
df['label'] = np.where(df['nps'] == 3,2,
                  np.where(df['nps'] == 1,0,1))

gb = df.groupby('label')['label'].count()
print(gb)

df.info()

label
0    411
1    159
2    565
Name: label, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1135 entries, 0 to 1134
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   response       1135 non-null   object
 1   nps            1135 non-null   int64 
 2   Term           1135 non-null   object
 3   ticket_number  1135 non-null   object
 4   grp            1135 non-null   object
 5   label          1135 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 53.3+ KB


In [3]:
import re
def text_preprocessing(text):
    """
    - Remove entity mentions (eg. '@united')
    - Correct errors (eg. '&amp;' to '&')
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()

df['response'] = df['response'].apply(lambda x: text_preprocessing(x))
df['Term'] = df['Term'].apply(lambda x: text_preprocessing(x))

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(df[['response','Term']],df['label'], test_size = 0.33, random_state=42)

#X_test, X_val, y_test, y_val = train_test_split(X_test,y_test, test_size = 0.5, random_state=42)

In [5]:
#calculating class weights
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(class_weight = 'balanced', classes = [2,1,0], y = y_train)
type(class_weights)

numpy.ndarray

In [6]:
import datasets
from datasets import Dataset, DatasetDict

train = X_train
train['label'] = y_train.to_list()

# test = X_test
# test['label'] = y_test.to_list()

val = X_val
val['label'] = y_val.to_list()

train_ds = Dataset.from_pandas(train)
#test_ds = Dataset.from_pandas(test)
val_ds = Dataset.from_pandas(val)

ds = DatasetDict()

ds['train'] = train_ds
#ds['test'] = test_ds
ds['validation'] = val_ds
ds

DatasetDict({
    train: Dataset({
        features: ['response', 'Term', 'label', '__index_level_0__'],
        num_rows: 760
    })
    validation: Dataset({
        features: ['response', 'Term', 'label', '__index_level_0__'],
        num_rows: 375
    })
})

In [7]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

#raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "yangheng/deberta-v3-large-absa-v1.1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast = False)


def tokenize_function(example):
    return tokenizer(example["response"], example["Term"], truncation=True)


tokenized_datasets = ds.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Map:   0%|          | 0/760 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/375 [00:00<?, ? examples/s]

In [8]:
tokenized_datasets = tokenized_datasets.remove_columns(["response", "Term", "__index_level_0__"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [9]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

In [10]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 39]),
 'token_type_ids': torch.Size([8, 39]),
 'attention_mask': torch.Size([8, 39])}

In [11]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

In [12]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.5544, grad_fn=<NllLossBackward0>) torch.Size([8, 3])


In [13]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)



In [14]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

285


In [15]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [16]:

from sklearn.metrics import precision_recall_fscore_support

def evaluate_model(model, dataloader, loss_fn, device):
    model.eval()
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    all_predicted_labels = []
    all_true_labels = []

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            b_labels = batch['labels']
            b_input_ids = batch['input_ids']
            b_attn_mask = batch['attention_mask']

            logits = model(**batch)
            loss = loss_fn(logits['logits'], b_labels)

            total_loss += loss.item()

            # Calculate accuracy
            _, predicted_labels = torch.max(logits['logits'], 1)
            correct_predictions += (predicted_labels == b_labels).sum().item()
            total_samples += b_labels.size(0)

            all_predicted_labels.extend(predicted_labels.cpu().numpy())
            all_true_labels.extend(b_labels.cpu().numpy())

    average_loss = total_loss / len(dataloader)
    accuracy = correct_predictions / total_samples

    # Calculate weighted F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(b_labels.cpu().numpy(), predicted_labels.cpu().numpy(), average='weighted')

    return average_loss, accuracy, precision, recall, f1


In [17]:
from tqdm.auto import tqdm
import torch.nn as nn

progress_bar = tqdm(range(num_training_steps))

tensor_weights = torch.tensor(np.float32(class_weights)).to(device)
loss_fn = nn.CrossEntropyLoss(weight = tensor_weights)
model.train()

epoch_list = []
step_list = []
train_loss_list = []
val_loss_list = []
val_acc_list = []
val_recall_list = []
val_prec_list = []
val_f1_list = []



evaluate_every_n_steps = 20
for epoch in range(num_epochs):
    for step, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        b_labels = batch['labels']
        b_input_ids = batch['input_ids']
        b_attn_mask = batch['attention_mask']
        #logits = model(b_input_ids, b_attn_mask,)
        logits = model(**batch)
        loss = loss_fn(logits['logits'],b_labels)
        #print(f'training loss: {loss}')

        #validation loss
        if ((step) % evaluate_every_n_steps == 0) or (step == len(train_dataloader) - 1):
          val_loss, val_accuracy, val_prec, val_recall, val_f1 = evaluate_model(model, eval_dataloader, loss_fn, device)
          print(f'Epoch {epoch + 1}/{num_epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
          # Optionally, you can store the loss and accuracy values for later plotting.
          # Add code for plotting as needed.
          epoch_list.append(epoch + 1)
          step_list.append(step)
          train_loss_list.append(loss.item())
          val_loss_list.append(val_loss)
          val_acc_list.append(val_accuracy)
          val_recall_list.append(val_recall)
          val_prec_list.append(val_prec)
          val_f1_list.append(val_f1)


        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

loss_info = pd.DataFrame(list(zip(epoch_list, step_list, train_loss_list, val_loss_list, val_acc_list, val_recall_list, val_prec_list, val_f1_list)),
                         columns = ['epoch', 'step', 'train loss', 'val loss', 'val accuracy','val recall', 'val precision', 'val f1'])

  0%|          | 0/285 [00:00<?, ?it/s]

Epoch 1/3, Validation Loss: 1.0467167766071221, Validation Accuracy: 0.744
Epoch 1/3, Validation Loss: 0.6916489236532374, Validation Accuracy: 0.7626666666666667
Epoch 1/3, Validation Loss: 0.43571691322041317, Validation Accuracy: 0.9013333333333333
Epoch 1/3, Validation Loss: 0.6173002088957644, Validation Accuracy: 0.7946666666666666
Epoch 1/3, Validation Loss: 0.506768118827901, Validation Accuracy: 0.8666666666666667
Epoch 1/3, Validation Loss: 0.631233866227434, Validation Accuracy: 0.7653333333333333
Epoch 2/3, Validation Loss: 0.7917457216597618, Validation Accuracy: 0.6853333333333333
Epoch 2/3, Validation Loss: 0.45834497195926116, Validation Accuracy: 0.8533333333333334
Epoch 2/3, Validation Loss: 0.4003767813060512, Validation Accuracy: 0.8533333333333334
Epoch 2/3, Validation Loss: 0.4056404688336114, Validation Accuracy: 0.8426666666666667
Epoch 2/3, Validation Loss: 0.4247716198814042, Validation Accuracy: 0.84
Epoch 2/3, Validation Loss: 0.44704357764505326, Validation

In [18]:
loss_info

Unnamed: 0,epoch,step,train loss,val loss,val accuracy,val recall,val precision,val f1
0,1,0,1.455238,1.046717,0.744,1.0,1.0,1.0
1,1,20,0.688485,0.691649,0.762667,0.857143,0.928571,0.866667
2,1,40,0.304885,0.435717,0.901333,1.0,1.0,1.0
3,1,60,0.811877,0.6173,0.794667,0.714286,0.904762,0.714286
4,1,80,0.237457,0.506768,0.866667,1.0,1.0,1.0
5,1,94,0.880067,0.631234,0.765333,0.857143,0.928571,0.866667
6,2,0,0.56913,0.791746,0.685333,0.857143,0.892857,0.853061
7,2,20,0.390966,0.458345,0.853333,1.0,1.0,1.0
8,2,40,0.313118,0.400377,0.853333,1.0,1.0,1.0
9,2,60,0.046857,0.40564,0.842667,1.0,1.0,1.0


In [None]:
len(eval_dataloader)


11

In [19]:
import evaluate

#metric = evaluate.load("glue", "mrpc", average = "weighted")
# metric = evaluate.combine([
#     evaluate.load("precision", average="macro"),
#     evaluate.load("recall", average="macro")
# ])

metric = evaluate.load("accuracy")
model.eval()

all_logits = []
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    all_logits.append(logits)

    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

print(metric.compute())


all_logits = torch.cat(all_logits, dim=0)
probs = torch.argmax(all_logits, dim=1).cpu().numpy()

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 0.8773333333333333}


In [27]:
from sklearn.metrics import confusion_matrix, classification_report

y_test = val['label'].to_numpy()

y_pred = val['predicted_class'].to_numpy()

class_labels = ['Negative','Neutral','Positive']

# Create the confusion matrix with custom labels
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(cm)


Confusion Matrix:
[[103  27   0]
 [  8  29   6]
 [  1   4 197]]


In [28]:
# Display the classification report with custom labels
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_labels))


Classification Report:
              precision    recall  f1-score   support

    Negative       0.92      0.79      0.85       130
     Neutral       0.48      0.67      0.56        43
    Positive       0.97      0.98      0.97       202

    accuracy                           0.88       375
   macro avg       0.79      0.81      0.80       375
weighted avg       0.90      0.88      0.88       375



In [20]:
probs
val['predicted_class'] = probs

#val.to_csv('check.csv')

In [21]:
val

Unnamed: 0,response,Term,label,predicted_class
784,coach was delayed by an hour causing me to mis...,delayed,0,0
900,journey ringwood to london the coach was far t...,journey,1,0
413,punctuality,punctuality,2,2
467,since the seats are too tight and extremely fi...,seats,0,0
746,the 2nd stage of the journey to folkestone was...,late,0,0
...,...,...,...,...
33,i was pleased with the comfort of the bus and ...,phone,1,1
754,it was a good experience despite the late wait...,delay,0,0
575,always late,late,0,0
259,"friendly staff, efficient, reasonable price",price,2,2


**Saving checkpoint to drive**

In [None]:
#### Saving ###
from google.colab import drive
drive.mount('/content/gdrive')
#%cd /content/gdrive/My\ Drive/FOLDER

model_save_name = 'asba_classifier.pt'
path = F"/content/gdrive/MyDrive/{model_save_name}"
#path = F"/Shared drives/CRM & Insight/Analysis/arun/Text_Analytics"
torch.save(model.state_dict(), path)

Mounted at /content/gdrive


**Predicting term and response absa sentiment**

Going from pandas df with responses and terms, converting data types to get model outputs and then putting it back into the dataframe.

In [None]:
#preprocessing to pass into bert
df = pd.read_csv('emotion_append.csv')
df = df.dropna(subset = ['response','Term'])

df['nps'] = df['nps'].replace('10 (Extremely likely)',10)
df['nps'] = df['nps'].replace('0 (Not at all likely)',0)
df['nps'] = df['nps'].astype(int)

#target variable will nps split into demoters, passives and promoters
df['label'] = np.where(df['nps'] >= 9,2,
                  np.where(df['nps'] <= 6,0,1))

df['response'] = df['response'].apply(lambda x: text_preprocessing(x))
df['Term'] = df['Term'].apply(lambda x: text_preprocessing(x))

In [None]:
prob_df = df[['response','Term', 'label']]

ds = Dataset.from_pandas(prob_df)

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "yangheng/deberta-v3-large-absa-v1.1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["response"], example["Term"], truncation=True)


tokenized_datasets = ds.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)




Map:   0%|          | 0/3359 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
#detokenizing inputs
# token_output = []
# for tok in tokenized_datasets['input_ids']:
#   tokens = []
#   detokenized = tokenizer.decode(tok)
#   token_output.append(detokenized)

In [None]:
#tokenized_datasets = tokenized_datasets.remove_columns(["response", "Term"])
tokenized_datasets = tokenized_datasets.remove_columns(["response", "Term", "__index_level_0__"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")


In [None]:
from torch.utils.data import DataLoader

dataloader = DataLoader(
    tokenized_datasets, shuffle=True, batch_size=8, collate_fn=data_collator
)
for batch in  dataloader:
    break
{k: v.shape for k, v in batch.items()}

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 52]),
 'token_type_ids': torch.Size([8, 52]),
 'attention_mask': torch.Size([8, 52])}

In [None]:
# import torch

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model.to(device)
# device

In [None]:
all_logits = []
input_ids = []
for batch in dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    input_ids.append(batch['input_ids'])
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    all_logits.append(logits)

all_logits = torch.cat(all_logits, dim=0)
probs = torch.argmax(all_logits, dim=1).cpu().numpy()
probs

array([2, 0, 0, ..., 0, 2, 1])

In [None]:
token_output = []
for tok in tokenized_datasets['input_ids']:
  tokens = []
  detokenized = tokenizer.decode(tok)
  token_output.append(detokenized)

In [None]:
token_output = []
for batch in input_ids:
  for id in batch:
      token_output.append(tokenizer.decode(id))

In [None]:
token_output
check = pd.DataFrame(list(zip(token_output)), columns = ['detokenized'])
check.to_csv('check.csv')

In [None]:
df['aspect_sentiment'] = probs
df.to_csv('asba_append.csv')

In [None]:
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ticket_number,response,nps,csat,Term,Tier_2,Tier_1,entities,entity_type,entity_start,entity_end,num,emotion,label,aspect_sentiment
0,0,0,CSA03746,i think the driver was kevin. lgw to lhr. an a...,10,Very satisfied,driver,Staff,People,"{'entity': 'B-PER', 'score': 0.99710387, 'inde...",B-PER,23,28,1,neutral,2,2
1,1,16195,EUAJWU72,the trip from portsmouth to london was affecte...,7,Somewhat dissatisfied,delayed,Time,Service,"{'entity': 'B-PER', 'score': 0.9990357, 'index...",B-PER,125,132,1,joy,1,0
2,2,7926,EUADWE24,left on time and the driver (tom) was very pol...,10,Very satisfied,time,Time,Service,"{'entity': 'B-PER', 'score': 0.9992095, 'index...",B-PER,29,32,1,neutral,2,0
3,3,13909,EUAHLT03,exceptionally friendly & humorous drivers (not...,10,Very satisfied,journey,Journey,Service,"{'entity': 'B-PER', 'score': 0.9753795, 'index...",B-PER,79,81,1,joy,2,1
4,4,16196,EUAJWU72,the trip from portsmouth to london was affecte...,7,Somewhat dissatisfied,driver,Staff,People,"{'entity': 'B-PER', 'score': 0.9990357, 'index...",B-PER,125,132,1,joy,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3639,3639,6424,EUADAR88,"arrived at liverpool,the coach was delayed so ...",0,Very dissatisfied,arrived,Journey,Service,"{'entity': 'B-LOC', 'score': 0.8416962, 'index...",B-LOC,11,20,2,sadness,0,0
3640,3640,6423,EUADAR88,"arrived at liverpool,the coach was delayed so ...",0,Very dissatisfied,delayed,Time,Service,"{'entity': 'B-LOC', 'score': 0.8416962, 'index...",B-LOC,11,20,2,sadness,0,2
3641,3641,6422,EUADAR74,coach on time at heathrow and arrived on time ...,8,Very satisfied,arrived,Journey,Service,"{'entity': 'I-LOC', 'score': 0.9968671, 'index...",I-LOC,109,111,2,sadness,1,0
3642,3642,6481,EUADBM85,to get to london it took us 6 hours. then comi...,1,Very dissatisfied,hour,Time,Service,"{'entity': 'B-LOC', 'score': 0.99972266, 'inde...",B-LOC,10,16,2,surprise,0,2
