In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip uninstall clr
!pip install pythonnet
!pip install sentencepiece

In [None]:
import pandas as pd
import numpy as np
# Load the labelled training data and discard rows with any missing value.
df = pd.read_csv('pjs_train_labelled.csv').dropna()

# The two end-of-scale answers arrive as descriptive strings; map them to their
# numeric score so the whole column can be cast to int.
df['nps'] = (
    df['nps']
    .replace({'10 (Extremely likely)': 10, '0 (Not at all likely)': 0})
    .astype(int)
)

# Target variable: nps == 3 -> class 2, nps == 1 -> class 0, anything else -> class 1.
# NOTE(review): the scoring cell further down buckets nps as >= 9 / <= 6 instead;
# confirm that this file really codes `nps` as 1/2/3 rather than 0-10.
df['label'] = np.select([df['nps'] == 3, df['nps'] == 1], [2, 0], default=1)


In [None]:
import re
def text_preprocessing(text):
    """
    Normalise a free-text string before it is tokenized.

    - Turn the HTML entity '&amp;' back into '&'
    - Collapse every run of whitespace into one space and trim both ends
    - Lower-case the result
    Entity mentions such as '@united' are left in place.
    @param    text (str): a string to be processed.
    @return   text (str): the processed string.
    """
    unescaped = re.sub(r'&amp;', '&', text)

    # \s+ also matches tabs and newlines, so multi-line answers become one line
    single_spaced = re.sub(r'\s+', ' ', unescaped).strip()

    return single_spaced.lower()

# Normalise both text inputs of the classifier with the same routine.
df['response'] = df['response'].apply(text_preprocessing)
df['Term'] = df['Term'].apply(text_preprocessing)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows, then cut that hold-out in half to obtain the test
# and validation sets. Both splits use a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df[['response', 'Term']],
    df['label'],
    test_size=0.33,
    random_state=42,
)

X_test, X_val, y_test, y_val = train_test_split(
    X_test,
    y_test,
    test_size=0.5,
    random_state=42,
)

In [None]:
#calculating class weights
from sklearn.utils.class_weight import compute_class_weight

# Balanced per-class weights for the training loss.
# compute_class_weight returns the weights in the same order as `classes`, and
# the loss later indexes its weight vector by class id, so the classes must be
# listed in ascending id order (0, 1, 2). Listing them as [2, 1, 0] would hand
# class 0 the weight computed for class 2 and vice versa.
# `classes` is passed as an ndarray, which is the documented argument type.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1, 2]),
    y=y_train,
)
type(class_weights)

numpy.ndarray

In [None]:
import datasets
from datasets import Dataset, DatasetDict

# Attach the target to each feature frame. `assign` returns a new DataFrame, so
# the X_* slices produced by the split are left untouched; assigning a column on
# them directly mutates them and can raise pandas' SettingWithCopyWarning.
train = X_train.assign(label=y_train.to_list())
test = X_test.assign(label=y_test.to_list())
val = X_val.assign(label=y_val.to_list())

# Convert each split to a Hugging Face Dataset. The frames keep their original
# (non-default) pandas index, which from_pandas carries over as the
# `__index_level_0__` column.
train_ds = Dataset.from_pandas(train)
test_ds = Dataset.from_pandas(test)
val_ds = Dataset.from_pandas(val)

# Bundle the three splits under the conventional split names.
ds = DatasetDict()
ds['train'] = train_ds
ds['test'] = test_ds
ds['validation'] = val_ds
ds

DatasetDict({
    train: Dataset({
        features: ['response', 'Term', 'label', '__index_level_0__'],
        num_rows: 337
    })
    test: Dataset({
        features: ['response', 'Term', 'label', '__index_level_0__'],
        num_rows: 83
    })
    validation: Dataset({
        features: ['response', 'Term', 'label', '__index_level_0__'],
        num_rows: 83
    })
})

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

#raw_datasets = load_dataset("glue", "mrpc")
# Hub id of the pretrained checkpoint; the same id is used to load the model later.
checkpoint = "yangheng/deberta-v3-large-absa-v1.1"
# use_fast=False requests the slow (Python) tokenizer implementation for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast = False)


def tokenize_function(example, max_length=512):
    """
    Encode (response, term) pairs as sentence-pair inputs for the model.

    @param    example (dict): rows to encode, read through the "response" and
              "Term" keys (lists of str when called via map(batched=True)).
    @param    max_length (int): cap on the number of tokens per encoded pair;
              longer pairs are truncated. 512 is presumably the checkpoint's
              position limit - TODO confirm against the model config.
    @return   the tokenizer's encoding of the pairs.
    """
    # truncation=True on its own is not enough: this tokenizer has no predefined
    # maximum length, so without an explicit max_length it only warns and leaves
    # the sequences untruncated.
    return tokenizer(
        example["response"],
        example["Term"],
        truncation=True,
        max_length=max_length,
    )


# Tokenize every split; batched=True passes groups of rows to tokenize_function at once.
tokenized_datasets = ds.map(tokenize_function, batched=True)
# Collator that pads each batch at collation time (padding is not applied during tokenization above).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Map:   0%|          | 0/337 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/83 [00:00<?, ? examples/s]

Map:   0%|          | 0/83 [00:00<?, ? examples/s]

In [None]:
# Keep only what the model consumes: drop the raw text and the pandas index
# column, expose the target as "labels", and have items returned as torch tensors.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["response", "Term", "__index_level_0__"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [None]:
from torch.utils.data import DataLoader

# Training batches are shuffled; validation batches keep the dataset order so
# predictions can be matched back to rows afterwards.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [None]:
# Pull a single batch to sanity-check the tensor shapes produced by the collator.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 61]),
 'token_type_ids': torch.Size([8, 61]),
 'attention_mask': torch.Size([8, 61])}

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint as a sequence classifier with three output classes (label ids 0, 1, 2).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

In [None]:
# Smoke test: forward the sample batch through the model. The batch includes
# `labels`, so the output carries a loss in addition to the logits.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(1.0936, grad_fn=<NllLossBackward0>) torch.Size([8, 3])


In [None]:
# transformers' own AdamW is deprecated upstream in favour of the PyTorch
# optimizer, so the PyTorch implementation is used here.
from torch.optim import AdamW

# eps and weight_decay are pinned to the values the transformers class used by
# default; torch.optim.AdamW would otherwise apply eps=1e-8 and weight_decay=0.01.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)



In [None]:
from transformers import get_scheduler

# The scheduler is stepped once per optimizer step, so its horizon is the total
# number of batches seen over all epochs.
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs

# Linear schedule with no warm-up phase.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

129


In [None]:
import torch

# Prefer the GPU when one is visible, otherwise fall back to the CPU, and move
# the model's parameters there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [None]:
from tqdm.auto import tqdm
import torch.nn as nn

progress_bar = tqdm(range(num_training_steps))

# Class-weighted cross-entropy. The loss reads weight[i] as the weight of class
# id i, so `class_weights` must be ordered by ascending class id.
tensor_weights = torch.as_tensor(class_weights, dtype=torch.float32, device=device)
loss_fn = nn.CrossEntropyLoss(weight=tensor_weights)

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        # Keep the labels out of the forward call: the weighted loss below is
        # the one being optimised, so the model's built-in unweighted loss
        # would be computed only to be thrown away.
        b_labels = batch.pop('labels')
        outputs = model(**batch)

        loss = loss_fn(outputs.logits, b_labels)
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/129 [00:00<?, ?it/s]

In [None]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()

# Score the validation split batch by batch, keeping the raw logits so that the
# predicted class of every validation row can be recovered afterwards.
all_logits = []
for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
    all_logits.append(logits)

print(metric.compute())

# NOTE: despite its name, `probs` holds predicted class ids (argmax of the
# logits), not probabilities.
all_logits = torch.cat(all_logits, dim=0)
probs = all_logits.argmax(dim=1).cpu().numpy()

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 0.8554216867469879}


In [None]:
# Store the predicted class ids next to the validation rows (the validation
# loader is not shuffled, so positions line up) and export them for review.
val['predicted_class'] = probs
val.to_csv('check.csv')

**Saving checkpoint to drive**

In [None]:
#### Saving ###
from google.colab import drive
# Mount Google Drive inside the Colab runtime.
drive.mount('/content/gdrive')

# Persist only the fine-tuned weights (the state_dict) at the Drive root.
model_save_name = 'asba_classifier.pt'
path = "/content/gdrive/MyDrive/" + model_save_name
torch.save(model.state_dict(), path)

Mounted at /content/gdrive


**Predicting aspect-based sentiment (ABSA) for each term/response pair**

Starting from a pandas DataFrame of responses and terms, we convert the data into the format the model expects, run the model to get its outputs, and then write the predictions back into the DataFrame.

In [None]:
# Load the data to score and keep only rows that have both model inputs.
df = pd.read_csv('emotion_append.csv').dropna(subset=['response', 'Term'])

# Map the two descriptive end-of-scale answers to numbers, then cast to int.
df['nps'] = (
    df['nps']
    .replace({'10 (Extremely likely)': 10, '0 (Not at all likely)': 0})
    .astype(int)
)

# Target variable: nps bucketed into demoters (<= 6 -> 0), passives (7-8 -> 1)
# and promoters (>= 9 -> 2).
df['label'] = np.select([df['nps'] >= 9, df['nps'] <= 6], [2, 0], default=1)

# Same text normalisation as applied to the training data.
df['response'] = df['response'].apply(text_preprocessing)
df['Term'] = df['Term'].apply(text_preprocessing)

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Only the two text inputs and the label are handed to the model pipeline.
prob_df = df[['response', 'Term', 'label']]

ds = Dataset.from_pandas(prob_df)

checkpoint = "yangheng/deberta-v3-large-absa-v1.1"
# Load the tokenizer with use_fast=False, exactly as the training cells do, so
# that scoring and fine-tuning go through the same tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)


def tokenize_function(example, max_length=512):
    """
    Encode (response, term) pairs as sentence-pair inputs for the model.

    @param    example (dict): rows to encode, read through the "response" and
              "Term" keys (lists of str when called via map(batched=True)).
    @param    max_length (int): cap on the number of tokens per encoded pair;
              longer pairs are truncated. 512 is presumably the checkpoint's
              position limit - TODO confirm against the model config.
    @return   the tokenizer's encoding of the pairs.
    """
    # truncation=True on its own only produces a warning and no truncation,
    # because this tokenizer has no predefined maximum length.
    return tokenizer(
        example["response"],
        example["Term"],
        truncation=True,
        max_length=max_length,
    )


tokenized_datasets = ds.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)




Map:   0%|          | 0/3359 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
#detokenizing inputs
# token_output = []
# for tok in tokenized_datasets['input_ids']:
#   tokens = []
#   detokenized = tokenizer.decode(tok)
#   token_output.append(detokenized)

In [None]:
# Keep only what the model consumes: drop the raw text and the pandas index
# column, expose the target as "labels", and have items returned as torch tensors.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["response", "Term", "__index_level_0__"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")


In [None]:
from torch.utils.data import DataLoader

# Scoring must preserve the dataset order: the predictions are later written
# back to `df` purely by position, so a shuffled loader would attach every
# prediction to the wrong row.
dataloader = DataLoader(
    tokenized_datasets,
    shuffle=False,
    batch_size=8,
    collate_fn=data_collator,
)

# Pull a single batch to sanity-check the tensor shapes.
batch = next(iter(dataloader))
{k: v.shape for k, v in batch.items()}

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 52]),
 'token_type_ids': torch.Size([8, 52]),
 'attention_mask': torch.Size([8, 52])}

In [None]:
# import torch

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model.to(device)
# device

In [None]:
# Run the model over every batch without tracking gradients, collecting the
# logits and the (padded) input ids that produced them.
all_logits = []
input_ids = []
for batch in dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    input_ids.append(batch['input_ids'])

    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    all_logits.append(logits)

# NOTE: despite its name, `probs` holds predicted class ids (argmax of the
# logits), not probabilities.
all_logits = torch.cat(all_logits, dim=0)
probs = all_logits.argmax(dim=1).cpu().numpy()
probs

array([2, 0, 0, ..., 0, 2, 1])

In [None]:
# Decode every encoded pair back to text, in dataset order, to inspect what the
# model actually receives.
token_output = [tokenizer.decode(ids) for ids in tokenized_datasets['input_ids']]

In [None]:
# Decode the input ids captured during scoring, batch by batch, so the decoded
# text is in the same order as the predictions. The loop variable is not called
# `id`: in a notebook that would shadow the builtin for every later cell.
token_output = [
    tokenizer.decode(sequence)
    for id_batch in input_ids
    for sequence in id_batch
]

In [None]:
# Export the decoded model inputs for manual inspection.
# NOTE(review): this writes to the same 'check.csv' as the validation-prediction
# export earlier in the notebook and therefore overwrites it.
check = pd.DataFrame({'detokenized': token_output})
check.to_csv('check.csv')

In [None]:
# Write the predicted class ids back onto the source rows. The assignment is
# positional, so it is only correct when `probs` is in the same row order as `df`
# (i.e. the scoring dataloader must not shuffle).
df['aspect_sentiment'] = probs
df.to_csv('asba_append.csv')

In [None]:
# Display the scored frame (original columns plus `label` and `aspect_sentiment`).
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ticket_number,response,nps,csat,Term,Tier_2,Tier_1,entities,entity_type,entity_start,entity_end,num,emotion,label,aspect_sentiment
0,0,0,CSA03746,i think the driver was kevin. lgw to lhr. an a...,10,Very satisfied,driver,Staff,People,"{'entity': 'B-PER', 'score': 0.99710387, 'inde...",B-PER,23,28,1,neutral,2,2
1,1,16195,EUAJWU72,the trip from portsmouth to london was affecte...,7,Somewhat dissatisfied,delayed,Time,Service,"{'entity': 'B-PER', 'score': 0.9990357, 'index...",B-PER,125,132,1,joy,1,0
2,2,7926,EUADWE24,left on time and the driver (tom) was very pol...,10,Very satisfied,time,Time,Service,"{'entity': 'B-PER', 'score': 0.9992095, 'index...",B-PER,29,32,1,neutral,2,0
3,3,13909,EUAHLT03,exceptionally friendly & humorous drivers (not...,10,Very satisfied,journey,Journey,Service,"{'entity': 'B-PER', 'score': 0.9753795, 'index...",B-PER,79,81,1,joy,2,1
4,4,16196,EUAJWU72,the trip from portsmouth to london was affecte...,7,Somewhat dissatisfied,driver,Staff,People,"{'entity': 'B-PER', 'score': 0.9990357, 'index...",B-PER,125,132,1,joy,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3639,3639,6424,EUADAR88,"arrived at liverpool,the coach was delayed so ...",0,Very dissatisfied,arrived,Journey,Service,"{'entity': 'B-LOC', 'score': 0.8416962, 'index...",B-LOC,11,20,2,sadness,0,0
3640,3640,6423,EUADAR88,"arrived at liverpool,the coach was delayed so ...",0,Very dissatisfied,delayed,Time,Service,"{'entity': 'B-LOC', 'score': 0.8416962, 'index...",B-LOC,11,20,2,sadness,0,2
3641,3641,6422,EUADAR74,coach on time at heathrow and arrived on time ...,8,Very satisfied,arrived,Journey,Service,"{'entity': 'I-LOC', 'score': 0.9968671, 'index...",I-LOC,109,111,2,sadness,1,0
3642,3642,6481,EUADBM85,to get to london it took us 6 hours. then comi...,1,Very dissatisfied,hour,Time,Service,"{'entity': 'B-LOC', 'score': 0.99972266, 'inde...",B-LOC,10,16,2,surprise,0,2
