In [1]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip uninstall clr
!pip install pythonnet
!pip install sentencepiece

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/c1/bd/f64d67df4d3b05a460f281defe830ffab6d7940b7ca98ec085e94e024781/transformers-4.34.1-py3-none-any.whl.metadata
  Downloading transformers-4.34.1-py3-none-any.whl.metadata (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.5/121.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Obtaining dependency information for huggingface-hub<1.0,>=0.16.4 from https://files.pythonhosted.org/packages/ef/b5/b6107bd65fa4c96fdf00e4733e2fe5729bb9e5e09997f63074bb43d3ab28/huggingface_hub-0.18.0-py3-none-any.whl.metadata
  Downloading huggingface_hub-0.18.0-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Obtaining dependency information for regex!=2019.12.17 from https://files.pythonhosted.org/packages/8f/3e/4b8b40eb3c80aeaf360f0361d956d129bb3d23b2a3ecbe3a04a8f3bdd6d

In [2]:
import pandas as pd
import numpy as np
df = pd.read_csv(r'gs://absa-classification/pjs_train_labelled.csv')
df = df.dropna()

df['nps'] = df['nps'].replace('10 (Extremely likely)',10)
df['nps'] = df['nps'].replace('0 (Not at all likely)',0)
df['nps'] = df['nps'].astype(int)

#target variable will nps split into demoters, passives and promoters
df['label'] = np.where(df['nps'] == 3,2,
                  np.where(df['nps'] == 1,0,1))


In [3]:
import re
def text_preprocessing(text):
    """
    - Remove entity mentions (eg. '@united')
    - Correct errors (eg. '&amp;' to '&')
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()

df['response'] = df['response'].apply(lambda x: text_preprocessing(x))
df['Term'] = df['Term'].apply(lambda x: text_preprocessing(x))

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df[['response','Term']],df['label'], test_size = 0.33, random_state=42)

X_test, X_val, y_test, y_val = train_test_split(X_test,y_test, test_size = 0.5, random_state=42)

In [5]:
#calculating class weights
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(class_weight = 'balanced', classes = [2,1,0], y = y_train)
type(class_weights)

numpy.ndarray

In [6]:
import datasets
from datasets import Dataset, DatasetDict

train = X_train
train['label'] = y_train.to_list()

test = X_test
test['label'] = y_test.to_list()

val = X_val
val['label'] = y_val.to_list()

train_ds = Dataset.from_pandas(train)
test_ds = Dataset.from_pandas(test)
val_ds = Dataset.from_pandas(val)

ds = DatasetDict()

ds['train'] = train_ds
ds['test'] = test_ds
ds['validation'] = val_ds
ds

DatasetDict({
    train: Dataset({
        features: ['response', 'Term', 'label', '__index_level_0__'],
        num_rows: 337
    })
    test: Dataset({
        features: ['response', 'Term', 'label', '__index_level_0__'],
        num_rows: 83
    })
    validation: Dataset({
        features: ['response', 'Term', 'label', '__index_level_0__'],
        num_rows: 83
    })
})

In [7]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

#raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "yangheng/deberta-v3-large-absa-v1.1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast = False)


def tokenize_function(example):
    return tokenizer(example["response"], example["Term"], truncation=True)


tokenized_datasets = ds.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Map:   0%|          | 0/337 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/83 [00:00<?, ? examples/s]

Map:   0%|          | 0/83 [00:00<?, ? examples/s]

In [8]:
tokenized_datasets = tokenized_datasets.remove_columns(["response", "Term", "__index_level_0__"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [9]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

In [10]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 52]),
 'token_type_ids': torch.Size([8, 52]),
 'attention_mask': torch.Size([8, 52])}

In [11]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

In [12]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.9707, grad_fn=<NllLossBackward0>) torch.Size([8, 3])


In [13]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)



In [14]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

129


In [15]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [16]:
from tqdm.auto import tqdm
import torch.nn as nn

progress_bar = tqdm(range(num_training_steps))

tensor_weights = torch.tensor(np.float32(class_weights)).to(device)
loss_fn = nn.CrossEntropyLoss(weight = tensor_weights)
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        b_labels = batch['labels']
        b_input_ids = batch['input_ids']
        b_attn_mask = batch['attention_mask']
        #logits = model(b_input_ids, b_attn_mask,)
        logits = model(**batch)
        loss = loss_fn(logits['logits'],b_labels)
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/129 [00:00<?, ?it/s]

In [21]:
import evaluate

#metric = evaluate.load("glue", "mrpc", average = "weighted")
# metric = evaluate.combine([
#     evaluate.load("precision", average="macro"),
#     evaluate.load("recall", average="macro")
# ])

metric = evaluate.load("accuracy")
model.eval()

all_logits = []
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    all_logits.append(logits)

    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])


accuracy = metric.compute()
print(accuracy['accuracy'])

all_logits = torch.cat(all_logits, dim=0)
probs = torch.argmax(all_logits, dim=1).cpu().numpy()

In [24]:
probs
val['predicted_class'] = probs
val

Unnamed: 0,response,Term,label,predicted_class
250,"was a fantastic trip, bus clean and unbelievab...",time,2,2
192,the coach was late departing by 30 minutes; we...,late,0,0
489,easy booking. good coach. pleasant staff. all ...,booking,2,2
148,the journey with national express was very str...,time,2,2
75,not based pleased with the journey. paid extra...,journey,1,0
...,...,...,...,...
358,punctuality,punctuality,2,2
464,bus on time and easy booking system. and reaso...,booking,2,2
227,it's fine drivers are good as far i have been ...,cheap,2,1
468,very late,late,0,1


**Saving checkpoint to drive**

In [None]:
#### Saving ###
from google.colab import drive
drive.mount('/content/gdrive')
#%cd /content/gdrive/My\ Drive/FOLDER

model_save_name = 'asba_classifier.pt'
path = F"/content/gdrive/MyDrive/{model_save_name}"
#path = F"/Shared drives/CRM & Insight/Analysis/arun/Text_Analytics"
torch.save(model.state_dict(), path)

Mounted at /content/gdrive


### Saving model checkpoint

In [28]:
project = !gcloud config get-value project
PROJECT_ID = project[0]
PROJECT_ID

#saving the model 
from datetime import datetime
REGION = 'europe-west2'
EXPERIMENT = '01'
SERIES = '01'

TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET = PROJECT_ID
URI = f"gs://{BUCKET}/{SERIES}/{EXPERIMENT}"
DIR = f"temp/{EXPERIMENT}"
BLOB = f"{SERIES}/{EXPERIMENT}/models/{TIMESTAMP}/model/absa_classifier.pt"

FRAMEWORK = 'pytorch'
TASK = 'classification'
MODEL_TYPE = 'deberta'
EXPERIMENT_NAME = f'experiment-{SERIES}-{EXPERIMENT}-{FRAMEWORK}-{TASK}-{MODEL_TYPE}'
RUN_NAME = f'run-{TIMESTAMP}'

In [30]:
#Required packages
from google.cloud import aiplatform
from google.cloud import storage
import json

from datetime import datetime
import os

from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value

In [31]:
aiplatform.init(project=PROJECT_ID, location=REGION)
!rm -rf {DIR}
!mkdir -p {DIR}

### Initialising experiment

In [32]:
aiplatform.init(experiment = EXPERIMENT_NAME)

In [34]:
expRun = aiplatform.ExperimentRun.create(run_name = RUN_NAME, experiment = EXPERIMENT_NAME)
#log parameters to the experiment run:
expRun.log_params({'experiment': EXPERIMENT, 'series': SERIES, 'project_id': PROJECT_ID})

Associating projects/240414127532/locations/europe-west2/metadataStores/default/contexts/experiment-01-01-pytorch-classification-deberta-run-20231101142133 to Experiment: experiment-01-01-pytorch-classification-deberta


In [35]:
expRun.log_metrics({'test_accuracy': accuracy['accuracy']})

### Saving model

In [37]:
model_save_name = 'absa_classifier.pt'
path = F"{model_save_name}"
torch.save(model.state_dict(), path)

In [38]:
# Upload the model to GCS
bucket = storage.Client().bucket('absa-classification')
blob = bucket.blob(BLOB)
blob.upload_from_filename('absa_classifier.pt')

In [39]:
!gsutil ls gs://absa-classification/01/01/models/20231101142133/model


gs://absa-classification/01/01/models/20231101142133/model/absa_classifier.pt


In [40]:
#logging where the model has been saved
expRun.log_params({'model.save': r'gs://sentiment_response/01/01/models/20231024142940/model/sentiment_classifier.pt'})

In [41]:
modelmatch = aiplatform.Model.list(filter = f'display_name={SERIES}_{EXPERIMENT} AND labels.series={SERIES} AND labels.experiment={EXPERIMENT}')

upload_model = True
if modelmatch:
    print("Model Already in Registry:")
    if RUN_NAME in modelmatch[0].version_aliases:
        print("This version already loaded, no action taken.")
        upload_model = False
        model = aiplatform.Model(model_name = modelmatch[0].resource_name)
    else:
        print('Loading model as new default version.')
        parent_model = modelmatch[0].resource_name

else:
    print('This is a new model, creating in model registry')
    parent_model = ''

if upload_model:
    model = aiplatform.Model.upload(
        display_name = f'{SERIES}_{EXPERIMENT}',
        model_id = f'model_{SERIES}_{EXPERIMENT}',
        parent_model =  parent_model,
        serving_container_image_uri = 'europe-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-13:latest',
        artifact_uri =r'gs://absa-classification/01/01/models/20231101142133/model',
        is_default_version = True,
        version_aliases = [RUN_NAME],
        version_description = RUN_NAME,
        labels = {'series' : f'{SERIES}', 'experiment' : f'{EXPERIMENT}', 'experiment_name' : f'{EXPERIMENT_NAME}', 'run_name' : f'{RUN_NAME}'}        
    )

Model Already in Registry:
Loading model as new default version.
Creating Model
Create Model backing LRO: projects/240414127532/locations/europe-west2/models/model_01_01/operations/8301443631592505344
Model created. Resource name: projects/240414127532/locations/europe-west2/models/model_01_01@2
To use this Model in another session:
model = aiplatform.Model('projects/240414127532/locations/europe-west2/models/model_01_01@2')


In [42]:
print(f'Review the model in the Vertex AI Model Registry:\nhttps://console.cloud.google.com/vertex-ai/locations/{REGION}/models/{model.name}?project={PROJECT_ID}')

Review the model in the Vertex AI Model Registry:
https://console.cloud.google.com/vertex-ai/locations/europe-west2/models/model_01_01?project=surveys-402414


In [45]:
#update model descriptions
expRun.log_params({
    'model.uri': model.uri,
    'model.display_name': model.display_name,
    'model.name': model.name,
    'model.resource_name': model.resource_name,
    'model.version_id': model.version_id,
    'model.versioned_resource_name': model.versioned_resource_name
})

In [46]:
#complete experiment run
expRun.update_state(state = aiplatform.gapic.Execution.State.COMPLETE)

In [47]:
exp = aiplatform.Experiment(experiment_name = EXPERIMENT_NAME)

In [48]:
exp.get_data_frame()

Unnamed: 0,experiment_name,run_name,run_type,state,param.series,param.project_id,param.model.uri,param.model.resource_name,param.model.display_name,param.experiment,param.model.save,param.model.versioned_resource_name,param.model.name,param.model.version_id,metric.test_accuracy
0,experiment-01-01-pytorch-classification-deberta,run-20231101142133,system.ExperimentRun,COMPLETE,1,surveys-402414,gs://absa-classification/01/01/models/20231101...,projects/240414127532/locations/europe-west2/m...,01_01,1,gs://sentiment_response/01/01/models/202310241...,projects/240414127532/locations/europe-west2/m...,model_01_01,2,0.759036


**Predicting term and response absa sentiment**

Going from pandas df with responses and terms, converting data types to get model outputs and then putting it back into the dataframe.

In [None]:
#preprocessing to pass into bert
df = pd.read_csv('emotion_append.csv')
df = df.dropna(subset = ['response','Term'])

df['nps'] = df['nps'].replace('10 (Extremely likely)',10)
df['nps'] = df['nps'].replace('0 (Not at all likely)',0)
df['nps'] = df['nps'].astype(int)

#target variable will nps split into demoters, passives and promoters
df['label'] = np.where(df['nps'] >= 9,2,
                  np.where(df['nps'] <= 6,0,1))

df['response'] = df['response'].apply(lambda x: text_preprocessing(x))
df['Term'] = df['Term'].apply(lambda x: text_preprocessing(x))

In [None]:
prob_df = df[['response','Term', 'label']]

ds = Dataset.from_pandas(prob_df)

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "yangheng/deberta-v3-large-absa-v1.1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["response"], example["Term"], truncation=True)


tokenized_datasets = ds.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)




Map:   0%|          | 0/3359 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
#detokenizing inputs
# token_output = []
# for tok in tokenized_datasets['input_ids']:
#   tokens = []
#   detokenized = tokenizer.decode(tok)
#   token_output.append(detokenized)

In [None]:
#tokenized_datasets = tokenized_datasets.remove_columns(["response", "Term"])
tokenized_datasets = tokenized_datasets.remove_columns(["response", "Term", "__index_level_0__"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")


In [None]:
from torch.utils.data import DataLoader

dataloader = DataLoader(
    tokenized_datasets, shuffle=True, batch_size=8, collate_fn=data_collator
)
for batch in  dataloader:
    break
{k: v.shape for k, v in batch.items()}

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 52]),
 'token_type_ids': torch.Size([8, 52]),
 'attention_mask': torch.Size([8, 52])}

In [None]:
# import torch

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model.to(device)
# device

In [None]:
all_logits = []
input_ids = []
for batch in dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    input_ids.append(batch['input_ids'])
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    all_logits.append(logits)

all_logits = torch.cat(all_logits, dim=0)
probs = torch.argmax(all_logits, dim=1).cpu().numpy()
probs

array([2, 0, 0, ..., 0, 2, 1])

In [None]:
token_output = []
for tok in tokenized_datasets['input_ids']:
  tokens = []
  detokenized = tokenizer.decode(tok)
  token_output.append(detokenized)

In [None]:
token_output = []
for batch in input_ids:
  for id in batch:
      token_output.append(tokenizer.decode(id))

In [None]:
token_output
check = pd.DataFrame(list(zip(token_output)), columns = ['detokenized'])
check.to_csv('check.csv')

In [None]:
df['aspect_sentiment'] = probs
df.to_csv('asba_append.csv')

In [None]:
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ticket_number,response,nps,csat,Term,Tier_2,Tier_1,entities,entity_type,entity_start,entity_end,num,emotion,label,aspect_sentiment
0,0,0,CSA03746,i think the driver was kevin. lgw to lhr. an a...,10,Very satisfied,driver,Staff,People,"{'entity': 'B-PER', 'score': 0.99710387, 'inde...",B-PER,23,28,1,neutral,2,2
1,1,16195,EUAJWU72,the trip from portsmouth to london was affecte...,7,Somewhat dissatisfied,delayed,Time,Service,"{'entity': 'B-PER', 'score': 0.9990357, 'index...",B-PER,125,132,1,joy,1,0
2,2,7926,EUADWE24,left on time and the driver (tom) was very pol...,10,Very satisfied,time,Time,Service,"{'entity': 'B-PER', 'score': 0.9992095, 'index...",B-PER,29,32,1,neutral,2,0
3,3,13909,EUAHLT03,exceptionally friendly & humorous drivers (not...,10,Very satisfied,journey,Journey,Service,"{'entity': 'B-PER', 'score': 0.9753795, 'index...",B-PER,79,81,1,joy,2,1
4,4,16196,EUAJWU72,the trip from portsmouth to london was affecte...,7,Somewhat dissatisfied,driver,Staff,People,"{'entity': 'B-PER', 'score': 0.9990357, 'index...",B-PER,125,132,1,joy,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3639,3639,6424,EUADAR88,"arrived at liverpool,the coach was delayed so ...",0,Very dissatisfied,arrived,Journey,Service,"{'entity': 'B-LOC', 'score': 0.8416962, 'index...",B-LOC,11,20,2,sadness,0,0
3640,3640,6423,EUADAR88,"arrived at liverpool,the coach was delayed so ...",0,Very dissatisfied,delayed,Time,Service,"{'entity': 'B-LOC', 'score': 0.8416962, 'index...",B-LOC,11,20,2,sadness,0,2
3641,3641,6422,EUADAR74,coach on time at heathrow and arrived on time ...,8,Very satisfied,arrived,Journey,Service,"{'entity': 'I-LOC', 'score': 0.9968671, 'index...",I-LOC,109,111,2,sadness,1,0
3642,3642,6481,EUADBM85,to get to london it took us 6 hours. then comi...,1,Very dissatisfied,hour,Time,Service,"{'entity': 'B-LOC', 'score': 0.99972266, 'inde...",B-LOC,10,16,2,surprise,0,2
