In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple, https://us

In [None]:
import pandas as pd
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import load_metric
from typing import Optional
import re
from dataclasses import dataclass
from collections import Counter
import os
import warnings
warnings.filterwarnings("ignore")
# from tqdm import tqdm
from tqdm.notebook import tqdm
import random

In [None]:

seed_val = 1234
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# os.environ['CUDA_VISIBLE_DEVICES'] = '1,2'
# os.environ['TRANSFORMERS_CACHE'] = '/data/users/kartik/hfcache/'
# os.environ['HF_HOME'] = '/data/users/kartik/hfcache/'
os.environ["WANDB_DISABLED"] = "true"



In [None]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/MyDrive/backup/GSR

Mounted at /gdrive
/gdrive/MyDrive/backup/GSR


In [None]:
# cols_used = ['da','gen_text','ref','genre']
cols_used = ['da','ref','gen_text','genre']


In [24]:
genres  = ['hotel','restaurant','laptop','tv']
df = pd.read_json(f'./RNNLG/{genres[0]}/train.json', orient='records')
df['genre'] = genres[0]
print ('genre:',genres[0],len(df))

for genre in genres[1:]:
    _df = pd.read_json(f'./RNNLG/{genre}/train.json', orient='records')
    _df['genre'] = genre
    df = pd.concat([df,_df])
    print ('genre: ',genre, len(_df),len(df))

df.columns = cols_used
df = pd.concat([df[['da','ref','genre']],df[['da','gen_text','genre']].rename(columns={'gen_text':'ref'})]) #gold 1 and 2 combined
df.head(2)


genre: hotel 3223
genre:  restaurant 3114 6337
genre:  laptop 7944 14281
genre:  tv 4221 18502


Unnamed: 0,da,ref,genre
0,inform_no_match(acceptscreditcards='no';pricer...,there are no pricey hotel -s that do not accep...,hotel
1,?confirm(pricerange='pricey'),do you want a pricey hotel,hotel


In [25]:
val_df = pd.read_json(f'./RNNLG/{genres[0]}/valid.json', orient='records')
val_df['genre'] = genres[0]
print ('genre:',genres[0],len(val_df))

for genre in genres[1:]:
    _df = pd.read_json(f'./RNNLG/{genre}/valid.json', orient='records')
    _df['genre'] = genre
    val_df = pd.concat([val_df,_df])
    print ('genre: ',genre, len(_df),len(val_df))

val_df.columns = cols_used
val_df = pd.concat([val_df[['da','ref','genre']],val_df[['da','gen_text','genre']].rename(columns={'gen_text':'ref'})])
print (len(val_df))
val_df.head(2)


genre: hotel 1075
genre:  restaurant 1039 2114
genre:  laptop 2649 4763
genre:  tv 1407 6170
12340


Unnamed: 0,da,ref,genre
0,goodbye(),goodbye ! i apologize if i wasn't of use,hotel
1,inform(name='grand hyatt san francisco';dogsal...,grand hyatt san francisco allows dogs,hotel


In [26]:
df['da'] = df['da'].apply(lambda x: x.split('(')[0])
val_df['da'] = val_df['da'].apply(lambda x: x.split('(')[0])
df['da'].value_counts()

inform               15172
recommend             9140
inform_count          4178
goodbye               2206
?request              1528
inform_no_match       1448
inform_only_match      730
?select                728
?confirm               690
?compare               590
?reqmore               228
inform_no_info         196
inform_all             128
suggest                 34
bye                      8
Name: da, dtype: int64

In [30]:
#request, select, goodbye remove
#collapse inform and recommend
def postproc_DAs(df):
  df = df[~df['da'].isin(('?request','?select','goodbye','bye'))]
  df['da'] = df['da'].apply(lambda x:'describe' if x in set(['inform','recommend']) else x)
  return df
df = postproc_DAs(df)
val_df = postproc_DAs(val_df)
df['da'].value_counts()

describe             24312
inform_count          4178
inform_no_match       1448
inform_only_match      730
?confirm               690
?compare               590
?reqmore               228
inform_no_info         196
inform_all             128
suggest                 34
Name: da, dtype: int64

In [31]:

idx_to_cr = list(set(df['da'].astype('str').tolist()))
label_dict = {val:idx for idx,val in enumerate(idx_to_cr)}
label_dict_inverse = {idx:val for idx,val in enumerate(idx_to_cr)}

print (df['da'].value_counts())

df['da'] = df['da'].astype('str').apply(lambda x:label_dict[x]) # convert labels to integers

# df = df.drop(columns = ['gen_text'])

describe             24312
inform_count          4178
inform_no_match       1448
inform_only_match      730
?confirm               690
?compare               590
?reqmore               228
inform_no_info         196
inform_all             128
suggest                 34
Name: da, dtype: int64


In [33]:
print (idx_to_cr)

['inform_no_match', 'describe', '?confirm', 'inform_only_match', 'inform_no_info', 'suggest', 'inform_all', '?compare', '?reqmore', 'inform_count']


In [32]:
val_df['da'] = val_df['da'].astype('str').apply(lambda x:label_dict[x])
# val_df = val_df.drop(columns = ['gen_text'])
val_df.head(2)

Unnamed: 0,da,ref,genre
1,1,grand hyatt san francisco allows dogs,hotel
2,1,"grand hyatt san francisco is pricey , the phon...",hotel


In [34]:
train_df = df
print ('Train dataset length: ',len(train_df))
print ('Valid dataset length: ',len(val_df))

from torch.utils.data import TensorDataset

model_checkpoint = ['xlm-roberta-base', 'distilbert-base-uncased','bert-base-uncased','albert-base-v2'][2]


tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# df['ref'].apply(lambda x:len(tokenizer.encode(x))).describe()


encoded_data_train = tokenizer.batch_encode_plus(
    train_df.ref.tolist(), 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=50, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    val_df.ref.tolist(), 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=50, 
    return_tensors='pt'
)


Train dataset length:  32534
Valid dataset length:  10876


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [35]:

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_df.da.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_df.da.values)   

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

print ('len(dataset_train), len(dataset_val)',len(dataset_train), len(dataset_val))

len(dataset_train), len(dataset_val) 32534 10876


In [36]:

num_class = len(idx_to_cr)
print (f'Num of classes: {num_class}')

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint,
                                                      num_labels= num_class,
                                                      output_attentions=False,
                                                      output_hidden_states=False)


Num of classes: 10


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [37]:

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)


from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)                                   
epochs = 3

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)                  
from sklearn.metrics import f1_score, accuracy_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, preds_flat),f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')      

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)                                            

cuda


In [38]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals


In [39]:
epochs=2
best_val_loss,best_epoch = float('inf'),1
patience, max_patience = 0,3
for epoch in tqdm(range(1, epochs+1)):
# for epoch in tqdm(range(1, 2)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    print (f'Saving checkpoints/da_classify_alldomain/{model_checkpoint}/epoch_{epoch}.model')
    torch.save(model.state_dict(), f'checkpoints/da_classify_alldomain/{model_checkpoint}/epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    train_loss, predictions, true_vals = evaluate(dataloader_train)
    train_acc, train_f1 = f1_score_func(predictions, true_vals)

    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Training Accuracy: {train_acc}')
    tqdm.write(f'Training F1: {train_f1}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_acc, val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_acc}')
    tqdm.write(f'Val F1: {val_f1}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    if val_loss < best_val_loss:
        patience = 0
        best_epoch = epoch
        best_val_loss = val_loss
    else:
        patience +=1

    print ('Patience',patience,'best_loss',best_val_loss,'val loss',val_loss)

    if patience == max_patience:
        break



  0%|          | 0/2 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/2034 [00:00<?, ?it/s]

Saving checkpoints/da_classify_alldomain/bert-base-uncased/epoch_1.model

Epoch 1
Training loss: 0.10936415517644778
Training Accuracy: 0.9966803958935267
Training F1: 0.9966278640567074
Validation loss: 0.027496483366720736
F1 Score (Weighted): 0.9952188304523722
Val F1: 0.9952192936762231
Patience 0 best_loss 0.027496483366720736 val loss 0.027496483366720736


Epoch 2:   0%|          | 0/2034 [00:00<?, ?it/s]

Saving checkpoints/da_classify_alldomain/bert-base-uncased/epoch_2.model

Epoch 2
Training loss: 0.01823617688410173
Training Accuracy: 0.9976639822954447
Training F1: 0.9976228301236346
Validation loss: 0.02733254118489533
F1 Score (Weighted): 0.9954027215888194
Val F1: 0.9953751687562387
Patience 0 best_loss 0.02733254118489533 val loss 0.02733254118489533


In [None]:
best_epoch  = 2
print (best_epoch)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint,
                                                      num_labels=num_class,
                                                      output_attentions=False,
                                                      output_hidden_states=False)


print (f'Loading checkpoints/da_classify_alldomain/{model_checkpoint}/epoch_{best_epoch}.model')
model.load_state_dict(torch.load(f'checkpoints/da_classify_alldomain/{model_checkpoint}/epoch_{best_epoch}.model'))
model.eval()
model.to(device)
print()

2


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Loading checkpoints/da_classify_alldomain/bert-base-uncased/epoch_2.model



In [40]:
model.eval()
print (2)

2


In [65]:
genre  = ['restaurant','hotel','laptop','tv'][2]
print (genre)
test_df = pd.read_json(f'./RNNLG/{genre}/test.json', orient='records')
test_df['genre'] = genre
test_df.columns = cols_used
test_df.head(2)

test_df['da'] = test_df['da'].apply(lambda x: x.split('(')[0])
test_df = postproc_DAs(test_df)
print (test_df['da'].value_counts())

# test_df = test_df[test_df['da'].isin(label_dict)]
test_df['da'] = test_df['da'].astype('str').apply(lambda x:label_dict[x])
test_df = test_df.drop(columns = ['gen_text'])
test_df.head(2)

laptop
describe             2133
inform_count          271
?compare               64
inform_only_match      59
inform_no_match        46
?confirm               34
inform_no_info         24
inform_all              8
suggest                 1
Name: da, dtype: int64


Unnamed: 0,da,ref,genre
0,6,based on the criteria all laptop -s with a sta...,laptop
1,1,the satellite zelus 40 is a sleek laptop with ...,laptop


In [66]:
print (genre)
encoded_data_test = tokenizer.batch_encode_plus(
    test_df.ref.tolist(), 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=50, 
    return_tensors='pt'
)


input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(test_df.da.values)

dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)


dataloader_test = DataLoader(dataset_test, 
                                   sampler=SequentialSampler(dataset_test), 
                                   batch_size=batch_size)

_, predictions, true_vals = evaluate(dataloader_test)

preds = [label_dict_inverse[i] for i in np.argmax(predictions, axis=1).flatten()]
true_vals = [label_dict_inverse[i] for i in true_vals]
# preds = [i if i!='nan' else 'none' for i in preds ]

pd.DataFrame(zip(range(len(test_df)),test_df['ref'],true_vals,preds),columns=['Id','ref','true','predicted']).to_csv(f'preds/bert_da_classify_{genre}.csv',index=None)

laptop


In [67]:
from sklearn.metrics import classification_report
print (classification_report(true_vals, preds))

                   precision    recall  f1-score   support

         ?compare       1.00      1.00      1.00        64
         ?confirm       0.92      1.00      0.96        34
         describe       0.99      1.00      1.00      2133
       inform_all       1.00      0.75      0.86         8
     inform_count       1.00      1.00      1.00       271
   inform_no_info       1.00      1.00      1.00        24
  inform_no_match       1.00      0.96      0.98        46
inform_only_match       0.98      0.73      0.83        59
          suggest       1.00      1.00      1.00         1

         accuracy                           0.99      2640
        macro avg       0.99      0.94      0.96      2640
     weighted avg       0.99      0.99      0.99      2640



In [None]:
# df[df['da']==7]['ref'].values.tolist()
# label_dict

In [47]:
from sklearn.metrics import confusion_matrix

confusion_matrix(true_vals,preds)

array([[ 27,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,  26,   0,   0,   0,   0,   1,   0,   0],
       [  0,   0, 971,   0,   0,   0,   0,   2,   0],
       [  0,   0,   0,  11,   0,   0,   1,   0,   0],
       [  0,   0,   0,   0, 265,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,  11,   0,   0,   0],
       [  0,   2,   0,   1,   0,   0,  45,   0,   0],
       [  0,   0,   7,   0,   0,   0,   0,  31,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   1]])

In [None]:
label_dict

{'suggest': 0,
 'bye': 1,
 'inform': 2,
 'inform_count': 3,
 '?request': 4,
 '?compare': 5,
 'inform_all': 6,
 'recommend': 7,
 'inform_no_info': 8,
 '?reqmore': 9,
 '?confirm': 10,
 'inform_only_match': 11,
 '?select': 12,
 'goodbye': 13,
 'inform_no_match': 14}

In [None]:
labels_present = [i for i in idx_to_cr if (i in true_vals or i in preds)]
# labels_present

## 2023 Exp

In [61]:
def process_labels(x):
  x = ('_').join(x.split())
  if x in ('inform','recommend'):
    return x
  elif x in ('confirm','request','compare','select','reqmore'):
    x = '?'+x
  return x

def inverse_process_labels(x):
  x = (' ').join(x.split('_'))
  x = x.replace('?','')
  return x

# filename = 'rnnlg_laptop_generated_wSACC_runDA'
filename = 'rnnlg_laptop_generated_new'

test_df = pd.read_csv(f'./exp2023/{filename}.csv')
test_df = test_df[['new_da','text']]
test_df.rename(columns={'text':'ref','new_da':'da'},inplace=True)
test_df['da'] = test_df['da'].apply(lambda x:process_labels(x))
test_df = postproc_DAs(test_df)
test_df['da'] = test_df['da'].astype('str').apply(lambda x:label_dict[x])

test_df.head(2)

Unnamed: 0,da,ref
0,6,All satellite pro laptops have a standard batt...
1,6,All satellite pro laptops have a standard batt...


In [49]:
test_df['da'].value_counts()

describe             9730
inform_count         2650
inform_no_match       480
inform_only_match     380
?compare              270
?confirm              270
inform_all            120
inform_no_info        110
suggest                10
Name: da, dtype: int64

In [56]:
def softmax(x):
  e_x = np.exp(x)
  return e_x/e_x.sum(axis=1).reshape(-1,1)

In [62]:
encoded_data_test = tokenizer.batch_encode_plus(
    test_df.ref.tolist(), 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=50, 
    return_tensors='pt'
)


input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(test_df.da.values)

dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)


dataloader_test = DataLoader(dataset_test, 
                                   sampler=SequentialSampler(dataset_test), 
                                   batch_size=batch_size)

_, predictions, true_vals = evaluate(dataloader_test)

probs = np.max(softmax(predictions), axis=1).flatten()
preds = [label_dict_inverse[i] for i in np.argmax(predictions, axis=1).flatten()]
preds_proc = list(map(inverse_process_labels,preds))

true_vals = [label_dict_inverse[i] for i in true_vals]

# preds = [i if i!='nan' else 'none' for i in preds ]

new_df = pd.DataFrame(zip(range(len(test_df)),test_df['ref'],true_vals,preds_proc,probs),columns=['Id','ref','true','predicted','DA_prob'])
new_df.to_csv(f'preds/{filename}.csv',index=None)


In [63]:
new_df.head()

Unnamed: 0,Id,ref,true,predicted,DA_prob
0,0,All satellite pro laptops have a standard batt...,inform_all,inform all,0.985253
1,1,All satellite pro laptops have a standard batt...,inform_all,inform all,0.985253
2,2,Our satellite pro laptops have a battery life ...,inform_all,inform all,0.769558
3,3,All our satellite pro laptops have the standar...,inform_all,inform all,0.985211
4,4,All satellite pro laptops have a rating of sta...,inform_all,inform all,0.985412


In [64]:
from sklearn.metrics import classification_report
print (classification_report(true_vals, preds))

                   precision    recall  f1-score   support

         ?compare       0.97      1.00      0.98       640
         ?confirm       0.80      0.99      0.89       340
         describe       0.99      1.00      0.99     21330
       inform_all       0.81      0.82      0.82        80
     inform_count       1.00      0.97      0.98      2710
   inform_no_info       0.99      0.99      0.99       240
  inform_no_match       0.87      0.90      0.89       460
inform_only_match       0.99      0.59      0.74       590
          suggest       0.42      1.00      0.59        10

         accuracy                           0.98     26400
        macro avg       0.87      0.92      0.88     26400
     weighted avg       0.99      0.98      0.98     26400



In [None]:
# from sklearn.metrics import confusion_matrix
# from matplotlib import pyplot as plt
# import seaborn as sn

# plt.figure(figsize=(8,5))
# df_cm = pd.DataFrame(confusion_matrix(true_vals,preds,labels=labels_present),index = labels_present, columns=labels_present)
# sn.heatmap(df_cm, annot=True,cmap="Blues")
# plt.show()

In [None]:
# human_df = pd.read_csv('./RNNLG/restaurant/rnnlg_resturant_5_shuffle_output.csv')
# print (len(human_df))
# if 'da' not in human_df.columns:
#     human_df['da'] = human_df['relations'].astype('str').apply(lambda x: x.split()[0].strip())

# human_df = human_df[human_df['Is it related to DA (perfect 3, 2, 1 means not really) ']>2.0]
# print (len(human_df))
# human_df.dropna(subset=['da'])
# human_df = human_df[human_df['da'].isin(idx_to_cr)]
# print (len(human_df))
# human_df

In [None]:
# human_df['da'] = human_df['da'].astype('str').apply(lambda x:label_dict[x])

In [None]:
# genre = 'restaurant'
# encoded_data_test = tokenizer.batch_encode_plus(
#     human_df.generated.tolist(), 
#     add_special_tokens=True, 
#     return_attention_mask=True, 
#     pad_to_max_length=True, 
#     max_length=50, 
#     return_tensors='pt'
# )


# input_ids_test = encoded_data_test['input_ids']
# attention_masks_test = encoded_data_test['attention_mask']
# labels_test = torch.tensor(human_df.da.values)

# dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)


# dataloader_test = DataLoader(dataset_test, 
#                                    sampler=SequentialSampler(dataset_test), 
#                                    batch_size=batch_size)

# _, predictions, true_vals = evaluate(dataloader_test)

# preds = [label_dict_inverse[i] for i in np.argmax(predictions, axis=1).flatten()]
# true_vals = [label_dict_inverse[i] for i in true_vals]
# # preds = [i if i!='nan' else 'none' for i in preds ]

# pd.DataFrame(zip(range(len(human_df)),human_df['generated'],true_vals,preds),columns=['Id','ref','true','predicted']).to_csv(f'preds/bert_da_classify_human_{genre}.csv',index=None)

In [None]:
# from sklearn.metrics import classification_report
# print (classification_report(true_vals, preds))

In [None]:
# from sklearn.metrics import confusion_matrix
# from matplotlib import pyplot as plt
# import seaborn as sn
# labels_present = [i for i in idx_to_cr if (i in true_vals or i in preds)]

# plt.figure(figsize=(8,5))
# df_cm = pd.DataFrame(confusion_matrix(true_vals,preds,labels=labels_present),index = labels_present, columns=labels_present)
# sn.heatmap(df_cm, annot=True,cmap="Blues")
# plt.show()