In [29]:
import pandas as pd
import numpy as np
import sys

pd.set_option("display.max_colwidth", None)

In [30]:
# Experiment switches read by the encoding cells further down.
seg_flag = True    # when set, token_type_ids are rebuilt by update_seg_embeddings
att_flag = False   # when set, attention_mask is rebuilt by update_attention_masks

# Label column used to build the train/val/test datasets.
# NOTE(review): `task` was referenced by the dataset cells but never assigned
# anywhere in this notebook; 'preferred' is the column encoded to 0/1/2, which
# matches num_labels = 3 -- confirm this is the intended target.
task = 'preferred'

In [31]:
# d = 'exp'
def _load_split(csv_path):
    """Read one data split and encode its label columns.

    'preferred' is mapped T -> 0, N -> 1, O -> 2 (other values are left
    untouched by ``replace``); 'comparison' is cast to int.
    """
    frame = pd.read_csv(csv_path)
    # Assign the result back instead of calling replace(..., inplace=True) on
    # the column selection: the chained in-place form does not update the
    # parent frame once pandas Copy-on-Write is active.
    frame['preferred'] = frame['preferred'].replace({'T':0, 'N':1,'O':2})
    frame['comparison'] = frame['comparison'].astype(int)
    return frame


train_set = _load_split('../data/train.csv')
test_set = _load_split('../data/test.csv')
val_set = _load_split('../data/val.csv')

In [22]:
# train_set.proc_sent

In [6]:
# pixie = pd.read_csv('../final_data/Pixie.csv')
# pixie = pixie[['Unnamed: 0', 'appId', 'mentions', 'mentioned_apps', 'current_app', 'sentence', 'masked_sent', 'comparison', 'preferred']]

In [7]:
# test_set['preferred'].value_counts()

In [8]:
# pixie = pixie.rename(columns = {'Unnamed: 0.1.1': 'sentId'})
# pixie = pixie.rename(columns = {'proc_sent': 'masked_sent'})

In [9]:
# pixie.to_csv('../final_data/Pixie.csv')

### Upsampling

In [10]:
def _balanced_resample(frame, n_per_class, seed=42):
    """Draw ``n_per_class`` rows for each 'preferred' label, then shuffle.

    Label 0 is drawn without replacement; labels 1 and 2 are drawn with
    replacement. Every draw and the final shuffle are seeded so that the
    resampled splits are reproducible across runs.
    """
    draws = [
        frame[frame['preferred'] == label].sample(
            n_per_class, replace=(label != 0), random_state=seed
        )
        for label in (0, 1, 2)
    ]
    stacked = pd.concat(draws, axis = 0)
    # Sampling every row without replacement is a shuffle.
    return stacked.sample(len(stacked), random_state=seed)


# NOTE: this cell overwrites the loaded splits, so re-running it resamples the
# already-resampled frames; reload the data before running it again.
# NOTE(review): the validation and test splits are resampled as well, which
# changes the class distribution that metrics are reported on -- confirm intended.
train_set = _balanced_resample(train_set, 2400)
val_set = _balanced_resample(val_set, 800)
test_set = _balanced_resample(test_set, 800)

### Methods to create segment embeddings

In [32]:
import torch

class getDataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer encodings and a parallel sequence of labels."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-example indexable values
        # labels: per-example targets, indexed in step with the encodings
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """The dataset size is the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field at ``idx`` as a tensor, plus the label under 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
    
def update_seg_embeddings(tokenizer, encodings):
    """Overwrite ``encodings['token_type_ids']`` with ids derived from the input ids.

    A position gets segment 0 when its input id equals
    ``tokenizer.vocab_size + 1`` or 0, and segment 1 otherwise. The mapping is
    updated in place and also returned.

    NOTE(review): ``vocab_size + 1`` is presumably the id of the second token
    registered with ``add_tokens`` and 0 the padding id -- both are hard-coded
    here, so confirm they hold for every tokenizer in use.
    """
    all_input_ids = encodings['input_ids']
    existing_segments = encodings['token_type_ids']
    marker_id = tokenizer.vocab_size + 1

    rebuilt = []
    # The existing segment ids are only paired with the input ids (zip stops at
    # the shorter of the two); their values are never read.
    for row_ids, _ in zip(all_input_ids, existing_segments):
        row = []
        for tok in row_ids:
            row.append(0 if tok in (marker_id, 0) else 1)
        rebuilt.append(row)

    encodings['token_type_ids'] = rebuilt
    return encodings
#train_encodings_ = update_seg_embeddings(tokenizer, train_encodings)


def update_attention_masks(tokenizer, encodings):
    """Overwrite ``encodings['attention_mask']`` with values derived from the input ids.

    Input ids equal to ``tokenizer.vocab_size`` or ``tokenizer.vocab_size + 1``
    become 0.50, an input id of 0 stays 0, and every other id becomes 1. The
    mapping is updated in place and also returned.

    NOTE(review): the two special ids are presumably the tokens registered with
    ``add_tokens`` and 0 the padding id; also confirm that the model treats a
    fractional mask value the way this weighting intends.
    """
    all_input_ids = encodings['input_ids']
    existing_masks = encodings['attention_mask']

    upper_id = tokenizer.vocab_size + 1
    lower_id = upper_id - 1

    def _weight(tok):
        # Special ids are down-weighted; 0 (and a literal 0.50) pass through
        # unchanged; everything else is fully attended.
        if tok == upper_id or tok == lower_id:
            return 0.50
        if tok == 0.50 or tok == 0:
            return tok
        return 1

    rebuilt = []
    # The existing masks only bound the iteration (zip stops at the shorter input).
    for row_ids, _ in zip(all_input_ids, existing_masks):
        rebuilt.append([_weight(tok) for tok in row_ids])

    encodings['attention_mask'] = rebuilt
    return encodings

#train_encodings_ = update_attention_masks(tokenizer, train_encodings)

#### Change flags here for appropriate settings

In [39]:
# Checkpoints this notebook can fine-tune; pick one by index.
model_list = ['bert-base-cased', 'xlnet-base-cased', 'albert-base-v2', 'microsoft/deberta-base']
cmodel = model_list[0]     # Chosen model
save = False   # Change to True if you wish to save the model after training
training_flag = True   ### Change to False to use the saved model (you will have to train the model once and save it, or the code will complain saying 'no model found')

# Path of the saved model; runs with rebuilt segment ids get their own suffix.
filename = "models/" + str(cmodel) + ('_seg_emb' if seg_flag else '')

In [40]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

# Sequence classifier with a 3-way output head on top of the chosen checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(cmodel, num_labels = 3)
tokenizer = AutoTokenizer.from_pretrained(cmodel)
# Register the two entity placeholder tokens, then resize the model's token
# embeddings to the enlarged tokenizer length.
tokenizer.add_tokens(['<current_entity>', '<other_entity>'])
model.resize_token_embeddings(len(tokenizer))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(28998, 768)

In [41]:
# print(tokenizer.tokenize(train_set['proc_sent'][0]))
# encoding = tokenizer([train_set['proc_sent'][0]], truncation=True, padding='max_length', max_length=100)
# encoding.keys()

### Encode train and validation datasets

In [42]:
def _encode_split(frame):
    """Tokenize a split's 'proc_sent' column and apply the enabled input tweaks.

    Sentences are truncated/padded to 100 tokens. When ``seg_flag`` is set the
    segment ids are rebuilt; when ``att_flag`` is set the attention mask is rebuilt.
    """
    encodings = tokenizer(frame['proc_sent'].tolist(), truncation=True, padding='max_length', max_length=100)
    if seg_flag:
        encodings = update_seg_embeddings(tokenizer, encodings)
    if att_flag:
        encodings = update_attention_masks(tokenizer, encodings)
    return encodings


train_encodings = _encode_split(train_set)
val_encodings = _encode_split(val_set)

# Pair each split's encodings with the label column selected by `task`.
train_dataset = getDataset(train_encodings, train_set[task].tolist())
val_dataset = getDataset(val_encodings, val_set[task].tolist())

In [43]:
# print(tokenizer.tokenize(train_set['proc_sent'][0]))

### Training a model

In [44]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters handed to the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=20,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=1000,
    report_to="wandb"
)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
)

# Fine-tune only when requested; with the flag off, a later cell loads
# previously saved weights from `filename` instead.
if training_flag:
    trainer.train()



Step,Training Loss
1000,0.44
2000,0.0325
3000,0.0037




### Save the trained model

In [25]:
#trainer.evaluate()

if save:
    # Save under the path built in the configuration cell. The load cell reads
    # the same `filename`, so a saved model can be found again on a later run
    # with training_flag = False (a separately hard-coded save path could not be).
    trainer.save_model(filename)

In [26]:
# import torch
# from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer

# Reuse a previously saved model instead of the one trained in this session.
if not training_flag:

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = AutoModelForSequenceClassification.from_pretrained(filename, num_labels = 3)
    model.to(device)
    print("Model loaded")

    # Inference-only trainer wrapped around the loaded weights.
    trainer = Trainer(model=model)

Model loaded


In [45]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Encode the test sentences the same way as the training data.
test_encodings = tokenizer(test_set['proc_sent'].tolist(), truncation=True, padding='max_length', max_length=100)

if seg_flag:
    test_encodings = update_seg_embeddings(tokenizer, test_encodings)

if att_flag:
    test_encodings = update_attention_masks(tokenizer, test_encodings)

test_dataset = getDataset(test_encodings, test_set[task].tolist())

# Predicted class = index of the largest score along axis 1.
outputs = trainer.predict(test_dataset)
y_pred = outputs.predictions.argmax(1)
y_test = test_dataset.labels

print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))
print(round(f1_score(y_test, y_pred, average='micro'),4))

              precision    recall  f1-score   support

           0     0.8317    0.8504    0.8409       802
           1     0.6174    0.5046    0.5554       323
           2     0.7680    0.8162    0.7914       653

    accuracy                         0.7750      1778
   macro avg     0.7390    0.7238    0.7292      1778
weighted avg     0.7694    0.7750    0.7709      1778

[[682  46  74]
 [ 73 163  87]
 [ 65  55 533]]
0.775


In [None]:
#train_dataset[0]

In [None]:
# Keep this run's predicted class ids next to the test rows, in a column named
# after the checkpoint id held in `cmodel`.
test_set[cmodel] = y_pred

In [None]:
# Shorten checkpoint-named prediction columns to plain model names.
test_set = test_set.rename(columns={
    'bert-base-cased': 'bert',
    'xlnet-base-cased': 'xlnet',
    'albert-base-v2': 'albert',
    'microsoft/deberta-base': 'deberta',
})

In [None]:
# Predictions stored in a separate results file, to be merged with this run's.
predictions = pd.read_csv(filepath_or_buffer='results/ed-gat_pred.csv')

In [None]:
# Expose the unnamed column 'Unnamed: 0.1' as the merge key 'id'.
predictions = predictions.rename({'Unnamed: 0.1': 'id'}, axis='columns')

In [None]:
# Expose the unnamed column 'Unnamed: 0' as the merge key 'id'.
test_set = test_set.rename({'Unnamed: 0': 'id'}, axis='columns')

In [None]:
# Join this notebook's predictions with the loaded ones on the shared 'id' key.
final_pred = pd.merge(test_set, predictions, on='id')

In [None]:
# Keep the id, the left-hand ('_x') copies of the shared columns, and every
# prediction column.
final_pred = final_pred[[
    'id',
    'sentId_x',
    'appId_x',
    'mentions_x',
    'mentioned_apps_x',
    'current_app_x',
    'comparison_x',
    'preferred_x',
    'sentence_x',
    'proc_sent_x',
    'deberta_seg',
    'albert_seg',
    'xlnet_seg',
    'bert_seg',
    'deberta',
    'albert',
    'xlnet',
    'bert',
    'pred_labs_edgat',
]]

In [None]:
# Drop the '_x' suffix that merge adds to overlapping left-hand columns.
# The pattern is anchored to the end of the name so that a column which merely
# contains '_x' somewhere in the middle is left intact.
final_pred.columns = final_pred.columns.str.replace(r'_x$', '', regex=True)

In [None]:
# Persist the combined table. The frame's index is written as well (no
# index=False), so the file gains a leading unnamed column.
final_pred.to_csv(path_or_buf='results/combined_predictions.csv')

In [None]:
# Display the model path currently held in `filename`.
filename

### Wrong Predictions

In [None]:
def get_wrong_labeled_sent(test_set, y_pred, y_test):
    """Return the rows of ``test_set`` whose prediction differs from the true label.

    Parameters
    ----------
    test_set : pd.DataFrame
        Frame whose rows correspond positionally to ``y_pred`` / ``y_test``.
    y_pred, y_test : sequence
        Predicted and true labels, one per row of ``test_set``.

    Returns
    -------
    pd.DataFrame
        A copy of the misclassified rows (original index preserved) with two
        extra columns, 'y_true' and 'y_pred'. ``test_set`` is not modified.

    Raises
    ------
    ValueError
        If the three inputs do not have the same length.
    """
    y_pred = np.asarray(y_pred)
    y_test = np.asarray(y_test)
    if not (len(test_set) == len(y_pred) == len(y_test)):
        raise ValueError(
            "test_set, y_pred and y_test must have the same length, got "
            f"{len(test_set)}, {len(y_pred)} and {len(y_test)}"
        )

    # Positions (not index labels) of the misclassified rows.
    wrong_pos = np.flatnonzero(y_pred != y_test)

    # Copy so the new columns are not written into a slice of the caller's frame.
    wrong = test_set.iloc[wrong_pos].copy()
    # Assign plain arrays: assigning a Series would align on index labels and
    # scramble the values whenever test_set does not carry a 0..n-1 index.
    wrong['y_true'] = y_test[wrong_pos]
    wrong['y_pred'] = y_pred[wrong_pos]

    return wrong

# y_pred and y_test were computed on the test split, so the misclassified rows
# must be looked up in test_set (not train_set).
wrong_preds = get_wrong_labeled_sent(test_set, y_pred, y_test)
wrong_preds.shape