In [None]:
import pandas as pd
import numpy as np
import sys

In [None]:
# Name of the DataFrame column used as the classification label throughout
# the notebook (read via df[task] / <split>[task] in later cells).
task = 'preferred'

In [None]:
# Load the labeled dataset into a DataFrame. The path is relative to the
# notebook's working directory. Later cells read the 'proc_sent' column and
# the column named by `task` from this frame.
df = pd.read_csv('../data/labeled_data/CPC_labeled_proc_dataset.csv')

In [None]:
# Pull the sentences and their labels out of the frame as plain Python lists,
# then mirror them as NumPy arrays (X = inputs, y = targets).
input_text = df['proc_sent'].tolist()
y_true = df[task].tolist()

X, y = np.array(input_text), np.array(y_true)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

def get_train_test_val_split(df):
    """Create a stratified 60/20/20 train/validation/test split of ``df``.

    The label column is the module-level ``task``; the text column is
    ``'proc_sent'``. Both splits use ``random_state=42``.

    Side effects:
        Saves three ``.npy`` files under ``../final_data/pref/`` holding the
        row positions of each split. All saved positions are relative to
        ``df`` itself, so they can be applied directly with ``df.iloc[...]``
        (which is what ``get_data`` does).

    Args:
        df: DataFrame containing the ``'proc_sent'`` column and the column
            named by ``task``.

    Returns:
        Tuple ``(train_set, valid_set, test_set)`` of DataFrames.
    """
    texts = np.array(df['proc_sent'].tolist())
    labels = np.array(df[task].tolist())

    # First split: 60% train, 40% held out for test + validation.
    outer_split = StratifiedShuffleSplit(n_splits=1, test_size=0.40, random_state=42)
    train_index, test_valid_index = next(outer_split.split(texts, labels))
    train_set = df.iloc[train_index]
    test_valid_set = df.iloc[test_valid_index]

    # Second split: halve the held-out part into test and validation.
    # The positions returned here are relative to test_valid_set, NOT to df.
    inner_split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
    test_pos, valid_pos = next(inner_split.split(test_valid_set, test_valid_set[task]))
    test_set = test_valid_set.iloc[test_pos]
    valid_set = test_valid_set.iloc[valid_pos]

    # Translate the held-out-relative positions back into positions within df.
    # Saving test_pos/valid_pos directly would make df.iloc[...] pick the wrong
    # rows on reload (including rows that belong to the training set).
    test_index = test_valid_index[test_pos]
    valid_index = test_valid_index[valid_pos]

    np.save('../final_data/pref/train_index.npy', train_index)
    np.save('../final_data/pref/test_index.npy', test_index)
    np.save('../final_data/pref/val_index.npy', valid_index)

    return train_set, valid_set, test_set
        
def get_data(task):
    """Rebuild the train/validation/test frames from saved index files.

    When ``task`` equals ``'csi'`` the indices are read from
    ``../final_data/comp/``; for any other value they are read from
    ``../final_data/pref/``. The indices are applied positionally
    (``iloc``) to the module-level ``df``.

    Returns:
        Tuple ``(train_set, val_set, test_set)`` of DataFrames.
    """
    comp_files = (
        '../final_data/comp/train_index.npy',
        '../final_data/comp/test_index.npy',
        '../final_data/comp/val_index.npy',
    )
    pref_files = (
        '../final_data/pref/train_index.npy',
        '../final_data/pref/test_index.npy',
        '../final_data/pref/val_index.npy',
    )
    train_file, test_file, val_file = comp_files if task == 'csi' else pref_files

    train_index = np.load(train_file)
    test_index = np.load(test_file)
    val_index = np.load(val_file)

    return df.iloc[train_index], df.iloc[val_index], df.iloc[test_index]
    
# 'cpc' is not 'csi', so get_data takes its else-branch and reads the index
# files stored under ../final_data/pref/.
train_set, val_set, test_set  = get_data('cpc')

In [None]:
# Display the (rows, columns) shape of each split as a quick sanity check.
train_set.shape, val_set.shape, test_set.shape

In [None]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Every split is tokenized with the same settings: truncate and pad each
# sentence to a fixed length of 75 tokens.
_encode_kwargs = dict(truncation=True, padding='max_length', max_length=75)

train_encodings = tokenizer(train_set['proc_sent'].tolist(), **_encode_kwargs)
val_encodings = tokenizer(val_set['proc_sent'].tolist(), **_encode_kwargs)
test_encodings = tokenizer(test_set['proc_sent'].tolist(), **_encode_kwargs)

In [None]:
import torch

class IMDbDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence
    (anything exposing ``.items()`` whose values can be indexed by position);
    ``labels`` is an indexable sequence of the same length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field at position idx as a
        # tensor, plus the label stored under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings together with its labels (the `task` column),
# in the order train, validation, test.
train_dataset, val_dataset, test_dataset = (
    IMDbDataset(encodings, split[task].tolist())
    for encodings, split in (
        (train_encodings, train_set),
        (val_encodings, val_set),
        (test_encodings, test_set),
    )
)

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Training configuration for the Hugging Face Trainer. Checkpoints go to
# ./results and logs to ./logs (both relative to the working directory).
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=20,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=50,
)

# Start from the pretrained uncased DistilBERT checkpoint with a 3-way
# classification head.
# NOTE(review): num_labels=3 is hard-coded; confirm it matches the number of
# distinct values in df[task].
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels = 3)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset          # evaluation dataset
)

# Run fine-tuning with the arguments above.
trainer.train()

In [None]:
#trainer.evaluate()
# Persist the fine-tuned model under models/distilbert_loader_pref (relative
# to the working directory) so it can be reloaded from that path later.
trainer.save_model("models/distilbert_loader_pref")

In [None]:
#config = .from_pretrained("./wandb/run-20210928_120505-2kfw8md3")

In [None]:
import torch
from transformers import DistilBertForSequenceClassification, DistilBertForTokenClassification

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# from transformers import DistilBertTokenizerFast
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

# Reload the fine-tuned classifier from the directory written by
# trainer.save_model(...). The sequence-classification class is used so the
# classification head is restored together with the encoder; the previously
# used `AutoModel` was never imported and would not load the head.
# NOTE(review): rebinding `model` here does not change the model object the
# existing `trainer` was constructed with — confirm whether later cells should
# predict with this reloaded model instead.
model = DistilBertForSequenceClassification.from_pretrained("models/distilbert_loader_pref")
model.to(device)
print("Model loaded")

In [None]:
# Run the trainer's model over the test dataset.
outputs = trainer.predict(test_dataset)
# Predicted class = index of the largest value along axis 1 of the predictions
# (presumably one row of class scores per example — TODO confirm).
y_pred = outputs.predictions.argmax(1)
# Ground-truth labels exactly as stored on the dataset object.
y_test = test_dataset.labels

from sklearn.metrics import classification_report, confusion_matrix
# Print the classification report and the confusion matrix for the test split.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

In [None]:
def get_wrong_labeled_sent(test_df=None, predictions=None, labels=None):
    """Return the test rows whose predicted label differs from the true label.

    Args:
        test_df: DataFrame of test examples. Defaults to the module-level
            ``test_set``.
        predictions: Sequence of predicted labels, one per row of ``test_df``
            in the same order. Defaults to the module-level ``y_pred``.
        labels: Sequence of true labels, one per row of ``test_df`` in the
            same order. Defaults to the module-level ``y_test``.

    Returns:
        A new DataFrame holding only the misclassified rows of ``test_df``
        (original index labels preserved) with two extra columns,
        ``'y_true'`` and ``'y_pred'``. ``test_df`` itself is not modified.

    Raises:
        ValueError: if the three inputs do not have the same length.
    """
    frame = test_set if test_df is None else test_df
    predicted = np.asarray(y_pred if predictions is None else predictions)
    actual = np.asarray(y_test if labels is None else labels)

    if not (len(frame) == len(predicted) == len(actual)):
        raise ValueError(
            "test_df, predictions and labels must have the same length"
        )

    wrong_mask = predicted != actual

    # Select by position and copy, so the new columns are written to an
    # independent frame rather than to a slice of the test set.
    wrong = frame.iloc[np.flatnonzero(wrong_mask)].copy()

    # Assign plain arrays (positional), not Series: the test frame keeps the
    # index labels of the full dataset, so index-aligned assignment from a
    # 0..n-1 indexed Series would attach labels to the wrong rows or give NaN.
    wrong['y_true'] = actual[wrong_mask]
    wrong['y_pred'] = predicted[wrong_mask]

    return wrong

#wrong_preds = get_wrong_labeled_sent()
#wrong_preds.shape
#wrong_preds.to_csv('results/wrong_pred_pref.csv')