In [None]:
import pandas as pd
import numpy as np
import codecs


In [None]:
# Synthetic messages: read from a UTF-16 CSV (first column used as the index)
# and label every row with target 'YES'.
df_synthetic = (
    pd.read_csv("output_messages.csv", encoding='utf-16', index_col=0)
    .assign(target='YES')
)


In [None]:
def for_emoji(message):
    """Turn backslash-escaped text (e.g. ``\\ud83d\\ude00``) into real characters.

    Steps:
      1. Remove every literal two-character sequence backslash + ``n``.
      2. Interpret the remaining backslash escapes (``\\uXXXX``, ``\\xXX``, ...).
      3. Merge UTF-16 surrogate pairs produced by step 2 into single code
         points, so escaped emoji become proper characters.

    Args:
        message: The raw message. Non-string values (for example the NaN that
            pandas uses for an empty cell) are returned unchanged so that a
            later ``dropna()`` can deal with them.

    Returns:
        The decoded string, or ``message`` itself when it is not a string.

    Raises:
        UnicodeDecodeError: If the text contains a malformed escape sequence
            or an unpaired surrogate.
    """
    if not isinstance(message, str):
        return message

    message = message.replace('\\n', '')
    # Going through latin-1 with 'backslashreplace' keeps characters that are
    # already real non-ASCII text (Cyrillic, accents, emoji) intact: they are
    # turned into escapes first and restored by 'unicode_escape'. Decoding the
    # str directly would re-interpret its UTF-8 bytes as latin-1 (mojibake).
    message = message.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    # 'surrogatepass' lets the lone surrogates from step 2 through the encoder;
    # the UTF-16 decoder then combines each pair into one character.
    message = message.encode('utf-16', 'surrogatepass').decode('utf-16')
    return message


In [None]:
# Original messages: read only the 'message' column, decode escaped characters
# with for_emoji, rename the column to 'messages' and label every row 'NO'.
df_original = (
    pd.read_excel(
        "/kaggle/input/clustering-sentences/prep_for_clustering.xlsx",
        usecols=['message'],
    )
    .assign(message=lambda frame: frame['message'].apply(for_emoji))
    .rename(columns={'message': 'messages'})
    .assign(target='NO')
)


In [None]:
# Combine both sources, drop incomplete rows, then shuffle.
# - dropna() runs before the final reset_index so the resulting index is a
#   gap-free 0..n-1 range.
# - random_state makes the shuffle (and therefore the later train/test split)
#   reproducible across kernel restarts.
# NOTE(review): dropna() removes a row if ANY column is NaN; this assumes both
# frames share exactly the same columns - confirm for output_messages.csv.
df_all = (
    pd.concat([df_original, df_synthetic], ignore_index=True)
    .dropna()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [None]:
# First 90% of the (already shuffled) rows for training, the rest for testing.
# Plain positional slicing is used instead of np.split(df_all, ...): np.split
# on a DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
split_at = int(.9 * len(df_all))
train, test = df_all.iloc[:split_at], df_all.iloc[split_at:]


In [None]:
from transformers import GPT2Tokenizer


In [None]:
# Load the pretrained 'gpt2' tokenizer and reuse its end-of-sequence token as
# the padding token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize all training messages in one call, with padding and truncation on.
train_sentences = train['messages'].tolist()
train_encodings = tokenizer(train_sentences, truncation=True, padding=True)


In [None]:
import torch
from torch.utils.data import DataLoader


In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of index-aligned sequences.

    ``encodings`` is any mapping (a tokenizer's output or a plain ``dict``)
    whose values can be indexed by sample position and which contains an
    ``"input_ids"`` entry. Sample ``i`` is a dict holding, for every key, the
    ``i``-th element converted to a ``torch.Tensor``.
    """

    def __init__(self, encodings):
        """Keep a reference to ``encodings``; nothing is copied."""
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``{key: tensor}`` over every key in the mapping."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Number of samples, taken from the length of the ``"input_ids"`` entry."""
        # Key lookup (rather than attribute access) also works for plain dicts.
        return len(self.encodings["input_ids"])
   

In [None]:
def add_label(encodings, targets=None):
    """Attach binary labels to ``encodings`` in place under the key ``'label'``.

    Args:
        encodings: Mutable mapping to update (e.g. the tokenizer output).
        targets: Optional sequence of target strings, one per sample. When
            omitted, the ``target`` column of the global ``train`` frame is
            used, which is the original behaviour of this function.

    Raises:
        ValueError: If ``encodings`` has an ``'input_ids'`` entry whose length
            differs from the number of targets.

    The stored value is an integer NumPy array: 1 where the target equals
    ``"YES"`` and 0 otherwise.
    """
    if targets is None:
        targets = train['target'].values
    labels = (np.asarray(targets) == "YES").astype(int)

    # Fail early on misaligned labels instead of on some later batch.
    if "input_ids" in encodings and len(encodings["input_ids"]) != len(labels):
        raise ValueError(
            "number of labels (%d) does not match number of encoded samples (%d)"
            % (len(labels), len(encodings["input_ids"]))
        )

    encodings.update({'label': labels})

# Add the 'label' entry to the training encodings (the mapping is updated in place).
add_label(train_encodings)


In [None]:
import sklearn.model_selection
from sklearn.metrics import classification_report, zero_one_loss


In [None]:
train_data = Dataset(train_encodings)
# Split the training samples into train / validation parts. No test_size is
# given, so scikit-learn's default proportion applies. random_state pins the
# split so validation metrics are comparable between runs.
train_dataset, val_dataset = sklearn.model_selection.train_test_split(
    train_data, random_state=42
)


In [None]:
from transformers import GPT2ForSequenceClassification, GPT2Config


In [None]:
# Pretrained 'gpt2' configuration with num_labels=2 (binary classification),
# then the pretrained 'gpt2' weights loaded into a sequence-classification
# model built from that configuration.
model_config = GPT2Config.from_pretrained('gpt2', num_labels=2)
model = GPT2ForSequenceClassification.from_pretrained('gpt2', config=model_config)


In [None]:
# Use the end-of-sequence id as the model's padding id, mirroring the
# tokenizer setup where pad_token is set to eos_token.
model.config.pad_token_id = model.config.eos_token_id

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine (or session) without CUDA instead of failing at model.to(device).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


In [None]:
batch_size = 8

# Validation batches keep their order; only the training loader shuffles.
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)


In [None]:
import matplotlib.pyplot as plt 
from transformers import AdamW


In [None]:
# Per-epoch history used by the plotting cell.
history_accuracy = []   # running mean of validation accuracy over the epochs so far
history_train = []      # mean training loss per epoch
history_eval = []       # validation zero-one loss (misclassification rate) per epoch

optim = AdamW(model.parameters(), lr=1e-6)

n_epochs = 5
k = 0        # number of finished epochs
score = 0    # sum of per-epoch validation accuracies

for epoch in range(n_epochs):
    print('Эпоха : ', epoch)
    our_res = []    # predicted class ids on the validation set
    true_res = []   # reference class ids on the validation set

    total_train_loss = 0

    # ------------------------------ training ------------------------------
    model.train()
    step_train = 0
    print('train')
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']
        total_train_loss += loss.item()
        loss.backward()
        # Gradient clipping only has an effect between backward() and step().
        # It used to sit in the validation loop, where it changed nothing.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step()

        step_train += 1
        if step_train % 100 == 0:
            print("train_step : ", step_train / len(train_loader) * 100, '%')

    avg_train_loss = total_train_loss / len(train_loader)

    # ----------------------------- validation -----------------------------
    model.eval()
    step_eval = 0
    print('validation')
    for batch in val_loader:
        e_input_ids = batch['input_ids'].to(device)
        e_attention_mask = batch['attention_mask'].to(device)
        e_labels = batch['label'].to(device)

        with torch.no_grad():
            e_outputs = model(e_input_ids, token_type_ids=None,
                              attention_mask=e_attention_mask,
                              labels=e_labels)

        e_labels_logits = e_outputs['logits'].detach().cpu().numpy()

        # One vectorised argmax / one device transfer per batch instead of a
        # Python loop over individual samples.
        our_res.extend(np.argmax(e_labels_logits, axis=1).tolist())
        true_res.extend(e_labels.cpu().tolist())

        step_eval += 1
        if (step_eval % 50 == 0):
            print("eval_step : ", step_eval / len(val_loader) * 100, "%")

    # Misclassification rate over the whole validation set, computed once.
    # Previously zero_one_loss was re-evaluated on the growing lists after
    # every batch and those running values were averaged, which was quadratic
    # in the number of batches and over-weighted the early batches.
    avg_eval_loss = zero_one_loss(true_res, our_res)

    report = classification_report(true_res, our_res, output_dict=True)
    print(report)

    k += 1
    score += report['accuracy']
    avg_score = score / k   # mean accuracy over all epochs finished so far

    print("Средняя оценка на валидации: ", avg_score)

    history_accuracy.append(avg_score)
    history_train.append(avg_train_loss)
    history_eval.append(avg_eval_loss)
    


In [None]:
# plt.plot(figsize=...) draws nothing and does not size the figure; the figure
# has to be created with plt.figure for figsize to take effect.
plt.figure(figsize=(12, 6))

plt.plot(history_accuracy, label="accuracy")
plt.plot(history_train, label="train loss")
plt.plot(history_eval, label="eval loss")

plt.title("loss and score")
plt.xlabel("Epoch")
plt.ylabel("Score")
plt.legend()

plt.show()


In [None]:
# Tokenize the held-out messages with the same tokenizer settings as training.
test_sentences = test['messages'].tolist()
test_encodings = tokenizer(test_sentences, truncation=True, padding=True)

# Reference labels for the test set: 1 where target is "YES", otherwise 0.
true_test_res = (test['target'].values == "YES").astype(int)


In [None]:
# Wrap the test encodings; the loader is not shuffled, so predictions come
# out in the same order as true_test_res.
test_dataset = Dataset(test_encodings)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
)


In [None]:
# Inference on the held-out test set.
model.eval()
print('test')
step_test = 0
predictions = []   # never filled in this cell; kept so the name stays defined
test_res = []      # predicted class id per test sample, in loader order

for batch in test_loader:
    t_input_ids = batch['input_ids'].to(device)
    t_attention_mask = batch['attention_mask'].to(device)

    with torch.no_grad():
        t_outputs = model(t_input_ids, token_type_ids=None,
                          attention_mask=t_attention_mask)

    t_label_logits = t_outputs['logits'].detach().cpu().numpy()

    # Predicted class = index of the largest logit in each row.
    test_res.extend(np.argmax(t_label_logits, axis=1).tolist())

    # The clip_grad_norm_ call that used to be here was removed: no backward
    # pass or optimizer step happens during inference, so it changed nothing.

    step_test += 1
    if (step_test % 50 == 0):
        print("test_step : ", step_test / len(test_loader) * 100, "%")


report = classification_report(true_test_res, test_res, output_dict=True)
print(report)


In [None]:
# Write the fine-tuned model to disk.
# NOTE(review): the input data is read from /kaggle/input/..., so the intended
# output directory may be /kaggle/working/ rather than /working/ - confirm.
# NOTE(review): the tokenizer is not saved alongside the model - confirm that
# this is intentional.
model.save_pretrained("/working/")
