In [1]:
import numpy as np
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel
from torch.optim import Adam, lr_scheduler
import torch.nn.functional as F

import os
import argparse
import json
import torch.nn as nn

from util import *
from losses import SupConLoss
from augment import *


In [2]:
class SupConRobertaNet(nn.Module):
    """backbone + projection head"""
    def __init__(self, embedding_dim=768, feat_dim=64, num_class=10):
        super(SupConRobertaNet, self).__init__()
#         model_fun, dim_in = model_dict[name]
        self.embedding_dim = embedding_dim
        self.feat_dim = feat_dim
        self.num_class = num_class
        print(num_class)
        self.encoder = RobertaModel.from_pretrained('notebooks/pretrained_without_wiki/')
#         self.encoder = model_fun()
        self.projection = nn.Sequential(
            nn.Linear(self.embedding_dim, self.embedding_dim),
            nn.ReLU(inplace=True),
            nn.Linear(self.embedding_dim, self.feat_dim)
        )
        self.fc =  nn.Linear(self.embedding_dim, self.num_class)

    def forward(self, iscontra, x ):
        if iscontra == True :
            r = self.encoder(x)
            r = r[0][:,0,:]
            z = F.normalize(self.projection(r), dim=1)
            return z
        else :
            r = self.encoder(x)
            r = r[0][:,0,:]
            r = self.fc(r)
            return r
           

In [3]:
class PetDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx, 2]
        label = self.df.iloc[idx, 4]
#         text = self.df.text
#         lable = self.df.label_id
        return text, label

In [4]:
train_df = pd.read_csv('notebooks/files/train2.csv')
# val_df = pd.read_csv('files/val.csv')
test_df = pd.read_csv('notebooks/files/test2.csv')
# Num_class = len(set(train_df.label.value_counts()))
Num_class = 10
print(Num_class)

10


In [5]:
# train_df.label_id.value_counts()

In [6]:
BATCH_SIZE = 8
MAX_SEQ_LEN = 512

In [7]:
# device = torch.device('cpu')
device = torch.device('cuda')

In [8]:
tokenizer = RobertaTokenizer.from_pretrained('notebooks/pretrained_without_wiki/', do_lower_case=False)
model = SupConRobertaNet(num_class=Num_class)
# model = AlbertModel.from_pretrained('albert-base-v2')
# print(model)
criterion = SupConLoss(temperature=0.07)
# criterion = torch.nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

10


In [9]:
train_dataset = PetDataset(train_df)
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2
)

test_dataset = PetDataset(test_df)
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2
)

In [10]:
LEARN_RATE = 0.00001
optimizer = Adam(
    model.parameters(), lr=LEARN_RATE, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01
)
scheduler = lr_scheduler.LambdaLR(
    optimizer=optimizer, lr_lambda=lambda epoch: 1 / ((epoch/2) + 1)
)

In [11]:
# # supervised contrastive learning
# # import sys
# model.train()
# epochs = 5
# # for index, parameter in enumerate(model.encoder.parameters()):
# #     if index < 2:
# # #         print(parameter.size())
# #         print(parameter.data[0][0])  

# for epoch in range(1, epochs +1) :
#     losses = AverageMeter()
#     count = 0
#     tloss = 0
#     for texts, labels in train_loader:
# #         print(labels)
#         aug_texts = []
#         for text in texts :
#             aug_text = text_aug(text)
#             aug_texts.append(aug_text)
            
#         encoded_list = [tokenizer.encode(t, add_special_tokens=True, max_length=MAX_SEQ_LEN, truncation=True, padding=True) for t in texts]
#         aug_encoded_list = [tokenizer.encode(t, add_special_tokens=True, max_length=MAX_SEQ_LEN, truncation=True, padding=True) for t in aug_texts]
#         padded_list = [e[:MAX_SEQ_LEN] + [0] * (MAX_SEQ_LEN - len(e[:MAX_SEQ_LEN])) for e in encoded_list]
#         aug_padded_list = [e[:MAX_SEQ_LEN] + [0] * (MAX_SEQ_LEN - len(e[:MAX_SEQ_LEN])) for e in aug_encoded_list]
#         sample = torch.tensor(padded_list)
#         aug_sample = torch.tensor(aug_padded_list)
#         samples = torch.cat([sample, aug_sample], dim=0)     
#         samples, labels = samples.to(device), labels.to(device)
        
#         labels = torch.tensor(labels)
#         batch_size = labels.shape[0]
#         outputs = model(True, samples) # projection layer
# #         print(torch.matmul(outputs, outputs.T))
        
#         z1, z2 = torch.split(outputs, [batch_size, batch_size], dim=0)
#         features = torch.cat([z1.unsqueeze(1), z2.unsqueeze(1)], dim=1)
# #         features = outputs.unsqueeze(1)
        
#         loss = criterion(features, labels)
#         losses.update(loss.item(), batch_size)
# #         print(loss.item())  
#         # print info
#         if (count + 1) % 100 == 0:
#             print('Train: [{0}][{1}/{2}]\t'
#                   'loss {loss.val:.5f} ({loss.avg:.5f})'.format(
#                    epoch, count + 1, len(train_loader), loss=losses))        
#         tloss += loss.item()
#         count += 1

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#     scheduler.step()
        
#     print('***********************************')
#     print(tloss) 

In [None]:
# supervised contrastive learning
# import sys
model.train()
epochs = 5
for epoch in range(1, epochs +1) :
    losses = AverageMeter()
    count = 0
    tloss = 0
    for texts, labels in train_loader:
#         print(labels)
            
        encoded_list = [tokenizer.encode(t, add_special_tokens=True, max_length=MAX_SEQ_LEN, truncation=True, padding=True) for t in texts]
        padded_list = [e[:MAX_SEQ_LEN] + [0] * (MAX_SEQ_LEN - len(e[:MAX_SEQ_LEN])) for e in encoded_list]
        sample = torch.tensor(padded_list)    
        sample, labels = sample.to(device), labels.to(device)
        
        labels = torch.tensor(labels)
        batch_size = labels.shape[0]
        outputs = model(True, sample) # projection layer
#         print(torch.matmul(outputs, outputs.T))
        features = outputs.unsqueeze(1)
        
        loss = criterion(features, labels)
        losses.update(loss.item(), batch_size)
#         print(loss.item())  
        # print info
        if (count + 1) % 100 == 0:
            print('Train: [{0}][{1}/{2}]\t'
                  'loss {loss.val:.5f} ({loss.avg:.5f})'.format(
                   epoch, count + 1, len(train_loader), loss=losses))        
        tloss += loss.item()
        count += 1

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    scheduler.step()
        
    print('***********************************')
    print(tloss) 

In [13]:
def model_eval(test_df) :
    model.eval()

    test_dataset = PetDataset(test_df)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=2)

    total_loss = 0
    total_len = 0
    total_correct = 0

    for text, label in test_loader:
    #   encoded_list = [tokenizer.encode(t, add_special_token=True) for t in text]
      encoded_list = [tokenizer.encode(t, max_length=512, truncation=True) for t in text]
      padded_list = [e[:512] + [0] * (512-len(e[:512])) for e in encoded_list]
      sample = torch.tensor(padded_list)
      sample, label = sample.to(device), label.to(device)
      label = torch.tensor(label)
      outputs = model(False, sample)
      logits = outputs

      pred = torch.argmax(F.softmax(logits), dim=1)
      correct = pred.eq(label)
      total_correct += correct.sum().item()
      total_len += len(label)

    print('Test accuracy: ', total_correct / total_len)  

In [14]:
optimizer = Adam(model.parameters(), lr=LEARN_RATE/5)

scheduler = lr_scheduler.LambdaLR(
    optimizer=optimizer, lr_lambda=lambda epoch: 1 / (int(epoch/3) + 1)
)
epochs = 5
criterion = torch.nn.CrossEntropyLoss()

In [15]:
train_dataset = PetDataset(train_df)
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2
)

In [16]:
epochs = 10
MAX_SEQ_LEN = 512
model.train()

for epoch in range(epochs):
  total_loss = 0
  total_len = 0
  total_correct = 0
  total_count = 0
  model.train()
  for text, label in train_loader:           
    encoded_list = [tokenizer.encode(t, add_special_tokens=True, max_length=MAX_SEQ_LEN, truncation=True, padding=True) for t in texts]
    padded_list = [e[:MAX_SEQ_LEN] + [0] * (MAX_SEQ_LEN - len(e[:MAX_SEQ_LEN])) for e in encoded_list]
    sample = torch.tensor(padded_list)  
    sample, label = sample.to(device), label.to(device)

    label = torch.tensor(label)
    batch_size = label.shape[0]
    outputs = model(False, sample) # classifier layer
#     print(outputs.size())

    loss = criterion(outputs, label)
    losses.update(loss.item(), batch_size)
    pred = torch.argmax(F.softmax(outputs), dim=1)
    correct = pred.eq(label)
#     print(label)
#     print(correct)

    total_correct += correct.sum().item()
    total_len += len(label)
    total_loss += loss.item()
    total_count += 1        

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()  
  
  scheduler.step()
  model_eval(test_df)
#   model_eval(test_df)

  print('[Epoch {}/{}] Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch + 1, epochs, total_loss / total_count, total_correct / total_len))



Test accuracy:  0.15546218487394958
[Epoch 1/10] Train Loss: 2.2875, Accuracy: 0.127
Test accuracy:  0.17647058823529413
[Epoch 2/10] Train Loss: 2.2744, Accuracy: 0.136


KeyboardInterrupt: 