In [1]:
# !pip install transformers
# !pip install torchvision

In [None]:
import numpy as np
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel
from torch.optim import Adam, lr_scheduler
import torch.nn.functional as F
import sys
sys.path.append('../')

import os
import argparse
import json
import torch.nn as nn

from util import *
from losses import LabelSmoothingCrossEntropy, SupConLoss
from augment import *

from torch.utils.data.dataset import ConcatDataset
# from torch_model import SupConRobertaNet, SupConMultiRobertaNet
from torch.utils.data.sampler import RandomSampler
from torch_model import TransferRobertaNet
from feed import PetDataset
from torchsampler import ImbalancedDatasetSampler
from losses import FocalLoss

In [2]:
# Shared hyper-parameters: mini-batch size and the token-sequence length cap.
BATCH_SIZE = 12
MAX_SEQ_LEN = 512

# CSV splits used by the first training stage below.
train_df = pd.read_csv('files/unlabel_train.csv')
test_df = pd.read_csv('files/unlabel_test.csv')

# Number of distinct (non-null) label ids present in the training split.
Num_Label = train_df['label_id'].nunique()

# One value per line: class count, then the (rows, columns) shape of each split.
print(Num_Label, train_df.shape, test_df.shape, sep='\n')

150
(37344, 5)
(6591, 5)


In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without CUDA instead of failing at model.to(device).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Local directory the tokenizer and the network are both initialised from.
pretrained_path = './pretrained_without_wiki/'
tokenizer = RobertaTokenizer.from_pretrained(pretrained_path, do_lower_case=False)

# num_class is sized from the stage-1 label set (Num_Label).
# NOTE(review): num_class1=10 presumably sizes the head used when
# istransfer=False (the later 10-label stage) — confirm against
# torch_model.TransferRobertaNet.
model = TransferRobertaNet(path=pretrained_path,
                              embedding_dim=768,
                              num_class=Num_Label,
                              num_class1=10)
model.to(device)

In [4]:
# Wrap the training frame in the project dataset and serve it in shuffled
# mini-batches using two worker processes.
train_dataset = PetDataset(train_df)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)
# Disabled alternative, kept for reference: class-balanced sampling.
# indices = list(range(len(train_dataset)))
# num_samples = len(indices)
# train_loader = torch.utils.data.DataLoader(
#     train_dataset,
#     sampler=ImbalancedDatasetSampler(train_df, indices=indices, num_samples=num_samples),
#     batch_size=12,
#     shuffle=False,
#     num_workers=2
# )

In [5]:
def _transfer_lr_factor(epoch):
    """Multiplier applied to the base learning rate: 1 / (epoch / 4 + 1)."""
    return 1 / ((epoch / 4) + 1)


# Adam over every model parameter; the scheduler rescales the base rate each
# time scheduler.step() is called.
optimizer = Adam(params=model.parameters(), lr=8e-5)
scheduler = lr_scheduler.LambdaLR(optimizer, _transfer_lr_factor)

In [6]:
def model_eval(test_df, model, istransfer=True, batch_size=8, max_seq_len=512):
    """Compute and print the classification accuracy of `model` on `test_df`.

    Relies on the notebook-level `tokenizer` and `device` globals.

    Args:
        test_df: frame wrapped in `PetDataset`; its loader is iterated as
            (text, label) batches.
        model: network invoked as `model(sample=..., istransfer=...)`; its
            output is reduced with argmax over dim 1 to obtain predictions.
        istransfer: forwarded unchanged to the model call.
        batch_size: evaluation mini-batch size.
        max_seq_len: token sequences are truncated / right-padded to this length.

    Returns:
        Fraction of rows whose predicted label equals the true label
        (0.0 when the loader yields no rows).

    Note:
        The model is left in eval mode; callers that keep training must call
        `model.train()` again.
    """
    model.eval()

    test_dataset = PetDataset(test_df)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    total_len = 0
    total_correct = 0

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for text, label in test_loader:
            encoded_list = [tokenizer.encode(t, max_length=max_seq_len, truncation=True) for t in text]
            # Right-pad every sequence with id 0 up to max_seq_len.
            # NOTE(review): RoBERTa vocabularies normally use id 1 for <pad>
            # (0 is <s>) — confirm 0 matches how this checkpoint was trained.
            padded_list = [
                e[:max_seq_len] + [0] * (max_seq_len - len(e[:max_seq_len]))
                for e in encoded_list
            ]
            sample = torch.tensor(padded_list).to(device)
            # `label` is already a tensor; moving it is enough (re-wrapping it
            # with torch.tensor() copies and emits a UserWarning).
            labels = label.to(device)
            outputs = model(sample=sample, istransfer=istransfer)

            # softmax is monotonic, so argmax over the raw outputs yields the
            # same prediction without the deprecated implicit-dim softmax call.
            pred = torch.argmax(outputs, dim=1)
            total_correct += pred.eq(labels).sum().item()
            total_len += len(labels)

    # Guard against an empty evaluation set instead of raising ZeroDivisionError.
    accuracy = total_correct / total_len if total_len else 0.0
    print('Test accuracy: ', accuracy)
    return accuracy

In [7]:
# Stage 1: train with istransfer=True under focal loss. After every epoch the
# model is scored with model_eval() and the best-scoring weights are written
# to 'transfer'.
criterion = FocalLoss(alpha=0.97, reduce=True)
criterion = criterion.to(device)

epochs = 15
high_acc = 0
for epoch in range(epochs):
    total_loss = 0
    total_len = 0
    total_correct = 0
    total_count = 0
    # model_eval() leaves the model in eval mode, so re-enable training mode
    # at the start of every epoch.
    model.train()
    for text, label in train_loader:
        encoded_list = [
            tokenizer.encode(t, add_special_tokens=True, max_length=MAX_SEQ_LEN, truncation=True)
            for t in text
        ]
        # Right-pad every sequence with id 0 up to MAX_SEQ_LEN.
        # NOTE(review): RoBERTa vocabularies normally use id 1 for <pad>
        # (0 is <s>) — confirm 0 is what this checkpoint expects.
        padded_list = [
            e[:MAX_SEQ_LEN] + [0] * (MAX_SEQ_LEN - len(e[:MAX_SEQ_LEN]))
            for e in encoded_list
        ]
        sample = torch.tensor(padded_list).to(device)
        # `label` is already a tensor; re-wrapping it with torch.tensor()
        # only copies it and triggers a UserWarning.
        labels = label.to(device)
        outputs = model(sample=sample, istransfer=True)

        loss = criterion(outputs, labels)

        # Accuracy bookkeeping only: detach from the graph and take argmax of
        # the raw outputs (softmax is monotonic, so the prediction is the same).
        pred = torch.argmax(outputs.detach(), dim=1)
        total_correct += pred.eq(labels).sum().item()
        total_len += len(labels)
        total_loss += loss.item()
        total_count += 1
        if total_count % 1000 == 0:
            # Running mean of the per-batch loss for this epoch.
            print('loss ', total_loss/total_count)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    scheduler.step()
    accr = model_eval(test_df, model)
    if accr > high_acc:
        # Keep only the checkpoint with the best test accuracy so far.
        high_acc = accr
        torch.save(model.state_dict(), 'transfer')
        print('model is saved')

    print('[Epoch {}/{}] Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch + 1, epochs, total_loss / total_count, total_correct / total_len))



KeyboardInterrupt: 

In [8]:
# Restore the best stage-1 weights. map_location remaps the stored tensors
# onto the current device, so a checkpoint written from a GPU session still
# loads when CUDA is not available.
model.load_state_dict(torch.load('transfer', map_location=device))

<All keys matched successfully>

In [9]:
# Second-stage CSV splits; these rebind train_df / test_df, replacing the
# frames loaded for the first stage.
train_df = pd.read_csv('files/train2.csv')
test_df = pd.read_csv('files/test2.csv')

# Number of distinct (non-null) label ids in the new training split.
Label_num = train_df['label_id'].nunique()

# One value per line: class count, then the (rows, columns) shape of each split.
print(Label_num, train_df.shape, test_df.shape, sep='\n')

10
(4280, 5)
(476, 5)


In [13]:
# Rows per label id, most frequent first, to inspect class balance.
train_df['label_id'].value_counts()

5    589
4    555
7    522
2    503
8    433
3    424
6    339
1    337
0    301
9    277
Name: label_id, dtype: int64

In [10]:
# Rebuild the dataset / loader around the second-stage training frame.
# Uses the shared BATCH_SIZE constant rather than repeating the literal 12,
# so both training stages stay in sync when the batch size is tuned.
train_dataset = PetDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

In [11]:
def _finetune_lr_factor(epoch):
    """Multiplier applied to the base learning rate: 1 / (epoch / 4 + 1)."""
    return 1 / ((epoch / 4) + 1)


# Fresh Adam optimiser and scheduler for the second stage, with a lower base
# learning rate than the first stage.
optimizer = Adam(params=model.parameters(), lr=2e-5)
scheduler = lr_scheduler.LambdaLR(optimizer, _finetune_lr_factor)

In [12]:
# Stage 2: train with istransfer=False under plain cross-entropy. After every
# epoch the model is scored with model_eval() and the best-scoring weights
# are written to 'tune1'.
# Disabled alternative: criterion = FocalLoss(alpha=0.97,gamma=1, reduce=True)
criterion = torch.nn.CrossEntropyLoss()
criterion = criterion.to(device)

epochs = 10
high_acc = 0
for epoch in range(epochs):
    total_loss = 0
    total_len = 0
    total_correct = 0
    total_count = 0
    # model_eval() leaves the model in eval mode, so re-enable training mode
    # at the start of every epoch.
    model.train()
    for text, label in train_loader:
        encoded_list = [
            tokenizer.encode(t, add_special_tokens=True, max_length=MAX_SEQ_LEN, truncation=True)
            for t in text
        ]
        # Right-pad every sequence with id 0 up to MAX_SEQ_LEN.
        # NOTE(review): RoBERTa vocabularies normally use id 1 for <pad>
        # (0 is <s>) — confirm 0 is what this checkpoint expects.
        padded_list = [
            e[:MAX_SEQ_LEN] + [0] * (MAX_SEQ_LEN - len(e[:MAX_SEQ_LEN]))
            for e in encoded_list
        ]
        sample = torch.tensor(padded_list).to(device)
        # `label` is already a tensor; re-wrapping it with torch.tensor()
        # only copies it and triggers a UserWarning.
        labels = label.to(device)
        outputs = model(sample=sample, istransfer=False)

        loss = criterion(outputs, labels)

        # Accuracy bookkeeping only: detach from the graph and take argmax of
        # the raw outputs (softmax is monotonic, so the prediction is the same).
        pred = torch.argmax(outputs.detach(), dim=1)
        total_correct += pred.eq(labels).sum().item()
        total_len += len(labels)
        total_loss += loss.item()
        total_count += 1
        if total_count % 200 == 0:
            # Running mean of the per-batch loss for this epoch.
            print('loss ', total_loss/total_count)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    scheduler.step()
    accr = model_eval(test_df, model, istransfer=False)
    if accr > high_acc:
        # Keep only the checkpoint with the best test accuracy so far.
        high_acc = accr
        torch.save(model.state_dict(), 'tune1')

    print('[Epoch {}/{}] Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch + 1, epochs, total_loss / total_count, total_correct / total_len))



loss  1.3715157258510589




Test accuracy:  0.6722689075630253
[Epoch 1/10] Train Loss: 1.1867, Accuracy: 0.573
loss  0.7390142171084881
Test accuracy:  0.7016806722689075
[Epoch 2/10] Train Loss: 0.7118, Accuracy: 0.708


KeyboardInterrupt: 