pretrain

In [1]:
import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, cached_path, BertForSequenceClassification
import os
import re
import random
from torch.utils.data import TensorDataset, random_split,RandomSampler, SequentialSampler
from collections import namedtuple
from torch.utils.data import DataLoader
from ignite.engine import Engine, Events
from ignite.metrics import RunningAverage, Accuracy
from ignite.handlers import ModelCheckpoint
from ignite.contrib.handlers import CosineAnnealingScheduler, PiecewiseLinear, create_lr_scheduler_with_warmup, ProgressBar
import pandas as pd
from transformers import get_linear_schedule_with_warmup, AdamW

In [2]:
train=pd.read_csv('./NLP/camp_dataset/sim_question_train.txt',sep="@@@@@",encoding='utf-8',header=None)
test=pd.read_csv('./NLP/camp_dataset/sim_question_test.txt',sep="@@@@@",encoding='utf-8',header=None)

  """Entry point for launching an IPython kernel.
  


In [3]:
def norm(data):
    for i in range(2):
        #data[i]=pd.Series([unicodedata.normalize("NFKD", data[i][j]) for j in range(data.shape[1])])
        data[i]=data[i].str.replace("\xa0","")
        data[i]=data[i].str.replace("$","")
        data[i]=data[i].str.replace(" {2,5}","")
        data[i]=data[i].str.replace("times","*")
        data[i]=data[i].str.replace("div","/")
        data[i]=data[i].str.replace("frac","/")
        data[i]=data[i].str.replace("^"," 次方")
        data[i]=data[i].str.replace("+"," 加")
        data[i]=data[i].str.replace("-"," 减")
        data[i]=data[i].str.replace("\}\{","/")
        data[i]=data[i].str.replace("[A-Za-z]{2,10}","") 
        data[i]=data[i].str.replace("\}","")
        data[i]=data[i].str.replace("\{","")
        #data[i]=data[i].str.replace(" ","")
        data[i]=data[i].str.replace("…"," 等")
        #data[i]=data[i].str.replace("—","")
        #data[i]=data[i].str.replace("“|”","\"")
        data[i]=data[i].str.replace("∵"," 因")
        data[i]=data[i].str.replace("∴"," 故")
        data[i]=data[i].str.replace("馒","")
        data[i]=data[i].str.replace("茼","")
        data[i]=data[i].str.replace("荤","")
        data[i]=data[i].str.slice(0,511)
    return data
train=norm(train)
test=norm(test)

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',do_lower_case=False,do_basic_tokenize=False)

In [5]:
def buildlist(data):
    L=[]
    for i in range(data.shape[0]):
        L=L+[[tokenizer.convert_tokens_to_ids(tokenizer.tokenize(str(data.iloc[i,j]))) for j in range(2)]]
    return L
TRAIN=buildlist(train)
TEST=buildlist(test)

In [6]:
Config = namedtuple('Config',
  field_names="embed_dim, hidden_dim, num_max_positions, num_embeddings      , num_heads, num_layers," 
              "dropout, initializer_range, batch_size, lr, max_norm, n_epochs, n_warmup,"
              "mlm, gradient_accumulation_steps, device, log_dir, dataset_cache")
args = Config( 320     , 900    , 320             , len(tokenizer.vocab), 8       , 12   ,
               0.1    , 0.08             , 32        , 5e-4, 0.8, 50    , 1000    ,
               False, 4, "cuda" if torch.cuda.is_available() else "cpu", "./"   , "./dataset_cache.bin")
AdaptationConfig = namedtuple('AdaptationConfig',
  field_names="num_classes, dropout, initializer_range, batch_size, lr, max_norm, n_epochs,"
              "n_warmup, valid_set_prop, gradient_accumulation_steps, device,"
              "log_dir, dataset_cache, decreasing_factor")
adapt_args = AdaptationConfig(
               2          , 0.1    , 0.08             , 32        , 6.5e-4, 0.5   , 50,
               10      , 0.1           , 1, "cuda" if torch.cuda.is_available() else "cpu",
               "./"   , "./dataset_cache.bin", 2.6)

In [7]:
datasets={'train':TRAIN,'valid':TEST}

In [8]:
class Transformer(nn.Module):
    def __init__(self, embed_dim, hidden_dim, num_embeddings, num_max_positions, num_heads, num_layers, dropout, causal):
        super().__init__()
        self.causal = causal
        self.tokens_embeddings = nn.Embedding(num_embeddings, embed_dim)
        self.position_embeddings = nn.Embedding(num_max_positions, embed_dim)
        self.dropout = nn.Dropout(dropout)

        self.attentions, self.feed_forwards = nn.ModuleList(), nn.ModuleList()
        self.layer_norms_1, self.layer_norms_2 = nn.ModuleList(), nn.ModuleList()
        for _ in range(num_layers):
            self.attentions.append(nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout))
            self.feed_forwards.append(nn.Sequential(nn.Linear(embed_dim, hidden_dim),
                                                    nn.GELU(),
                                                    nn.Linear(hidden_dim, embed_dim),
                                                     v ))
            self.layer_norms_1.append(nn.LayerNorm(embed_dim, eps=1e-8))
            self.layer_norms_2.append(nn.LayerNorm(embed_dim, eps=1e-8))

    def forward(self, x, padding_mask=None):
        """ x has shape [seq length, batch], padding_mask has shape [batch, seq length] """
        positions = torch.arange(len(x), device=x.device).unsqueeze(-1)
        h = self.tokens_embeddings(x)
        h = h + self.position_embeddings(positions).expand_as(h)
        h = self.dropout(h)

        attn_mask = None
        if self.causal:
            attn_mask = torch.full((len(x), len(x)), -float('Inf'), device=h.device, dtype=h.dtype)
            attn_mask = torch.triu(attn_mask, diagonal=1)

        for layer_norm_1, attention, layer_norm_2, feed_forward in zip(self.layer_norms_1, self.attentions,
                                                                       self.layer_norms_2, self.feed_forwards):
            h = layer_norm_1(h)
            x, _ = attention(h, h, h, attn_mask=attn_mask, need_weights=False, key_padding_mask=padding_mask)
            x = self.dropout(x)
            h = x + h

            h = layer_norm_2(h)
            x = feed_forward(h)
            x = self.dropout(x)
            h = x + h
            #print(h)
        return h

In [9]:
class TransformerWithClfHead(nn.Module):
    def __init__(self, config, fine_tuning_config):
        """ Transformer with a language modeling head on top (tied weights) """
        super().__init__()
        self.config = config
        self.transformer = Transformer(config.embed_dim, config.hidden_dim, config.num_embeddings,
                                       config.num_max_positions, config.num_heads, config.num_layers,
                                       config.dropout, causal=not config.mlm)

        self.classification_head = nn.Linear(config.embed_dim, fine_tuning_config.num_classes)
        self.apply(self.init_weights)

    def init_weights(self, module):
        """ initialize weights - nn.MultiheadAttention is already initalized by PyTorch (xavier) """
        if isinstance(module, (nn.Linear, nn.Embedding, nn.LayerNorm)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if isinstance(module, (nn.Linear, nn.LayerNorm)) and module.bias is not None:
            module.bias.data.zero_()

    def forward(self, x, clf_tokens_mask, clf_labels=None, padding_mask=None):
        """ x has shape [seq length, batch], padding_mask has shape [batch, seq length] """
        hidden_states = self.transformer(x, padding_mask)
        clf_tokens_states = (hidden_states * clf_tokens_mask.unsqueeze(-1).float()).sum(dim=0)
        clf_logits = self.classification_head(clf_tokens_states)

        if clf_labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
            loss = loss_fct(clf_logits.view(-1, clf_logits.size(-1)), clf_labels.view(-1))
            return clf_logits, loss
        return clf_logits

In [10]:
for split_name in ['train', 'valid']:   
    datasets[split_name] = [[x[0][max(len(x[0])//2-args.num_max_positions*len(x[0])//(len(x[0])+len(x[1]))//2,0):len(x[0])//2+args.num_max_positions*len(x[0])//2//(len(x[0])+len(x[1]))]] + 
                            [x[1][max(len(x[1])//2-args.num_max_positions*len(x[1])//(len(x[0])+len(x[1]))//2,0):len(x[1])//2+args.num_max_positions*len(x[1])//2//(len(x[0])+len(x[1]))]] 
                            for x in datasets[split_name]]   
    datasets[split_name] = [x[0] + [tokenizer.vocab['[PAD]']] * (args.num_max_positions - len(x[0])-len(x[1]))+ x[1]  
                            for x in datasets[split_name]]

In [11]:
datasets2={'train':0,'test':1}

In [12]:
#datasets2['train'] = [x[:args.num_max_positions-1]+[tokenizer.vocab['[CLS]']] for x in datasets['train']]
#datasets2['test'] = [x[:args.num_max_positions-1]+[tokenizer.vocab['[CLS]']] for x in datasets['valid']]  
datasets2['train'] = [[tokenizer.vocab['[CLS]']]+x[:args.num_max_positions-1] for x in datasets['train']]
datasets2['test'] = [[tokenizer.vocab['[CLS]']]+x[:args.num_max_positions-1] for x in datasets['valid']] 
tensor = torch.tensor(datasets2['train'], dtype=torch.long)
labels = torch.tensor(train[2], dtype=torch.long) 
datasets2['train'] = TensorDataset(tensor, labels)
datasets2['test'] = TensorDataset(torch.tensor(datasets2['test'], dtype=torch.long))

In [13]:
valid_size = int(adapt_args.valid_set_prop * len(datasets['train']))
train_size = len(datasets2['train']) - valid_size
valid_dataset, train_dataset = random_split(datasets2['train'], [valid_size, train_size])

train_loader = DataLoader(train_dataset, batch_size=adapt_args.batch_size, shuffle=True,drop_last=True)
valid_loader = DataLoader(valid_dataset, batch_size=adapt_args.batch_size, shuffle=False,drop_last=True)
test_loader = DataLoader(datasets2['test'], batch_size=adapt_args.batch_size, shuffle=False)

In [14]:
adaptation_model = TransformerWithClfHead(args,fine_tuning_config=adapt_args).to(adapt_args.device)

In [15]:
optimizer = torch.optim.Adam(adaptation_model.parameters(), lr=args.lr)

def update(engine, batch):
    adaptation_model.train()
    batch, labels = (t.to(adapt_args.device) for t in batch)
    inputs = batch.transpose(0, 1).contiguous()  # to shape [seq length, batch]
    _, loss = adaptation_model(inputs, clf_tokens_mask=(inputs == tokenizer.vocab['[CLS]']), clf_labels=labels,
                               padding_mask=(batch == tokenizer.vocab['[PAD]']))
    loss = loss / adapt_args.gradient_accumulation_steps
    loss.backward()
    torch.nn.utils.clip_grad_norm_(adaptation_model.parameters(), adapt_args.max_norm)
    if engine.state.iteration % adapt_args.gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
    return loss.item()
trainer = Engine(update)

# Evaluation function and evaluator (evaluator output is the input of the metrics)
def inference(engine, batch):
    adaptation_model.eval()
    with torch.no_grad():
        batch, labels = (t.to(adapt_args.device) for t in batch)
        inputs = batch.transpose(0, 1).contiguous()  # to shape [seq length, batch]
        clf_logits = adaptation_model(inputs, clf_tokens_mask=(inputs == tokenizer.vocab['[CLS]']),
                                      padding_mask=(batch == tokenizer.vocab['[PAD]']))
    return clf_logits, labels
evaluator = Engine(inference)

# Attache metric to evaluator & evaluation to trainer: evaluate on valid set after each epoch
Accuracy().attach(evaluator, "accuracy")
@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(engine):
    evaluator.run(valid_loader)
    print(f"Validation Epoch: {engine.state.epoch} Error rate: {100*(1 - evaluator.state.metrics['accuracy'])}")

# Learning rate schedule: linearly warm-up to lr and then to zero
scheduler = PiecewiseLinear(optimizer, 'lr', [(0, 9e-8), (adapt_args.n_warmup, adapt_args.lr),
                                              (len(train_loader)*adapt_args.n_epochs, 9e-8)])
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

# Add progressbar with loss
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
ProgressBar(persist=True).attach(trainer, metric_names=['loss'])

# Save checkpoints and finetuning config
checkpoint_handler = ModelCheckpoint(adapt_args.log_dir, 'finetuning_checkpoint', save_interval=1, require_empty=False)
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': adaptation_model})
torch.save(args, os.path.join(adapt_args.log_dir, 'fine_tuning_args.bin'))



In [16]:
?PiecewiseLinear

In [17]:
trainer.run(train_loader, max_epochs=args.n_epochs)

HBox(children=(IntProgress(value=0, max=506), HTML(value='')))

Validation Epoch: 1 Error rate: 50.16741071428572



Engine run is terminating due to exception: [WinError 183] Cannot create a file when that file already exists: 'C:\\Users\\lycan\\Documents\\cu\\hackathon\\tmps8f_wide' -> './finetuning_checkpoint_mymodel_506.pth'.


FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'C:\\Users\\lycan\\Documents\\cu\\hackathon\\tmps8f_wide' -> './finetuning_checkpoint_mymodel_506.pth'

In [None]:
torch.cuda.empty_cache()