In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import multiprocessing
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModel
from tqdm.notebook import tqdm
from common import Common, Timer
from sklearn.preprocessing import LabelEncoder
import random
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from apex import amp

In [2]:
PATH = 'C:/Users/bill/Documents/projects/data/chatbot'
MAX_WORKERS = multiprocessing.cpu_count() - 1
B = 8
E = 10
T = 500
Y = 199

In [3]:
data = pd.read_csv(os.path.join(PATH, 'samples'), sep='\t', names=[
    'rpt_mnth', 'cnv_id', 'msg_id', 'agent', 'msg_type', 'channel', 
    'creat_ts', 'utterance', 'intent', 'score', 'resp_cd'
])

train_df = pd.read_csv(os.path.join(PATH, 'train.tsv'), sep='\t')[['utterance', 'intent']].dropna().reset_index(drop=True)
valid_df = pd.read_csv(os.path.join(PATH, 'valid.tsv'), sep='\t')[['utterance', 'intent']].dropna().reset_index(drop=True)

intents = LabelEncoder()
intents.fit(pd.concat([ train_df['intent'], valid_df['intent'] ]))

data = data[
    data['agent'].isin(['coremobile']) & \
    data['intent'].isin(intents.classes_) & \
    ~data['utterance'].duplicated(keep=False) & \
    ~data['intent'].isin(['\\N']) 
][['utterance', 'intent']] \
.dropna() \
.drop_duplicates() \
.reset_index(drop=True)

data['intent'] = intents.transform(data['intent'])

skf = StratifiedKFold(n_splits=6)
X, y = data['utterance'], data['intent']
trains = []
tests = []
for train_index, test_index in skf.split(X, y):
    trains.append(Common.generator(
        X[train_index].reset_index(drop=True), 
        y[train_index].reset_index(drop=True), 
    B))
    tests.append(Common.generator(
        X[test_index].reset_index(drop=True), 
        y[test_index].reset_index(drop=True), 
    B))   
    print(X[test_index].shape[0] / X[train_index].shape[0])
    break

0.20000499637762623




In [4]:
class Model(nn.Module):
    
    #model = 'distilbert-base-cased'
    def __init__(self, model, T, Y, device, L=0):
        super(Model, self).__init__()
        self.T = T
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model, do_lower_case=True)
        self.model = AutoModel.from_pretrained(model)
        # remove layers
        for _ in range(L):
            self.model.transformer.layer.__delitem__(-1)
        self.linear = nn.Linear(768, Y)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, inputs):
        tokens = self.tokenizer.batch_encode_plus(
            inputs, 
            add_special_tokens=True, 
            return_tensors='pt', 
            pad_to_max_length=True,
            #max_length=self.T,
            return_attention_masks=False
        )['input_ids'].to(self.device)
        #with torch.no_grad():
        output = self.model(tokens)[0][:,0,:]
        #output = torch.flatten(output, start_dim=1)
        output = self.dropout(output)
        output = self.linear(output)
        return output
    
    
def scoring(device, teacher, student, criterion, iterator):
    with torch.no_grad():
        total_loss = []
        total_accy = []
        for x, y in iterator:
            tlogics = teacher(x)
            slogics = student(x)
            loss = criterion(slogics, tlogics, y.to(device).long())
            total_loss.append(loss.item())
            total_accy.append(Common.accuracy(slogics, y, device))
    return np.mean(total_loss), np.mean(total_accy)


class CELoss(nn.Module):
    
    def __init__(self, ST, TT, alpha):
        super().__init__()
        self.ST = ST
        self.TT = TT
        self.alpha = alpha

    def forward(self, slogits, tlogits, targets):
        B, I = slogits.shape
        y = torch.zeros(slogits.shape).to(device)
        for i in range(y.shape[0]):
            y[i][targets[i]] = 1
        tprobs = F.softmax(tlogits / self.TT, dim=1)
        l_sprobs = F.log_softmax(slogits / self.ST, dim=1)
        loss = self.alpha * torch.bmm(l_sprobs.view(B, 1, I), y.view(B, I, 1)).reshape(-1)
        loss += (1 - self.alpha) * torch.bmm(tprobs.view(B, 1, I), l_sprobs.view(B, I, 1)).reshape(-1)
        return -loss.mean()
    
    
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(0)

In [5]:
device = Common.device()
#device = torch.device('cpu')
cp = torch.load('./models/teacher.tar', map_location=device)
teacher = Model('distilbert-base-uncased', T, Y, device, L=0).to(device)
teacher.load_state_dict(cp['state_dict'])
teacher.eval()

tlogits = []
for x, y, i in tqdm(trains[0]):
    
    break

HBox(children=(FloatProgress(value=0.0, max=20015.0), HTML(value='')))

('So I am active duty military and still being charged the fee for the card what do you need from me to waive that?', 'Once it is cleared can you go ahead and do the refund so I do not have to come back and do this all over again?', 'Ok im ready', 'I have. 1 pending charge from amazon that I did not make.', 'That’s unacceptable, I’ve never had a late payment, always charge big pay big. Your not going to hose me.', 'I’m seeing an Delta Amex account ending in 1013 that appeared on my Mint app', 'Please take the late fee off on my account', 'I tried to charge membership toShipt today and got an error msg.  What is wrong?') tensor([106, 118,   8,  63,  12,  82, 116,  63], dtype=torch.int32) tensor([ 96723,  87530,  79529, 159083, 110912,  96567,  34107, 126312])



In [6]:
device = Common.device()
student = Model('distilbert-base-uncased', T, Y, device, L=3).to(device)
optimizer = AdamW(student.parameters(), lr = 2e-5, eps = 1e-8, weight_decay = 0.01)
criterion = CELoss(ST=1, TT=2, alpha=0.5)

student, optimizer = amp.initialize(student, optimizer, opt_level='O2')

seed_val = 0

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

best_loss = 999
best_model = None
best_epoch = 0

Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic


In [7]:
for epoch in range(E):
    t = Timer()
    total_loss = []
    total_accy = []
    for x, y, i in tqdm(trains[0]):

        teacher.zero_grad()
        student.zero_grad()

        tlogics = teacher(x)
        slogics = student(x)

        loss = criterion(slogics, tlogics, y.to(device).long())
        total_loss.append(loss.item())
        total_accy.append(Common.accuracy(slogics, y, device))

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        
        #loss.backward()
        nn.utils.clip_grad_norm_(student.parameters(), 1.0)

        optimizer.step()

    train_loss, train_accy = np.mean(total_loss), np.mean(total_accy)
    valid_loss, valid_accy = scoring(device, model, criterion, tests[0])

    if valid_loss < best_loss:
        Common.save_checkpoint({
            'loss': valid_loss,
            'accuracy': valid_accy,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, './models/student.tar')
        best_model = model
        best_epoch = epoch
        best_loss = valid_loss

    print(', '.join([
        'epoch: {}'.format(epoch),
        'train_loss: {:3.2}'.format(train_loss),
        'valid_loss: {:3.2}'.format(valid_loss),
        'train_accy: {:3.2%}'.format(train_accy),
        'valid_accy: {:3.2%}'.format(valid_accy),
        'time: {}'.format(t.get())
    ]))

HBox(children=(FloatProgress(value=0.0, max=20015.0), HTML(value='')))

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0



RuntimeError: CUDA out of memory. Tried to allocate 228.00 MiB (GPU 0; 8.00 GiB total capacity; 5.74 GiB already allocated; 208.04 MiB free; 6.01 GiB reserved in total by PyTorch)