# _This code runs in the notebook itself_

# Fine-Tuning a RoBERTa Model

In [None]:
!pip install easydict==1.9

In [None]:
!pip list

In [None]:
import os
import easydict

In [None]:
# Emulate the environment variables a SageMaker training container provides so
# the training code below can run locally inside this notebook.
# SM_HOSTS must be a JSON-encoded list because parse_args() decodes it with
# json.loads(); a bare 'localhost' is not valid JSON and would raise there.
os.environ['SM_HOSTS'] = '["localhost"]'
os.environ['SM_CURRENT_HOST'] = 'localhost'
os.environ['SM_MODEL_DIR'] = './model'
os.environ['SM_CHANNEL_TRAIN'] = './data/sentiment-train'
os.environ['SM_CHANNEL_VALIDATION'] = './data/sentiment-validation'
os.environ['SM_CHANNEL_TEST'] = './data/sentiment-test'
os.environ['SM_OUTPUT_DIR'] = './output'
# Environment values are always strings; parse_args() converts this with int.
os.environ['SM_NUM_GPUS'] = '0'

In [None]:
# Hand-built stand-in for the namespace parse_args() would return, so the
# notebook can run the training code without command-line arguments.
# Values here are small (batch size 8, one epoch) for a quick local run.
args=easydict.EasyDict({
    # --- hyperparameters ---
    'train_batch_size': 8,
    'train_steps_per_epoch': 1,
    'validation_batch_size': 8,
    'test_batch_size': 8,
    'epochs': 1,
    'lr': 2e-5,
    'momentum': 0.5,
    'seed': 42,
    'log_interval': 10,
    'backend': 'gloo',
    'max_seq_len': 64,
    'model_name': 'roberta-base',
    # --- feature switches ---
    'enable_sagemaker_debugger': True,
    'run_validation' : False,
    'run_test': False,
    'run_sample_predictions': False,
    # --- container environment (same values as the SM_* variables above) ---
    # NOTE(review): parse_args() produces a list for `hosts`; here it is a
    # plain string — confirm before enabling distributed training.
    'hosts': 'localhost',
    'current_host': 'localhost', 
    'model_dir': './model',
    'train_data': './data/sentiment-train',
    'validation_data': './data/sentiment-validation',
    'test_data': './data/sentiment-test',
    'output_dir': './output',
    'num_gpus': 0,
    # --- debugger options ---
    # NOTE(review): argparse exposes `--save-frequency` / `--hook-type` as
    # `save_frequency` / `hook_type`; the hyphenated keys below can only be
    # read with item access (args['save-frequency']), not as attributes.
    'save-frequency': 10,
    'smdebug_path': '/opt/ml/output/tensors',
    'hook-type': 'saveall'
})

In [None]:
import argparse
import json
import logging
import glob
import os
import sys
import random

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torch.utils.data.distributed

import pandas as pd
import numpy as np
from collections import defaultdict

from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer, RobertaConfig
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import RobertaForSequenceClassification

import smdebug.pytorch as smd
from smdebug.pytorch import Hook, SaveConfig
from smdebug import modes

# Module-level logger that echoes DEBUG and above to stdout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# logging.getLogger() hands back the same logger object every time this code
# is executed (e.g. when a notebook cell is re-run), so attach the stdout
# handler only once; otherwise every record is printed once per execution.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler(sys.stdout))

# Has to be called 'model.pth'
MODEL_NAME = 'model.pth'
PRE_TRAINED_MODEL_NAME = 'roberta-base'

DATA_COLUMN = 'review_body'
LABEL_COLUMN = 'sentiment'
LABEL_VALUES = [-1, 0, 1]
CLASS_NAMES = ['negative', 'neutral', 'positive']

# Maps each raw label value to its zero-based class index: {-1: 0, 0: 1, 1: 2}.
LABEL_MAP = {label: index for index, label in enumerate(LABEL_VALUES)}
    
    
def _parse_hosts(value):
    """Turn a host specification string into a list of host names.

    Accepts the JSON list format used by the SM_HOSTS environment variable
    (e.g. '["algo-1", "algo-2"]') and, as a fallback, a plain or
    comma-separated string (e.g. 'localhost' or 'algo-1,algo-2').
    """
    try:
        parsed = json.loads(value)
    except ValueError:
        parsed = None
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, str):
        # A JSON string such as '"localhost"': use its decoded content.
        value = parsed
    return [host.strip() for host in value.split(',') if host.strip()]


def parse_args():
    """Build the training argument parser and parse ``sys.argv[1:]``.

    Hyperparameters come from the command line; the container settings
    default to the SM_* environment variables, which must all be set
    (a missing one raises KeyError while the parser is being built).

    Returns:
        argparse.Namespace with one attribute per option. Options spelled
        with hyphens (``--save-frequency``, ``--hook-type``) are exposed
        with underscores (``save_frequency``, ``hook_type``).
    """
    parser = argparse.ArgumentParser()

    ###### CLI args
    parser.add_argument('--train_batch_size',
                        type=int,
                        default=128, metavar='N',
                        help='input batch size for training (default: 128)')

    parser.add_argument('--validation_batch_size',
                        type=int,
                        default=128, metavar='N',
                        help='input batch size for validation (default: 128)')

    parser.add_argument('--test_batch_size',
                        type=int,
                        default=128, metavar='N',
                        help='input batch size for testing (default: 128)')

    parser.add_argument('--epochs',
                        type=int,
                        default=10, metavar='N',
                        help='number of epochs to train (default: 10)')

    parser.add_argument('--lr',
                        type=float,
                        default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')

    parser.add_argument('--momentum',
                        type=float,
                        default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')

    parser.add_argument('--seed',
                        type=int,
                        default=42, metavar='S',
                        help='random seed (default: 42)')

    parser.add_argument('--log_interval',
                        type=int,
                        default=100, metavar='N',
                        help='how many batches to wait before logging training status')

    parser.add_argument('--backend',
                        type=str,
                        default=None,
                        help='backend for distributed training (tcp, gloo on cpu and gloo, nccl on gpu)')

    parser.add_argument('--max_seq_len',
                        type=int,
                        default=64,
                        help='max sequence length of input tokens')

    parser.add_argument("--model_name",
                        type=str,
                        default=MODEL_NAME,
                        help='Model name')

    # NOTE(security): type=eval executes the flag's text as a Python
    # expression (so "True"/"False" become booleans). Only safe while the
    # command line comes from a trusted source; switch to an explicit boolean
    # parser if that ever stops being true.
    parser.add_argument('--enable_sagemaker_debugger',
                        type=eval,
                        default=False)

    parser.add_argument('--run_validation',
                        type=eval,
                        default=False)

    parser.add_argument('--run_test',
                        type=eval,
                        default=False)

    parser.add_argument('--run_sample_predictions',
                        type=eval,
                        default=False)

    parser.add_argument('--train_steps_per_epoch',
                        type=int,
                        default=None)

    ###### Container environment
    # type=list would split a command-line value into single characters, so
    # hosts are parsed explicitly (JSON list or comma-separated names).
    parser.add_argument('--hosts',
                        type=_parse_hosts,
                        default=_parse_hosts(os.environ['SM_HOSTS']))

    parser.add_argument('--current_host',
                        type=str,
                        default=os.environ['SM_CURRENT_HOST'])

    parser.add_argument('--model_dir',
                        type=str,
                        default=os.environ['SM_MODEL_DIR'])

    parser.add_argument('--train_data',
                        type=str,
                        default=os.environ['SM_CHANNEL_TRAIN'])

    parser.add_argument('--validation_data',
                        type=str,
                        default=os.environ['SM_CHANNEL_VALIDATION'])

    parser.add_argument('--test_data',
                        type=str,
                        default=os.environ['SM_CHANNEL_TEST'])

    parser.add_argument('--output_dir',
                        type=str,
                        default=os.environ['SM_OUTPUT_DIR'])

    # The environment value is a string; argparse runs string defaults
    # through `type`, so args.num_gpus is an int either way.
    parser.add_argument('--num_gpus',
                        type=int,
                        default=os.environ['SM_NUM_GPUS'])

    # Debugger Args
    parser.add_argument("--save-frequency",
                        type=int,
                        default=10,
                        help="frequency with which to save steps")

    parser.add_argument("--smdebug_path",
                        type=str,
                        help="output directory to save data in",
                        default="/opt/ml/output/tensors",)

    parser.add_argument("--hook-type",
                        type=str,
                        choices=["saveall", "module-input-output", "weights-bias-gradients"],
                        default="saveall",)

    print(sys.argv)
    return parser.parse_args(sys.argv[1:])



class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        reviews: indexable collection of review texts.
        targets: indexable collection of class indices, aligned with
            ``reviews``.
        tokenizer: object providing ``encode_plus`` and ``tokenize``
            (the script passes a RobertaTokenizer).
        max_seq_len: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, reviews, targets, tokenizer, max_seq_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Encode the review at position ``item``.

        Returns:
            A 3-tuple ``(input_ids, target, tokens)``:
            the flattened ``input_ids`` tensor from ``encode_plus``,
            the target as a ``torch.long`` scalar tensor, and the token
            strings as a list of exactly ``max_seq_len`` entries
            (truncated, then padded with '').
        """
        review = str(self.reviews[item])
        target = self.targets[item]

        # NOTE(review): an attention mask is requested here but only
        # input_ids is returned, so downstream code never sees the mask.
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_seq_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        # Human-readable tokens, forced to a fixed length so that the default
        # DataLoader collation can batch them.
        tokens = self.tokenizer.tokenize(review)[:self.max_seq_len]
        tokens += [''] * (self.max_seq_len - len(tokens))

        # The per-item debug dump (a dozen lines plus one pair per token,
        # printed for every sample fetched) was removed: it flooded the output
        # and slowed data loading without affecting the returned values.
        return encoding['input_ids'].flatten(), torch.tensor(target, dtype=torch.long), tokens

    
def create_list_input_files(path):
    """Return (and print) the paths of all ``*.tsv`` files directly in ``path``."""
    tsv_pattern = f'{path}/*.tsv'
    matches = glob.glob(tsv_pattern)
    print(matches)
    return matches

    
def create_data_loader(path, tokenizer, max_seq_len, batch_size):
    """Load every ``*.tsv`` file under ``path`` and wrap it in a DataLoader.

    Args:
        path: directory searched by create_list_input_files().
        tokenizer: tokenizer handed to ReviewDataset.
        max_seq_len: sequence length handed to ReviewDataset.
        batch_size: DataLoader batch size.

    Returns:
        ``(data_loader, df)``: a shuffling DataLoader over a ReviewDataset and
        the DataFrame it was built from, whose ``sentiment`` column has been
        replaced by class indices via LABEL_MAP.

    Raises:
        KeyError: if a sentiment value is not a key of LABEL_MAP.
    """
    logger.info("Get data loader")

    columns = ['sentiment', 'review_body']
    input_files = create_list_input_files(path)

    frames = [
        pd.read_csv(file, sep='\t', usecols=columns)
        for file in input_files
    ]

    # DataFrame.append was removed in pandas 2.0 (and copied the whole frame
    # on every iteration); concatenate once instead. pd.concat rejects an
    # empty list, so keep the original "no files -> empty frame" behaviour.
    if frames:
        df = pd.concat(frames)[columns]
    else:
        df = pd.DataFrame(columns=columns)

    print(len(df))
    print('df[sentiment]: {}'.format(df['sentiment']))

    df['sentiment'] = df.sentiment.apply(lambda sentiment: LABEL_MAP[sentiment])
    print('df[sentiment] after LABEL_MAP: {}'.format(df['sentiment']))
    print(df.head())

    # to_numpy() gives positional arrays, so the duplicated index labels left
    # by the concatenation do not matter to the dataset.
    ds = ReviewDataset(
        reviews=df.review_body.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_seq_len=max_seq_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=True
    ), df



# TODO: need to put saved config.json in code/ folder
def save_transformer_model(model, model_dir):
    """Write ``model`` via ``save_pretrained`` into ``<model_dir>/transformer``.

    The target directory is created if it does not exist yet.
    """
    target_dir = f'{model_dir}/transformer'
    os.makedirs(target_dir, exist_ok=True)
    logger.info(f'Saving Transformer model to {target_dir}')
    model.save_pretrained(target_dir)


# Needs to be saved in the model_dir root folder
def save_pytorch_model(model, model_dir):
    """Save ``model.state_dict()`` as ``<model_dir>/<MODEL_NAME>``.

    ``model_dir`` is created if it does not exist yet.
    """
    os.makedirs(model_dir, exist_ok=True)
    logger.info(f'Saving PyTorch model to {model_dir}')
    weights = model.state_dict()
    torch.save(weights, os.path.join(model_dir, MODEL_NAME))
    
def load_transformer_model(model_dir):
    """Load a RobertaForSequenceClassification from ``model_dir``.

    The configuration is read from ``<model_dir>/config.json`` and the model
    is moved to CUDA when available, otherwise to the CPU.
    """
    config_file = f'{model_dir}/config.json'
    target_device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model_config = RobertaConfig.from_json_file(config_file)
    loaded = RobertaForSequenceClassification.from_pretrained(model_dir, config=model_config)
    return loaded.to(target_device)

def load_pytorch_model(model_dir):
    """Rebuild the classifier from the state dict saved by save_pytorch_model().

    The model class cannot be constructed without a configuration, so a
    ``config.json`` is looked up in ``model_dir`` first and then in
    ``<model_dir>/transformer`` (the directory save_transformer_model()
    passes to ``save_pretrained``).

    Args:
        model_dir: directory containing ``MODEL_NAME`` and a config.json
            in one of the two locations above.

    Returns:
        The model with the saved weights loaded, placed on CUDA when
        available and on the CPU otherwise.

    Raises:
        FileNotFoundError: if no config.json is found in either location.
    """
    model_path = '{}/{}'.format(model_dir, MODEL_NAME)

    candidates = (
        '{}/config.json'.format(model_dir),
        '{}/transformer/config.json'.format(model_dir),
    )
    config_path = next((path for path in candidates if os.path.isfile(path)), None)
    if config_path is None:
        raise FileNotFoundError(
            'No config.json found for {} (looked in: {})'.format(model_path, ', '.join(candidates)))

    # The original called RobertaForSequenceClassification() with no
    # arguments, which raises TypeError because the config is required.
    config = RobertaConfig.from_json_file(config_path)
    model = RobertaForSequenceClassification(config)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.load_state_dict(torch.load(model_path, map_location=device))
    # Same placement rule as load_transformer_model().
    return model.to(device)
    

def _evaluate(model, data_loader, loss_function, device):
    """Return ``(mean_loss, accuracy_percent)`` of ``model`` over ``data_loader``.

    Runs in eval mode with gradient tracking disabled. An empty loader
    yields ``(nan, 0.0)`` instead of dividing by zero.
    """
    model.eval()
    correct = 0
    total = 0
    loss_sum = 0.0
    num_batches = 0
    with torch.no_grad():
        for sent, label, _tokens in data_loader:
            sent = sent.to(device)
            label = label.to(device)
            output = model(sent)[0]
            loss_sum += loss_function(output, label).item()
            num_batches += 1
            predicted = output.argmax(dim=1)
            total += label.size(0)
            correct += (predicted == label).sum().item()
    if total == 0:
        return float('nan'), 0.0
    return loss_sum / num_batches, 100.00 * correct / total


def train_model(model,
                train_data_loader,
                df_train,
                val_data_loader,
                df_val,
                args):
    """Fine-tune ``model`` with Adam and cross-entropy loss.

    Args:
        model: module whose call returns a sequence with the logits at
            index 0.
        train_data_loader: iterable of ``(input_ids, label, tokens)`` batches.
        df_train: unused; kept for interface compatibility.
        val_data_loader: iterable with the same batch layout, evaluated every
            100 steps when ``args.run_validation`` is true.
        df_val: unused; kept for interface compatibility.
        args: object with ``lr``, ``epochs``, ``enable_sagemaker_debugger``
            and ``run_validation``; an optional ``train_steps_per_epoch``
            caps the number of training steps per epoch (None or 0 = no cap).

    Returns:
        The same ``model`` object, trained in place.
    """
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=model.parameters(), lr=args.lr)

    # Follow the device the model already lives on. Checking
    # torch.cuda.is_available() instead would move the inputs to the GPU even
    # when the model was deliberately kept on the CPU.
    device = next(model.parameters()).device
    max_steps = getattr(args, 'train_steps_per_epoch', None)

    if args.enable_sagemaker_debugger:
        print('Enable SageMaker Debugger.')

    for epoch in range(args.epochs):
        print('EPOCH -- {}'.format(epoch))

        # Running totals over the sampled steps (every 100th) of this epoch.
        train_correct = 0
        train_total = 0

        for i, (sent, label, _tokens) in enumerate(train_data_loader):
            if max_steps and i >= max_steps:
                break

            model.train()
            optimizer.zero_grad()

            # Batches arrive as (batch, seq_len). No squeeze(0) here: it would
            # drop the batch dimension whenever a batch holds one sample.
            # NOTE(review): no attention mask is passed, so padding positions
            # are attended to — confirm this is intended.
            sent = sent.to(device)
            label = label.to(device)

            # Call the module itself (not .forward) so registered hooks run.
            output = model(sent)[0]
            loss = loss_function(output, label)
            loss.backward()
            optimizer.step()

            if i % 100 == 0:
                predicted = output.argmax(dim=1)
                train_total += label.size(0)
                train_correct += (predicted == label).sum().item()
                accuracy = 100.00 * train_correct / train_total
                print('[epoch: {0} / step: {1}] train_loss: {2:.2f} - train_acc: {3:.2f}%'.format(epoch, i, loss.item(), accuracy))

                if args.run_validation:
                    print('RUNNING VALIDATION:')
                    # Report the loss measured on the validation data, not the
                    # last training loss.
                    val_loss, val_accuracy = _evaluate(model, val_data_loader, loss_function, device)
                    print('[epoch: {0} / step: {1}] val_loss: {2:.2f} - val_acc: {3:.2f}%'.format(epoch, i, val_loss, val_accuracy))

    print('TRAINING COMPLETED.')
    return model

In [None]:
import argparse
import pprint
import json
import logging
import os
import sys
import pandas as pd
import random
import time

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torch.utils.data.distributed
from torch.utils.data import Dataset, DataLoader


from transformers import RobertaModel, RobertaTokenizer, RobertaConfig
from transformers import RobertaForSequenceClassification

# from src.utils_simple import create_data_loader, train_model, parse_args, save_pytorch_model, save_transformer_model
#import create_data_loader, train_model, parse_args, save_pytorch_model, save_transformer_model

# Module-level logger that echoes DEBUG and above to stdout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# An earlier cell configures the logger of the same name, and
# logging.getLogger() returns that same object. Attach the stdout handler
# only if none is present, so each record is printed once.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler(sys.stdout))

# Has to be called 'model.pth'
MODEL_NAME = 'model.pth'
PRE_TRAINED_MODEL_NAME = 'roberta-base'

DATA_COLUMN = 'review_body'
LABEL_COLUMN = 'sentiment'
LABEL_VALUES = [-1, 0, 1]
CLASS_NAMES = ['negative', 'neutral', 'positive']

# Maps each raw label value to its zero-based class index: {-1: 0, 0: 1, 1: 2}.
LABEL_MAP = {label: index for index, label in enumerate(LABEL_VALUES)}


#if __name__ == '__main__':

###### Parse ARGS
# args = parse_args()
# print('Loaded arguments:')
# print(args)


###### Get Environment Variables
# Dump the complete process environment, one entry per line.
# NOTE(review): this prints every variable, which may include credentials —
# confirm that is acceptable for wherever this output ends up.
env_var = os.environ 
print('Environment variables:')
pprint.pprint(dict(env_var), width = 1) 

#    print('SM_TRAINING_ENV {}'.format(env_var['SM_TRAINING_ENV']))
#    sm_training_env_json = json.loads(env_var['SM_TRAINING_ENV'])

###### Check if Training Master
#    is_master = sm_training_env_json['is_master']
#    print('is_master {}'.format(is_master))

#     if is_master:
#         checkpoint_path = args.checkpoint_base_path
#     else:
#         checkpoint_path = '/tmp/checkpoints'        
#     print('checkpoint_path {}'.format(checkpoint_path))

###### Check if distributed training
# Distributed training is switched off unconditionally; the host-count check
# is kept below (commented out) for reference.
# is_distributed = len(args.hosts) > 1 and args.backend is not None
is_distributed = False

logger.debug("Distributed training - {}".format(is_distributed))
# The GPU decision is driven by the configured GPU count, not by probing CUDA.
use_cuda = args.num_gpus > 0
logger.debug("Number of gpus available - {}".format(args.num_gpus))
# DataLoader keyword arguments for GPU runs.
# NOTE(review): `kwargs` is not referenced again in the visible code.
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

# Device the model is moved to after it has been downloaded.
device = torch.device('cuda' if use_cuda else 'cpu')

# Never entered while is_distributed is hard-coded to False above.
if is_distributed:
    ###### Initialize the distributed environment.
    # One process per host: this host's rank is its position in args.hosts.
    world_size = len(args.hosts)
    os.environ['WORLD_SIZE'] = str(world_size)
    host_rank = args.hosts.index(args.current_host)
    os.environ['RANK'] = str(host_rank)
    dist.init_process_group(backend=args.backend, rank=host_rank, world_size=world_size)
    logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
        args.backend, dist.get_world_size()) + 'Current host rank is {}. Number of gpus: {}'.format(
        dist.get_rank(), args.num_gpus))

###### Set the seed for generating random numbers
# Only the torch generators are seeded here.
torch.manual_seed(args.seed)
if use_cuda:
    torch.cuda.manual_seed(args.seed) 


###### INSTANTIATE MODEL
# Download tokenizer, config and model, retrying up to 5 times with a random
# pause between attempts.
tokenizer = None
config = None
model = None

successful_download = False
retries = 0
last_error = None

while (retries < 5 and not successful_download):
    try:
        tokenizer = RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

        # Three-class head; id2label/label2id translate between class indices
        # (0, 1, 2) and the raw sentiment values (-1, 0, 1).
        config = RobertaConfig.from_pretrained(PRE_TRAINED_MODEL_NAME,
                                               num_labels=len(CLASS_NAMES),
                                               id2label={
                                                   0: -1,
                                                   1: 0,
                                                   2: 1,
                                               },
                                               label2id={
                                                   -1: 0,
                                                   0: 1,
                                                   1: 2,
                                               })
        config.output_attentions=True
        model = RobertaForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, 
                                                                 config=config)
        model.to(device)
        successful_download = True
        print('Sucessfully downloaded after {} retries.'.format(retries))

    # `except Exception` (not a bare except) so that Ctrl-C / kernel interrupt
    # still stops the loop instead of triggering another retry.
    except Exception as error:
        last_error = error
        retries = retries + 1
        random_sleep = random.randint(1, 30)
        print('Download failed: {!r}'.format(error))
        print('Retry #{}.  Sleeping for {} seconds'.format(retries, random_sleep))
        time.sleep(random_sleep)

# Fail here, with the real cause attached, rather than continuing with None
# objects and crashing later with an unrelated error.
if tokenizer is None or model is None or config is None:
    raise RuntimeError('Not properly initialized...') from last_error

###### CREATE DATA LOADERS
# Each call returns the loader together with the DataFrame it was built from.
train_data_loader, df_train = create_data_loader(args.train_data, tokenizer, args.max_seq_len, args.train_batch_size)
val_data_loader, df_val = create_data_loader(args.validation_data, tokenizer, args.max_seq_len, args.validation_batch_size)

# Share of each dataset that the loader's sampler will visit.
logger.debug("Processes {}/{} ({:.0f}%) of train data".format(
    len(train_data_loader.sampler), len(train_data_loader.dataset),
    100. * len(train_data_loader.sampler) / len(train_data_loader.dataset)
))

# This loader holds the validation split (it was labelled "test data" before).
logger.debug("Processes {}/{} ({:.0f}%) of validation data".format(
    len(val_data_loader.sampler), len(val_data_loader.dataset),
    100. * len(val_data_loader.sampler) / len(val_data_loader.dataset)
))

# model_dir = os.environ['SM_MODEL_DIR']
print('model_dir: {}'.format(args.model_dir))

print('model summary: {}'.format(model))

###### START TRAINING

model = train_model(model,
                    train_data_loader,
                    df_train,
                    val_data_loader,
                    df_val,
                    args)

# Persist the result twice: via save_pretrained under <model_dir>/transformer
# and as a raw state dict (MODEL_NAME) in the model_dir root.
save_transformer_model(model, args.model_dir)
save_pytorch_model(model, args.model_dir)