In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import warnings
# Blanket suppression: ignore every warning category for the rest of the session.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
# Already covered by the blanket filters above; kept as an explicit filter for
# UserWarning raised from the torch.nn.functional module.
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.functional")

def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments, does nothing."""
    return None
import warnings
# From here on every warnings.warn(...) call in this process is routed to the
# no-op above, so warnings are dropped regardless of the active filters.
warnings.warn = warn

In [None]:
import sys
import os
# Login name from the environment; None when USER is not set.
curruser = os.getenv('USER')
# Put the per-user Python 3.6 site-packages directory first on the import path.
sys.path[:0] = ['/home/{}/python36-libs/lib/python3.6/site-packages/'.format(curruser)]

In [None]:
import warnings
warnings.filterwarnings('ignore')
import os
import shutil
import json
import argparse
import time
import sys
import pickle as pkl
import string
from tqdm import tqdm, tqdm_notebook
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
# from torchviz import make_dot
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter
import sentencepiece as spm
# from bpe import Encoder
from src.cnn_model import CharacterLevelLSTMCNN
from src.data_loader import MyDataset
from src import utils
from collections import OrderedDict

In [None]:
"{}".format(str(torch._C._cuda_getDriverVersion()))

In [None]:
# Display the name of the default CUDA device as the cell output.
# NOTE(review): presumably raises on a host without CUDA — confirm before running on CPU.
torch.cuda.get_device_name()

In [None]:
# Select GPU 0 for subsequent CUDA work. Guarded like the rest of the notebook
# (train/evaluate/run all check torch.cuda.is_available()) so that a CPU-only
# host does not fail on this cell.
if torch.cuda.is_available():
    torch.cuda.set_device(0)

In [None]:
def train(model, training_generator, optimizer, criterion, epoch, writer, print_every=1000):
    """Run one training epoch and log per-batch metrics.

    Args:
        model: network to optimise; put into training mode by this function.
        training_generator: sized iterable yielding ``(features, labels)`` batches.
        optimizer: optimizer whose ``zero_grad()`` / ``step()`` are called per batch.
        criterion: loss callable invoked as ``criterion(predictions, labels)``.
        epoch: zero-based epoch index; used for the global step and in messages.
        writer: object exposing ``add_scalar(tag, value, global_step)``.
        print_every: print the running means every this many iterations;
            ``0`` or ``None`` disables the periodic print.

    Returns:
        ``(mean_loss, mean_accuracy)`` over all batches of the epoch.
    """
    warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.functional")

    model.train()
    losses = []
    accuracies = []
    num_iter_per_epoch = len(training_generator)
    # Device availability does not change during the epoch; query it once.
    use_cuda = torch.cuda.is_available()

    # tqdm advances the bar by itself on every iteration of the wrapped
    # iterable, so no manual progress_bar.update() is needed (calling it as
    # well would count each batch twice and overshoot ``total``).
    progress_bar = tqdm(enumerate(training_generator),
                        total=num_iter_per_epoch,
                        file=sys.stdout)

    for step, (features, labels) in progress_bar:
        if use_cuda:
            features = features.cuda()
            labels = labels.cuda()

        optimizer.zero_grad()
        predictions = model(features)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()

        training_metrics = utils.get_evaluation(labels,
                                                predictions,
                                                list_metrics=["accuracy", "f1"])
        loss_value = loss.item()
        accuracy = training_metrics["accuracy"]
        f1 = training_metrics["f1"]
        losses.append(loss_value)
        accuracies.append(accuracy)

        # Global step keeps the TensorBoard x-axis monotonic across epochs.
        global_step = epoch * num_iter_per_epoch + step
        writer.add_scalar('Train/Loss', loss_value, global_step)
        writer.add_scalar('Train/Accuracy', accuracy, global_step)
        writer.add_scalar('Train/f1', f1, global_step)

        if print_every and step % print_every == 0:
            print("[Training - Epoch: {}] , Iteration: {}/{} , Loss: {}, Accuracy: {}".format(
                epoch + 1,
                step + 1,
                num_iter_per_epoch,
                np.mean(losses),
                np.mean(accuracies)
            ))

    return np.mean(losses), np.mean(accuracies)

In [None]:
def evaluate(model, validation_generator, criterion, epoch, writer, print_every=500):
    """Evaluate the model on the validation batches and log per-batch metrics.

    Args:
        model: network to evaluate; put into eval mode by this function.
        validation_generator: sized iterable yielding ``(features, labels)`` batches.
        criterion: loss callable invoked as ``criterion(predictions, labels)``.
        epoch: zero-based epoch index; used for the global step and in messages.
        writer: object exposing ``add_scalar(tag, value, global_step)``.
        print_every: print the running means every this many iterations;
            ``0`` or ``None`` disables the periodic print.

    Returns:
        ``(mean_loss, mean_accuracy)`` over all validation batches.
    """
    model.eval()
    losses = []
    accuracies = []
    num_iter_per_epoch = len(validation_generator)
    # Device availability does not change during the pass; query it once.
    use_cuda = torch.cuda.is_available()

    # tqdm advances the bar by itself on every iteration of the wrapped
    # iterable, so no manual progress_bar.update() is needed (calling it as
    # well would count each batch twice and overshoot ``total``).
    progress_bar = tqdm(enumerate(validation_generator),
                        total=num_iter_per_epoch,
                        file=sys.stdout)

    for step, (features, labels) in progress_bar:
        if use_cuda:
            features = features.cuda()
            labels = labels.cuda()

        # Forward pass without building an autograd graph.
        with torch.no_grad():
            predictions = model(features)
        loss = criterion(predictions, labels)

        validation_metrics = utils.get_evaluation(labels,
                                                  predictions,
                                                  list_metrics=["accuracy", "f1"])
        loss_value = loss.item()
        accuracy = validation_metrics['accuracy']
        f1 = validation_metrics['f1']
        losses.append(loss_value)
        accuracies.append(accuracy)

        # Global step keeps the TensorBoard x-axis monotonic across epochs.
        global_step = epoch * num_iter_per_epoch + step
        writer.add_scalar('Test/Loss', loss_value, global_step)
        writer.add_scalar('Test/Accuracy', accuracy, global_step)
        writer.add_scalar('Test/f1', f1, global_step)

        if print_every and step % print_every == 0:
            print("[Validation - Epoch: {}] , Iteration: {}/{} , Loss: {}, Accuracy: {}".format(
                epoch + 1,
                step + 1,
                num_iter_per_epoch,
                np.mean(losses),
                np.mean(accuracies)))

    return np.mean(losses), np.mean(accuracies)

In [None]:
def run(args, both_cases=False):
    """Build the model and dataset from ``args`` and run the train/validate loop.

    Side effects: deletes and recreates ``args.log_path``, creates
    ``args.output`` if missing, writes TensorBoard scalars, and (when
    ``args.checkpoint == 1``) saves the model each time the validation loss
    improves.

    Args:
        args: namespace of settings. Read here: ``log_path``, ``output``,
            ``batch_size``, ``workers``, ``useSentencePieceTokenizer``,
            ``data_path_to_SentpBPE``, ``useNGramBPETokenizer``,
            ``data_path_to_NGramBPE``, ``usembedding``, ``validation_split``,
            ``optimizer``, ``learning_rate``, ``epochs``, ``schedule``,
            ``checkpoint``, ``model_name``, ``patience``. Written here:
            ``trigram_map_len``, ``vocabSize`` (tokenizer branches only) and
            ``_embedding``.
        both_cases: currently unused; kept for interface compatibility.

    Raises:
        ValueError: if ``args.optimizer`` is neither ``'sgd'`` nor ``'adam'``.
    """
    log_path = args.log_path
    # Every run starts from an empty log directory: earlier logs are deleted.
    if os.path.isdir(log_path):
        shutil.rmtree(log_path)
    os.makedirs(log_path)

    os.makedirs(args.output, exist_ok=True)

    batch_size = args.batch_size

    training_params = {"batch_size": batch_size,
                       "shuffle": True,
                       "num_workers": args.workers}

    validation_params = {"batch_size": batch_size,
                         "shuffle": False,
                         "num_workers": args.workers}

    ##########################################################
    # Calculate vocab size for ngram tokenizer
    # (the ngram -> id mapping returned alongside is not needed here)
    ##########################################################
    _, vocab_size = MyDataset._gen_ngrams()
    args.trigram_map_len = vocab_size + 1

    #############################################################
    # Calculate vocab size for different pretrained Tokenizers
    #############################################################
    if args.useSentencePieceTokenizer:
        sp = spm.SentencePieceProcessor()
        sp.Load(args.data_path_to_SentpBPE)
        args.vocabSize = sp.GetPieceSize()
    elif args.useNGramBPETokenizer:
        # NOTE(review): `Encoder` is not imported in this notebook (the
        # `from bpe import Encoder` line is commented out), so this branch
        # raises NameError until that import is restored.
        bpe = Encoder.load(args.data_path_to_NGramBPE)
        args.vocabSize = bpe.vocab_size

    #############################################################
    # MAIN: Model instantiation and params initialization
    #############################################################
    model = CharacterLevelLSTMCNN(args)
    if torch.cuda.is_available():
        model.cuda()
    # Expose the embedding weights (as a NumPy array) to the dataset via args.
    if args.usembedding:
        args._embedding = model._embedding.Embedding.weight.data.cpu().numpy()
    else:
        args._embedding = None
    full_dataset = MyDataset(args)
    ##########################################################

    # Despite its name, args.validation_split is used here as the fraction of
    # the data assigned to the TRAINING set; the remainder is validation.
    train_size = int(args.validation_split * len(full_dataset))
    validation_size = len(full_dataset) - train_size
    training_set, validation_set = torch.utils.data.random_split(
        full_dataset, [train_size, validation_size])
    training_generator = DataLoader(training_set, **training_params)
    validation_generator = DataLoader(validation_set, **validation_params)

    criterion = nn.CrossEntropyLoss()
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(
            model.parameters(), lr=args.learning_rate, momentum=0.9
        )
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(
            model.parameters(), lr=args.learning_rate
        )
    else:
        # Fail with a clear message instead of an unbound-variable error later.
        raise ValueError(
            "Unsupported optimizer {!r}; expected 'sgd' or 'adam'".format(args.optimizer))

    best_loss = 1e10
    best_epoch = 0

    writer = SummaryWriter(log_path)
    try:
        for epoch in range(args.epochs):
            training_loss, training_accuracy = train(model,
                                                     training_generator,
                                                     optimizer,
                                                     criterion,
                                                     epoch,
                                                     writer)

            validation_loss, validation_accuracy = evaluate(model,
                                                            validation_generator,
                                                            criterion,
                                                            epoch,
                                                            writer)

            print('[Epoch: {} / {}]\ttrain_loss: {:.4f} \ttrain_acc: {:.4f} \tval_loss: {:.4f} \tval_acc: {:.4f}'.
                  format(epoch + 1, args.epochs, training_loss, training_accuracy,
                         validation_loss, validation_accuracy))
            print("=" * 50)

            # Learning rate scheduling: for SGD only, halve the learning rate
            # every `args.schedule` epochs (never at epoch 0); 0 disables it.
            if args.schedule != 0:
                if args.optimizer == 'sgd' and epoch % args.schedule == 0 and epoch > 0:
                    current_lr = optimizer.param_groups[0]['lr'] / 2
                    print('Decreasing learning rate to {0}'.format(current_lr))
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = current_lr

            # Track the best validation loss; checkpoint on every improvement.
            if validation_loss < best_loss:
                best_loss = validation_loss
                best_epoch = epoch
                if args.checkpoint == 1:
                    checkpoint_name = '{}_epoch_{}_lr_{}_loss_{}_acc_{}.pth'.format(
                        args.model_name,
                        epoch,
                        optimizer.param_groups[0]['lr'],
                        round(validation_loss, 4),
                        round(validation_accuracy, 4))
                    # os.path.join works whether or not args.output ends with a
                    # path separator.
                    torch.save(model, os.path.join(args.output, checkpoint_name))

            # Early stopping: more than `patience` epochs without improvement
            # (patience <= 0 disables it).
            if epoch - best_epoch > args.patience > 0:
                print("Stop training at epoch {}. The lowest loss achieved is {} at epoch {}".format(
                    epoch, best_loss, best_epoch))
                break
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()


In [None]:
# Default for --max_length: the number of contiguous windows of sizes 1..4 in a
# sequence of maxSeqLen characters (a length-L sequence has L - k + 1 windows of
# size k), i.e. 120 + 119 + 118 + 117 = 474 for maxSeqLen = 120.
# NOTE(review): presumably this matches the n-gram count produced when
# useBOCNGrams is enabled — confirm against the dataset code.
maxSeqLen = 120
maxlen = np.sum([(maxSeqLen-ker)+1 for ker in range(1,5)]) 
# Bare expression: shows the computed value as the cell output.
maxlen

In [None]:
def str2bool(value):
    """Convert a command-line token to a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty text (including ``"False"``) becomes ``True``. This converter
    interprets the text instead.

    Args:
        value: a ``bool`` (returned unchanged) or a string such as
            ``true/false``, ``t/f``, ``yes/no``, ``y/n``, ``1/0``
            (case-insensitive). The empty string is treated as ``False``,
            as ``bool("")`` was.

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'Boolean value expected, got {!r}'.format(value))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        'Character Based CNN for text classification')
    parser.add_argument('--data_path', type=str, default='./data/dnaseg/csv/DnaSeg4TrainwLabelsSampled_wRandL.csv')
    parser.add_argument('--validation_split', type=float, default=0.95)
    parser.add_argument('--label_column', type=str, default='label')
    parser.add_argument('--text_column', type=str, default='seq')
    parser.add_argument('--max_rows', type=int, default=6000000)
    parser.add_argument('--chunksize', type=int, default=1000000)
    parser.add_argument('--encoding', type=str, default='utf8')
    parser.add_argument('--sep', type=str, default=';')
    parser.add_argument('--steps', nargs='+', default=None)
    parser.add_argument('--alphabet', type=str, default="""ACGT""") #char_indices: {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    parser.add_argument('--number_of_characters', type=int, default=4)
    # NOTE(review): declared as str but defaults to an empty list — confirm
    # which type the consumers of `extra_characters` expect.
    parser.add_argument('--extra_characters', type=str, default=[])
    parser.add_argument('--config_path', type=str, default='./config.json')
    parser.add_argument('--size', type=str,
                        choices=['small', 'large'], default='small')
    # If useBOCNGrams is True, set this to the maxlen variable; otherwise choose some int value
    parser.add_argument('--max_length', type=int, default=maxlen)
    # Add a new signal with the use of Conv1D operator over filters rather than over sequence channel
    parser.add_argument('--useSeparatedConv1D', type=str2bool, default=True)
    # Decide to add LSTM signal or not
    parser.add_argument('--useLSTM', type=str2bool, default=False)
    # Decide whether to choose a Conv1D operator for dimensionality reduction
    # or apply an AvgPool1D operator (with fixed moving window size determined by setLSTMAvgPoolKernelSize)
    parser.add_argument('--applyConv1DForLSTM', type=str2bool, default=False)
    parser.add_argument('--setLSTMAvgPoolKernelSize', type=int, default=30)
    # Shuffle dims of LSTM output tensor and perform a Conv1D operation over sequence space channel
    parser.add_argument('--changeConv1DDirLSTM', type=str2bool, default=False)
    # Use pretrained SentencePiece tokenizer with Unigram/BPE model
    parser.add_argument('--useSentencePieceTokenizer', type=str2bool, default=False)
    parser.add_argument('--data_path_to_SentpBPE', type=str, default="models/BPE/sentpbpe.model")
    # Use pretrained BPE tokenizer with controllable range of ngrams
    parser.add_argument('--useNGramBPETokenizer', type=str2bool, default=False)
    parser.add_argument('--data_path_to_NGramBPE', type=str, default="models/BPE/bpe.model")
    # Enable word hashing with uni/bi/trigrams tokenization approach
    parser.add_argument('--useBOCNGrams', type=str2bool, default=True)
    # Embedding usage
    parser.add_argument('--usembedding', type=str2bool, default=False)
    # Substitute One-Hot with the relevant Embedding on the forward pass after yielding data from DataGenerator;
    # otherwise transform One-Hot to its Embed vector at the data-loading stage
    parser.add_argument('--embedAfterBatches', type=str2bool, default=False)
    parser.add_argument('--embedlength', type=int, default=100)
    parser.add_argument('--number_of_classes', type=int, default=4)
    parser.add_argument('--epochs', type=int, default=140)
    parser.add_argument('--batch_size', type=int, default=1024)
    parser.add_argument('--useBatchNormalization', type=str2bool, default=True)
    parser.add_argument('--optimizer', type=str,
                        choices=['adam', 'sgd'], default='sgd')
    parser.add_argument('--learning_rate', type=float, default=1e-2)
    parser.add_argument('--schedule', type=int, default=20)
    parser.add_argument('--patience', type=int, default=10)
    parser.add_argument('--checkpoint', type=int, choices=[0, 1], default=1)
    parser.add_argument('--workers', type=int, default=8)
    parser.add_argument('--log_path', type=str, default='./logs/')
    parser.add_argument('--output', type=str, default='./models/torch/')
    parser.add_argument('--model_name', type=str, default='char_cnn_bpe')
    

In [None]:
def is_interactive():
    """Return True when the ``__main__`` module has no ``__file__`` attribute.

    That is the case in a Jupyter kernel or IPython console, and not when the
    code runs as a script file.
    """
    main_module = sys.modules['__main__']
    if hasattr(main_module, '__file__'):
        return False
    return True

# work-around for Jupyter notebook and IPython console:
# in an interactive session sys.argv holds the kernel's own arguments, so an
# empty list is parsed instead and every option takes its declared default.
argv = [] if is_interactive() else sys.argv[1:]
# `parser` is the ArgumentParser built in the previous cell.
args = parser.parse_args(argv)

In [None]:
# Create the output directory under the current working directory.
# NOTE(review): this assumes the notebook is launched from the project root — confirm.
ROOT_DIR = os.path.abspath(os.getcwd())
OUTPUT_DIR = os.path.join(ROOT_DIR, "torch_output")
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
## Test Steps

In [None]:
##########################################################
# Stand-alone smoke test: repeats the vocabulary/tokenizer setup done inside
# run() and instantiates the model from `args`, without training.
##########################################################
trigram_map, vocab_size=MyDataset._gen_ngrams()
# One more than the vocabulary size reported by _gen_ngrams().
args.trigram_map_len = vocab_size+1

#############################################################
# Calculate vocab size for different pretrained Tokenizers 
#############################################################
if args.useSentencePieceTokenizer:
    sp = spm.SentencePieceProcessor()
    sp.Load(args.data_path_to_SentpBPE)            
    args.vocabSize = sp.GetPieceSize()
elif args.useNGramBPETokenizer:
    # NOTE(review): `Encoder` is not imported in this notebook (the
    # `from bpe import Encoder` line is commented out), so this branch raises
    # NameError until that import is restored.
    bpe = Encoder.load(args.data_path_to_NGramBPE)
    args.vocabSize = bpe.vocab_size
        
model = CharacterLevelLSTMCNN(args)

# args._embedding = model._embedding  

# full_dataset = MyDataset(args)


In [None]:
# input = torch.rand((1024,400,100))
# input = input.to(model.device)
# out = model.forward(input)

In [None]:
# make_dot(out).render("./img/dcnn_graph_view")

In [None]:
# dict(model.named_parameters())['_embedding.Embedding.weight']

In [None]:
# if torch.cuda.is_available():
#     model.cuda()  
# if args.usembedding:    
#     args._embedding = model._embedding.weight.data.cpu().numpy()  
# else:
#     args._embedding = None

# full_dataset = MyDataset(args)
# x, y = full_dataset[100]
# lst = [np.any(x[i]) for i in range(x.shape[0])]
# len(lst) -1 - lst[::-1].index(True)

In [None]:
# from torch.multiprocessing import set_start_method
# try:
#     set_start_method('spawn')
# except RuntimeError:
#     pass

### Train model

In [None]:
# Start the full training/validation loop with the parsed settings.
run(args)