## Utils

In [2]:
import pandas as pd
import torch
import numpy as np
from sklearn.metrics import recall_score, precision_score, f1_score, classification_report

import json
import logging
import os
import shutil

import torch

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import recall_score, precision_score, f1_score, classification_report
import torch

import re
import contractions
import emoji
import string
from nltk.tokenize import TweetTokenizer
from wordcloud import STOPWORDS
import numpy as np

# The star import supplies PorterStemmer; the shared instance below is the
# `stemmer` that clean_text uses when apply_stemming is set.
from nltk.stem.porter import *
stemmer = PorterStemmer()

from nltk.corpus import stopwords
import emoji
import string
from textblob import TextBlob

# NOTE: this rebinds `stopwords` from the nltk corpus reader to the plain list
# of English stop words; generate_features reads this module-level list.
stopwords = stopwords.words('english')

#Features
def generate_features(df):
    """Compute hand-crafted surface features for every row of ``df['text']``.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        DataFrame sharing the index of ``df`` with the columns ``emojis``,
        ``urls``, ``hashtags``, ``users``, ``words``, ``unique_words``,
        ``chars``, ``stopwords``, ``punctuations``, ``numerics``, ``upper``,
        ``title``, ``polarity`` and ``subjectivity`` (in that order).
    """

    # Coerce once so that every feature is computed on the same string value
    # (the previous code wrapped only some of the features in str()).
    text = df['text'].astype(str)

    # Set membership is O(1); the module-level `stopwords` is a list.
    stopword_set = set(stopwords)
    punctuation_set = set(string.punctuation)

    features = pd.DataFrame(index=text.index)

    # Placeholder / markup counts.
    features['emojis'] = text.apply(lambda s: emoji.emoji_count(s))
    features['urls'] = text.apply(lambda s: s.count("URL"))
    features['hashtags'] = text.apply(lambda s: s.count("#"))
    features['users'] = text.apply(lambda s: s.count("USER"))

    # Length statistics.
    features['words'] = text.apply(lambda s: len(s.split(" ")))
    features['unique_words'] = text.apply(lambda s: len(set(s.split(" "))))
    features['chars'] = text.str.len()
    features['stopwords'] = text.apply(
        lambda s: sum(1 for token in s.split() if token in stopword_set))
    # The '@' of a mention and the '#' of a hashtag are not counted as punctuation.
    features['punctuations'] = (
        text.apply(lambda s: sum(1 for char in s if char in punctuation_set))
        - features['users'] - features['hashtags'])
    features['numerics'] = text.apply(
        lambda s: sum(1 for token in s.split() if token.isdigit()))

    # Casing statistics; the URL / USER placeholders are upper-case themselves,
    # so they are subtracted from the upper-case token count.
    features['upper'] = (
        text.apply(lambda s: sum(1 for token in s.split() if token.isupper()))
        - features['urls'] - features['users'])
    features['title'] = text.apply(
        lambda s: sum(1 for token in s.split() if token.istitle()))

    # Run the sentiment analysis once per row and read both scores from it.
    sentiment = text.apply(lambda s: TextBlob(s).sentiment)
    features['polarity'] = sentiment.apply(lambda s: np.round(s.polarity, 2))
    features['subjectivity'] = sentiment.apply(lambda s: np.round(s.subjectivity, 2))

    return features

#Metrics
def accuracy_recall_precision_f1(y_pred, y_target):

    """Computes the accuracy, recall, precision and f1 score for given predictions and targets
    Args:
        y_pred: Logits of the predictions, one column per class
        y_target: Target class ids
    Returns:
        (accuracy, recall, precision, f1) where accuracy is a scalar and the
        other three are arrays with exactly one entry per class, ordered by
        class id.
    """

    y_pred = y_pred.detach().cpu()
    y_true = y_target.detach().cpu().numpy()

    predictions = torch.argmax(y_pred, dim=1).numpy()

    correct = np.sum(predictions == y_true)
    accuracy = correct / len(predictions)

    # Pass the full label set explicitly. Without it sklearn only reports the
    # classes that occur in this particular batch, so a single-class batch
    # yields a length-1 array that callers would silently broadcast into
    # their per-class accumulators.
    class_ids = list(range(y_pred.shape[1]))

    recall = recall_score(y_true, predictions, labels=class_ids, average=None)
    precision = precision_score(y_true, predictions, labels=class_ids, average=None)
    f1 = f1_score(y_true, predictions, labels=class_ids, average=None)

    return accuracy, recall, precision, f1

#Preprocess Data
def clean_text(text, remove_punt_number_special_chars=False,remove_stopwords=False, apply_stemming=False):
    """Clean a single tweet and return it as a whitespace-joined token string.
    Args:
        text: (str) Text
        remove_punt_number_special_chars: (bool) Remove punctuations, numbers and special characters
        remove_stopwords: (bool) Remove stopwords
        apply_stemming: (bool) Apply stemming on the words on the text
    Returns:
        (str) The cleaned, lower-cased text with tokens separated by one space
    """
    #Remove emojis: turn them into their :alias: form once, then drop the aliases
    text = emoji.demojize(text)
    text = re.sub(r":[a-zA-Z\-_]*:", "", text) #:hear-no-evil_monkey:
    text = re.sub(r":\w+:", "", text)
    text = re.sub(r":\w+['’]\w+:", "", text) #:woman's_boot:

    #Remove mentions, usernames (@USER).
    #Replaced by a space so that the neighbouring words are not glued together.
    text = re.sub(r"\s*@USER\s*", " ", text)

    #Remove URL
    text = re.sub(r"\s*URL\s*", " ", text)

    #HTML entities
    text = re.sub("&amp;", "and", text)
    text = re.sub("&lt;", "<", text)
    text = re.sub("&gt;", ">", text)
    text = re.sub("&", "and", text)

    #Replace contractions and slang of word
    text = re.sub("i'm", "I'm", text)
    text = contractions.fix(text, slang=True)

    #Lowercase
    text = text.lower()

    #Remove Hashtags + Words
    text = re.sub(r"#\s*\w+\s*", " ", text)

    #Collapse runs of whitespace
    text = re.sub(r"\s{2,}", " ", text)

    #Remove non ascii characters (str is immutable, so the result must be kept)
    text = text.encode("ascii", errors="ignore").decode()

    #Remove punctuations, numbers and special characters (remove emoticons)
    if remove_punt_number_special_chars:
        text = re.sub('[^a-zA-Z]', ' ', text)

    #Tokenize text
    tokenizer = TweetTokenizer(preserve_case=False,
                               strip_handles=True,
                               reduce_len=True)

    text_tokens = tokenizer.tokenize(text)

    #Remove stopwords
    if remove_stopwords:
        stopword_set = set(STOPWORDS)
        text_tokens = [token for token in text_tokens if token not in stopword_set]

    #Stemming (the stems replace the tokens that end up in the output)
    if apply_stemming:
        text_tokens = [stemmer.stem(token) for token in text_tokens]

    return " ".join(text_tokens)

#BERT
def convert_examples_to_features(X, y, max_seq_length, tokenizer):

    """Convert texts and labels into fixed-length BERT input features.
    Args:
        X: Texts, one per example
        y: Labels, one per example (each must be convertible with int())
        max_seq_length: (int) Maximum length of the sequences, including [CLS] and [SEP]
        tokenizer: Tokenizer providing tokenize() and convert_tokens_to_ids()
    Returns:
        DataFrame with one row per example and the columns input_ids,
        input_mask, segment_ids (lists of length max_seq_length) and label_id.
    """

    col_names = ["input_ids","input_mask","segment_ids","label_id"]

    df = pd.DataFrame({"text":X, "label":y})

    # Collect the rows in a list and build the frame once at the end;
    # appending to a DataFrame row by row re-allocates it on every example.
    rows = []

    for text, label in zip(df["text"], df["label"]):

        # Account for [CLS] and [SEP] with "- 2"
        tokens_text = tokenizer.tokenize(text)[:max_seq_length - 2]

        tokens = ["[CLS]"] + tokens_text + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))

        rows.append({
            # The mask has 1 for real tokens and 0 for padding tokens.
            'input_mask': [1] * len(input_ids) + padding,
            # Single-sentence input: every position belongs to segment 0.
            'segment_ids': [0] * len(tokens) + padding,
            'input_ids': input_ids + padding,
            'label_id': int(label),
        })

    return pd.DataFrame(rows, columns=col_names)

#Checkpoints save and load
def save_checkpoint(state, directory, checkpoint):

    """Saves model and training parameters at checkpoint
    Args:
        state: (dict) contains model's state_dict, may contain other keys such as epoch, optimizer state_dict
        directory: (string) folder where the checkpoint is written; created if missing
        checkpoint: (string) file name of the checkpoint inside directory
    """

    # os.path.join gives the same path as before when directory ends with a
    # separator and the intended one when it does not (plain concatenation
    # would have written "<directory><checkpoint>" next to the folder).
    filepath = os.path.join(directory, checkpoint)

    if directory:
        # exist_ok avoids failing when the folder appears between check and creation.
        os.makedirs(directory, exist_ok=True)

    torch.save(state, filepath)


def load_checkpoint(checkpoint, model, optimizer=None, map_location=None):

    """Loads model parameters (state_dict) from file_path. If optimizer is provided, loads state_dict of
    optimizer assuming it is present in checkpoint.
    Args:
        checkpoint: (string) filename which needs to be loaded
        model: (torch.nn.Module) model for which the parameters are loaded
        optimizer: (torch.optim) optional: resume optimizer from checkpoint
        map_location: optional: forwarded to torch.load, e.g. "cpu" to load a
            checkpoint that was written on a GPU machine
    Raises:
        FileNotFoundError: if the checkpoint file does not exist
    """

    if not os.path.exists(checkpoint):
        # Raising a bare string is itself a TypeError; raise a real exception.
        raise FileNotFoundError("File doesn't exist {}".format(checkpoint))

    state = torch.load(checkpoint, map_location=map_location)
    model.load_state_dict(state['state_dict'])

    if optimizer is not None:
        optimizer.load_state_dict(state['optim_dict'])


## Load Data

In [3]:
import json
from pathlib import Path
from collections import OrderedDict

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import recall_score, precision_score, f1_score, classification_report
import torch

import re
import contractions
import emoji
import string
from nltk.tokenize import TweetTokenizer
from wordcloud import STOPWORDS
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data

import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)

from utils import convert_examples_to_features
from utils import clean_text

from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig
from pytorch_pretrained_bert.tokenization import BertTokenizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from utils import generate_features

def load_glove(embedding_file):

    """Load GloVe file
    Args:
        embedding_file: (str) Path of the embedding file; one word per line
            followed by its whitespace-separated vector components
    Returns:
        dict mapping each word to its float32 vector
    """

    embeddings_index = dict()

    # The context manager closes the handle; the encoding is given explicitly
    # so the result does not depend on the platform default.
    with open(embedding_file, encoding="utf-8") as handle:
        for line in handle:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    print("Loaded {} word vectors".format(len(embeddings_index)))

    return embeddings_index

def create_weight_matrix(vocab_size, word_index, embedding_dim, embeddings_index):

    """Create weight matrix for the embeddings
    Args:
        vocab_size: Vocabulary size (number of rows of the matrix)
        word_index: Mapping from word to its row index
        embedding_dim: Dimension of the embeddings
        embeddings_index: Mapping from word to its embedding vector
    Returns:
        Array of shape (vocab_size, embedding_dim); rows of words without a
        known embedding stay all-zero.
    """
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    for word, row in word_index.items():
        # A word index may hold more words than the matrix has rows;
        # skip those instead of raising IndexError.
        if not 0 <= row < vocab_size:
            continue

        vector = embeddings_index.get(word)
        if vector is None:
            # Fall back to the capitalised spelling of the word.
            vector = embeddings_index.get(word.capitalize())

        if vector is not None:
            embedding_matrix[row] = vector

    return embedding_matrix

#Load data
def load_data():
    """
    Loads the train, validation and test sets.

    When data/train.csv, data/val.csv and data/test.csv all exist they are
    read as-is. Otherwise the SemEval files under data/SemEval are read,
    de-duplicated on the tweet text, the training file is split 80/20 into
    train and validation, and the three sets are written to the CSV files
    above.

    Returns:
        (train, val, test) DataFrames with the columns 'text' and 'label'
    """

    RANDOM_STATE = 123

    cached_files = [Path("data/train.csv"), Path("data/val.csv"), Path("data/test.csv")]

    # Only trust the cache when every split is present; otherwise rebuild all of them.
    if all(path.exists() for path in cached_files):
        train, val, test = [pd.read_csv(path) for path in cached_files]
        return train, val, test

    # Split normal data
    train_cola = pd.read_csv("data/SemEval/olid-training-v1.0.tsv", delimiter="\t")
    test_cola = pd.read_csv("data/SemEval/testset-levela.tsv", delimiter="\t")
    labels_cola = pd.read_csv("data/SemEval/labels-levela.csv", header=None)
    labels_cola.columns = ['id', 'subtask_a']

    test = pd.merge(test_cola, labels_cola, on='id')

    # Remove duplicates
    train_cola = train_cola.drop_duplicates("tweet")
    test = test.drop_duplicates("tweet")

    train, val = train_test_split(train_cola, test_size=0.2, random_state=RANDOM_STATE)

    def _as_text_label(frame):
        # reset_index returns a new frame, so its result has to be kept; this
        # gives freshly built splits the same 0..n-1 index as cached ones.
        frame = frame[["tweet", "subtask_a"]].reset_index(drop=True)
        frame.columns = ['text', 'label']
        return frame

    train = _as_text_label(train)
    val = _as_text_label(val)
    test = _as_text_label(test)

    for frame, path in zip((train, val, test), cached_files):
        frame.to_csv(path, index=False)

    return train, val, test

def load_data_features():
    """
    Loads the train, validation and test sets together with their
    hand-crafted feature tables.

    The text splits come from load_data(). The feature tables are read from
    data/features_{train,val,test}.csv when all three exist; otherwise they
    are generated with generate_features() and written to those files.

    Returns:
        (train, val, test, features_train, features_val, features_test)
    """

    # Reuse the split logic instead of repeating it here.
    train, val, test = load_data()

    feature_files = [Path("data/features_train.csv"),
                     Path("data/features_val.csv"),
                     Path("data/features_test.csv")]

    # Check the feature files themselves: train.csv may already exist (written
    # by load_data) without the feature tables having been generated.
    if all(path.exists() for path in feature_files):
        # The tables are written with their index as first column; read it
        # back as the index so it does not show up as an extra feature.
        features_train, features_val, features_test = [
            pd.read_csv(path, index_col=0) for path in feature_files]
    else:
        # Generate features
        features_train = generate_features(train)
        features_val = generate_features(val)
        features_test = generate_features(test)

        for frame, path in zip((features_train, features_val, features_test), feature_files):
            frame.to_csv(path)

    return train, val, test, features_train, features_val, features_test

#Clean Data
def clean_data(df, remove_punt_number_special_chars=False,remove_stopwords=False, apply_stemming=False):
    """Clean the texts and encode the labels of a data split.

    No rows are dropped: the outputs have one entry per row of df.

    Args:
        df: Dataframe with the columns 'text' and 'label'
        remove_punt_number_special_chars: (bool) forwarded to clean_text
        remove_stopwords: (bool) forwarded to clean_text
        apply_stemming: (bool) forwarded to clean_text
    Returns:
        (text_clean, labels): list of cleaned texts and array of encoded labels
    """

    labels = encode_label(df["label"])
    text_clean = [
        clean_text(text, remove_punt_number_special_chars, remove_stopwords, apply_stemming)
        for text in df["text"]
    ]

    return text_clean, labels

#Get Dataloader
def get_dataloader(examples, batch_size):
    """Wrap BERT input features in a randomly sampled DataLoader
        Arguments:
            examples: Table with the columns input_ids, input_mask, segment_ids and label_id
            batch_size: (int) Batch size
        Returns:
            DataLoader yielding (input_ids, input_mask, segment_ids, label_id) batches of long tensors
    """

    columns = (examples.input_ids, examples.input_mask, examples.segment_ids, examples.label_id)
    tensors = [torch.tensor(list(column), dtype=torch.long) for column in columns]

    dataset = TensorDataset(*tensors)

    return DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)

def make_iterator(X, y, batch_size):

    """Build a sequential (unshuffled) DataLoader over X and y
    Args:
        X: Inputs, converted to a long tensor
        y: Targets, converted to a float32 tensor
        batch_size: (int) Batch size
    Returns:
        DataLoader yielding (X, y) batches in the original order
    """

    inputs = torch.tensor(X, dtype=torch.long)
    targets = torch.tensor(y, dtype=torch.float32)

    return DataLoader(TensorDataset(inputs, targets), batch_size=batch_size)

def make_iterator_features(X, features, y, batch_size):

    """Build a sequential (unshuffled) DataLoader over X, features and y
    Args:
        X: Inputs, converted to a long tensor
        features: Per-example feature rows, converted to a long tensor
        y: Targets, converted to a float32 tensor
        batch_size: (int) Batch size
    Returns:
        DataLoader yielding (X, features, y) batches in the original order
    """

    inputs = torch.tensor(X, dtype=torch.long)
    # NOTE(review): the features are cast to long, so fractional values are
    # truncated - confirm this is intended for the caller's feature table.
    feature_rows = torch.tensor(list(features), dtype=torch.long)
    targets = torch.tensor(y, dtype=torch.float32)

    dataset = TensorDataset(inputs, feature_rows, targets)

    return DataLoader(dataset, batch_size=batch_size)

def encode_label(y):

    """Map the labels in y to integer class ids
    Args:
        y: Label column exposing .values (e.g. a pandas Series)
    Returns:
        Array with the integer id of every label, as assigned by a
        LabelEncoder fitted on y itself
    """

    raw_labels = y.values
    encoder = LabelEncoder()

    return np.array(encoder.fit_transform(raw_labels))

def get_data_bert(max_seq_length, batch_sizes):

    """Load, clean and tokenize the data and wrap it in BERT-ready data loaders
    Args:
        max_seq_length: (int) Max sequence length of the sentences
        batch_sizes: Batch sizes for the train, validation and test loader (in that order)
    Output:
        train_dataloader, val_dataloader, test_dataloader
    """

    #Load and clean every split (order: train, val, test)
    cleaned_splits = [clean_data(split) for split in load_data()]

    #Features data
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    examples_per_split = [
        convert_examples_to_features(texts, labels, max_seq_length, tokenizer)
        for texts, labels in cleaned_splits
    ]

    #Data loaders
    train_dataloader, val_dataloader, test_dataloader = [
        get_dataloader(examples, batch_sizes[position])
        for position, examples in enumerate(examples_per_split)
    ]

    return train_dataloader, val_dataloader, test_dataloader



Using TensorFlow backend.


## Models

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import numpy as np

from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForSequenceClassification

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import numpy as np

class BertLinear(nn.Module):
    """BERT encoder followed by a three-layer feed-forward head on the pooled output."""

    def __init__(self, hidden_dim, dropout, output_dim):
        """
        Args:
            hidden_dim: Size of the hidden layers of the head
            dropout: Dropout probability
            output_dim: Output dimension (number of labels)
        """

        super().__init__()
        self.output_dim = output_dim

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.dropout = nn.Dropout(dropout)
        self.relu = nn.LeakyReLU()

        #768 is the hidden size of bert-base-uncased
        self.linear1 = nn.Linear(768, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return one score per label for every example (labels is accepted but not used)."""

        #Only the pooled output is used by this head
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)

        hidden = self.dropout(self.relu(self.linear1(pooled_output)))
        hidden = self.relu(self.linear2(hidden))

        return self.relu(self.linear3(hidden))

class BertLSTM(nn.Module):
    """BERT encoder followed by a bidirectional LSTM and a linear output layer."""

    def __init__(self, hidden_dim, dropout, output_dim):
        """
        Args:
            hidden_dim: Hidden size of the LSTM (per direction)
            dropout: Dropout probability
            output_dim: Output dimension (number of labels)
        """

        super().__init__()
        self.output_dim = output_dim

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.dropout = nn.Dropout(dropout)

        #768 is the hidden size of bert-base-uncased
        self.lstm = nn.LSTM(768, hidden_dim, bidirectional=True)
        #Both directions are concatenated, hence hidden_dim * 2
        self.output = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return one score per label for every example (labels is accepted but not used)."""

        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)

        #Swap the first two axes because the LSTM is not batch_first
        _, (hidden_state, _) = self.lstm(sequence_output.transpose(0, 1))

        #Join the final hidden states at index 0 and 1 along the feature axis
        final_state = torch.cat([hidden_state[0], hidden_state[1]], dim=1)

        return self.output(self.dropout(final_state))

## Train

In [None]:
import argparse
import logging
import os

import numpy as np
import torch
import torch.optim as optim
from tqdm import tqdm, trange

from utils import accuracy_recall_precision_f1

import pandas as pd

def train_model(model, optimizer, loss_fn, dataloader, device, use_bert):
    """Train the model for one pass over the dataloader
    Args:
        model: Model (torch.nn.Module)
        optimizer: Optimizer for parameters of the model (torch.optim)
        loss_fn: Loss function that computes the loss for each batch based on the y_pred and y_target
        dataloader: Dataloader that generates batches of data and labels or in case of BERT input_ids, input_mask, segment_ids and label_ids
        device: Device run either on GPU or CPU
        use_bert: (bool) Whether batches and the model call follow the BERT layout
    Returns:
        dict with 'loss' (mean batch loss), 'accuracy', and the per-class
        arrays 'recall', 'precision' and 'f1', all rounded to 2 decimals
    Raises:
        ValueError: if the dataloader yields no batches
    """

    #Set model in training mode
    model.train()

    epoch_loss = 0.0
    #Logits and targets are kept so the metrics can be computed once over the
    #whole epoch. Averaging per-batch precision/recall/f1 weights a small last
    #batch like a full one and breaks when a batch contains a single class.
    epoch_logits = []
    epoch_targets = []

    for batch in tqdm(dataloader, desc="Iteration"):

        #Step 0: Get batch
        batch = tuple(t.to(device) for t in batch)

        #Step 1: Clear the gradients
        optimizer.zero_grad()

        #Step 2: Compute the forward pass of the model (model output)
        if use_bert:
            input_ids, input_mask, segment_ids, y_target = batch
            y_pred = model(input_ids, segment_ids, input_mask, labels=None)
        else:
            X, y_target = batch
            y_target = y_target.long()
            y_pred = model(X)

        #Step 3: Compute the loss
        loss = loss_fn(y_pred, y_target)
        epoch_loss += loss.item()

        #Step 4: Propagate the loss backward
        loss.backward()

        #Step 5: Use optimizer to take gradient step
        optimizer.step()

        epoch_logits.append(y_pred.detach().cpu())
        epoch_targets.append(y_target.detach().cpu())

    if not epoch_logits:
        raise ValueError("dataloader yielded no batches")

    #Compute other metrics over all examples of the epoch
    accuracy, recall, precision, f1 = accuracy_recall_precision_f1(
        torch.cat(epoch_logits), torch.cat(epoch_targets))

    #Train results
    results = {
        'loss': np.round(epoch_loss / len(epoch_logits), 2),
        'accuracy': np.round(float(accuracy), 2),
        'recall': np.round(recall, 2),
        'precision': np.round(precision, 2),
        'f1': np.round(f1, 2)
    }

    return results

## Evaluate

In [None]:
import argparse
import logging
import os

import numpy as np
import torch
import torch.optim as optim
from tqdm import tqdm, trange

from utils import accuracy_recall_precision_f1

import pandas as pd

def evaluate_model(model, optimizer, loss_fn, dataloader, device, use_bert):
    """Evaluate the model on one pass over the dataloader (no parameter updates)
    Args:
        model: Model (torch.nn.Module)
        optimizer: Not used during evaluation; kept so the signature matches train_model
        loss_fn: Loss function that computes the loss for each batch based on the y_pred and y_target
        dataloader: Dataloader that generates batches of data and labels or in case of BERT input_ids, input_mask, segment_ids and label_ids
        device: Device run either on GPU or CPU
        use_bert: (bool) Whether batches and the model call follow the BERT layout
    Returns:
        dict with 'loss' (mean batch loss), 'accuracy', and the per-class
        arrays 'recall', 'precision' and 'f1', all rounded to 2 decimals
    Raises:
        ValueError: if the dataloader yields no batches
    """

    #Set model in evaluate mode
    model.eval()

    epoch_loss = 0.0
    #Logits and targets are kept so the metrics can be computed once over the
    #whole dataset. Averaging per-batch precision/recall/f1 weights a small
    #last batch like a full one and breaks when a batch contains a single class.
    epoch_logits = []
    epoch_targets = []

    with torch.no_grad():

        for batch in tqdm(dataloader, desc="Iteration"):

            #Step 0: Get batch
            batch = tuple(t.to(device) for t in batch)

            #Step 1: Compute the forward pass of the model (model output)
            if use_bert:
                input_ids, input_mask, segment_ids, y_target = batch
                y_pred = model(input_ids, segment_ids, input_mask, labels=None)
            else:
                X, y_target = batch
                y_target = y_target.long()
                y_pred = model(X)

            #Step 2: Compute the loss
            epoch_loss += loss_fn(y_pred, y_target).item()

            epoch_logits.append(y_pred.cpu())
            epoch_targets.append(y_target.cpu())

    if not epoch_logits:
        raise ValueError("dataloader yielded no batches")

    #Compute other metrics over all examples
    accuracy, recall, precision, f1 = accuracy_recall_precision_f1(
        torch.cat(epoch_logits), torch.cat(epoch_targets))

    #Evaluation results
    results = {
        'loss': np.round(epoch_loss / len(epoch_logits), 2),
        'accuracy': np.round(float(accuracy), 2),
        'recall': np.round(recall, 2),
        'precision': np.round(precision, 2),
        'f1': np.round(f1, 2)
    }

    return results

## Main

In [None]:
import json
import logging
import os
import shutil

import numpy as np
import pandas as pd
import logging

import click

import torch
from torch import nn
from torch.nn import functional as F
import torch.optim as optim
from torch.nn import CrossEntropyLoss, MSELoss
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)

from tqdm import tqdm, trange

from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

from train import train_model
from evaluate import evaluate_model
from utils import accuracy_recall_precision_f1, save_checkpoint, load_checkpoint
from data_loader import get_data, get_data_bert
import models

import warnings
warnings.filterwarnings('ignore')

#Sacred
#Sources
#https://github.com/gereleth/kaggle-telstra/blob/master/Automatic%20model%20tuning%20with%20Sacred%20and%20Hyperopt.ipynb
#https://github.com/maartjeth/sacred-example-pytorch
from sacred import Experiment
from sacred.observers import FileStorageObserver
from sacred.observers import MongoObserver
from sacred.observers import SlackObserver
from sacred.utils import apply_backspaces_and_linefeeds

# NOTE(review): within the visible code these constants are only referenced by
# the commented-out MongoObserver line below.
EXPERIMENT_NAME = 'experiment'
DATABASE_NAME = 'experiments'
URL_NAME = 'mongodb://localhost:27017/'

# Sacred experiment; run records are stored on disk under 'results'.
ex = Experiment()
ex.observers.append(FileStorageObserver.create('results'))
#ex.observers.append(MongoObserver.create(url=URL_NAME, db_name=DATABASE_NAME))
ex.captured_out_filter = apply_backspaces_and_linefeeds

#Send a message to Slack if the run is successful or if it failed
# (the observer is configured from the local file 'slack.json')
slack_obs = SlackObserver.from_config('slack.json')
ex.observers.append(slack_obs)

#Device: use the GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def log_scalars(results, name_dataset):

    """Log scalars of the results for MongoDB and Omniboard
    Args:
        results: Results with the loss, accuracy, recall, precision and f1-score;
            the last three are per-class sequences indexed by encoded class id
        name_dataset: The name of the dataset so it can store the scalars by name
    """

    # encode_label uses a LabelEncoder, which assigns ids in sorted label
    # order: 'NOT' -> 0 and 'OFF' -> 1. The per-class metrics are ordered by
    # that id, so OFF is read from index 1 and NOT from index 0.
    # NOTE(review): assumes every data path encodes labels via encode_label.
    not_id, off_id = 0, 1

    ex.log_scalar(name_dataset+'.loss', float(results['loss']))
    ex.log_scalar(name_dataset+'.accuracy', float(results['accuracy']))

    for metric in ('recall', 'precision', 'f1'):
        ex.log_scalar(name_dataset+'.'+metric+'.OFF', float(results[metric][off_id]))
        ex.log_scalar(name_dataset+'.'+metric+'.NOT', float(results[metric][not_id]))


@ex.capture
def train_and_evaluate(num_epochs, model, optimizer, loss_fn, train_dataloader, val_dataloader, early_stopping_criteria, directory, use_bert, use_mongo):

    """Train on training set and evaluate on evaluation set
    Args:
        num_epochs: Number of epochs to run the training and evaluation
        model: Model
        optimizer: Optimizer
        loss_fn: Loss function
        train_dataloader: Dataloader for the training set
        val_dataloader: Dataloader for the validation set
        early_stopping_criteria: Number of consecutive epochs without a lower
            validation loss after which training stops
        directory: Directory path name where the best checkpoint is stored
        use_bert: (bool) Forwarded to train_model / evaluate_model
        use_mongo: (bool) Whether the epoch results are logged with log_scalars

    Returns train and evaluation metrics with epoch, loss, accuracy, recall, precision and f1-score
    """

    metric_columns = ['epoch', 'loss', 'accuracy', 'recall', 'precision', 'f1']
    train_metrics = pd.DataFrame(columns=metric_columns)
    val_metrics = pd.DataFrame(columns=metric_columns)

    def _as_row(epoch, results):
        #One table row per epoch
        return {'epoch':epoch, 'loss':results['loss'], 'accuracy':results['accuracy'], 'recall':results['recall'], 'precision':results['precision'], 'f1':results['f1']}

    best_val_loss = float("inf")

    early_stop_step = 0

    for epoch in trange(num_epochs, desc="Epoch"):

        ### TRAINING ###
        train_results = train_model(model, optimizer, loss_fn, train_dataloader, device, use_bert)
        train_metrics.loc[len(train_metrics)] = _as_row(epoch, train_results)
        if use_mongo: log_scalars(train_results, "Train")

        ### EVALUATION ###
        val_results = evaluate_model(model, optimizer, loss_fn, val_dataloader, device, use_bert)
        val_metrics.loc[len(val_metrics)] = _as_row(epoch, val_results)
        if use_mongo: log_scalars(val_results, "Validation")

        #Save best state / early stopping.
        #A single comparison drives both, so a NaN loss counts as "no
        #improvement" instead of resetting the counter and replacing the best loss.
        if val_results['loss'] < best_val_loss:
            save_checkpoint({'epoch': epoch+1,
                             'state_dict': model.state_dict(),
                             'optim_dict': optimizer.state_dict()},
                            directory=directory,
                            checkpoint='best_model.pth.tar')
            best_val_loss = val_results['loss']
            early_stop_step = 0
        else:
            early_stop_step += 1
            print("Early stop step:", early_stop_step)

        if early_stop_step >= early_stopping_criteria:
            print("Stopping early at epoch {}".format(epoch))
            return train_metrics, val_metrics

        print('\n')
        print('Train Loss: {} | Train Acc: {}'.format(train_results['loss'], train_results['accuracy']))
        print('Valid Loss: {} | Valid Acc: {}'.format(val_results['loss'], val_results['accuracy']))
        print('Train recall: {} | Train precision: {}'.format(train_results['recall'], train_results['precision']))
        print('Valid recall: {} | Valid precision: {}'.format(val_results['recall'], val_results['precision']))

    return train_metrics, val_metrics


@ex.config
def config():

    """Default hyperparameters; every name assigned here matches a parameter of main()."""

    # NOTE(review): `ex` is defined outside this chunk. This looks like a Sacred
    # config scope, where each local becomes a config entry -- confirm.
    output_dim = 2 #Number of labels (default=2)
    train_bs = 32.0 #Train batch size (default=32); stored as a float, main() casts it with int()
    val_bs = 32.0 #Validation batch size (default=32); stored as a float, main() casts it with int()
    test_bs = 32.0  #Test batch size (default=32); stored as a float, main() casts it with int()
    num_epochs = 100 #Number of epochs (default=100)
    max_seq_length = 45 #Maximum sequence length of the sentences (set to 45 here; an earlier note said default=40)
    learning_rate = 3e-5 #Learning rate for the Adam optimizer (default=3e-5)
    warmup_proportion = 0.1 #Warmup proportion (default=0.1); accepted by main() but not referenced in its body
    early_stopping_criteria = 50 #Early stopping criteria (set to 50 here; an earlier note said default=5)
    num_layers = 2 #Number of layers, passed to the LSTM/LSTMAttention models (default=2)
    hidden_dim = 128 #Hidden layers dimension (default=128)
    bidirectional = False #Bidirectional flag passed to the LSTM/LSTMAttention models
    dropout = 0.1 #Dropout value passed to the model constructors (a fraction, not a percentage)
    filter_sizes = [2, 3, 4] #Filter sizes passed to the CNN model
    embedding_file = 'data/GloVe/glove.twitter.27B.200d.txt' #Embedding file passed to get_data()
    model_name = "MLP" #Model name: MLP, MLP_Features, CNN, LSTM, LSTMAttention, BERT, BERTLinear, BERTLSTM
    use_mongo = True #When True, main() logs the test metrics through log_scalars()

@ex.automain
def main(output_dim,
        train_bs,
        val_bs,
        test_bs,
        num_epochs,
        max_seq_length,
        learning_rate,
        warmup_proportion,
        early_stopping_criteria,
        num_layers,
        hidden_dim,
        bidirectional,
        dropout,
        filter_sizes,
        embedding_file,
        model_name,
        use_mongo,
        _run):
    """Train the selected model, evaluate it on the test set and return a summary.

    Steps: build the data loaders, instantiate the model named by
    ``model_name``, train it with Adam and cross-entropy through
    ``train_and_evaluate``, reload ``best_model.pth.tar`` from the run
    directory, evaluate on the test loader and write the train/validation/test
    metrics as CSV files into ``results/<run id>/``.

    Args:
        output_dim: Number of labels, forwarded to the model constructor.
        train_bs, val_bs, test_bs: Batch sizes; cast to ``int`` before use.
        num_epochs: Number of epochs forwarded to ``train_and_evaluate``.
        max_seq_length: Maximum sequence length; cast to ``int`` before use.
        learning_rate: Learning rate of the Adam optimizer.
        warmup_proportion: Accepted for interface compatibility; not used here.
        early_stopping_criteria: Forwarded to ``train_and_evaluate``.
        num_layers: Layer count for the LSTM-based models.
        hidden_dim: Hidden dimension forwarded to the models that take one.
        bidirectional: Bidirectional flag for the LSTM-based models.
        dropout: Dropout value forwarded to the model constructors.
        filter_sizes: Filter sizes for the CNN model.
        embedding_file: Embedding file forwarded to ``get_data``.
        model_name: One of MLP, MLP_Features, CNN, LSTM, LSTMAttention, BERT,
            BERTLinear, BERTLSTM. Any name containing "BERT" selects the BERT
            data pipeline.
        use_mongo: When true, the test metrics are logged via ``log_scalars``;
            also forwarded to ``train_and_evaluate``.
        _run: Run object; only its ``_id`` attribute is read here.

    Returns:
        dict with the run id, the rounded mean of the validation losses, the
        test accuracy/recall/precision/f1, the learning rate, the hidden
        dimension and ``'status': 'ok'``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """

    #Logger
    #directory = f"results/checkpoints/{_run._id}/"
    directory = f"results/{_run._id}/"

    #Batch sizes
    batch_sizes = [int(train_bs), int(val_bs), int(test_bs)]
    batch_size = int(train_bs)

    #Any model whose name contains "BERT" uses the BERT data pipeline
    use_bert = "BERT" in model_name

    #Data
    if use_bert:
        train_dataloader, val_dataloader, test_dataloader = get_data_bert(int(max_seq_length), batch_sizes)
    else:
        embedding_dim, vocab_size, embedding_matrix, train_dataloader, val_dataloader, test_dataloader = get_data(int(max_seq_length), embedding_file=embedding_file, batch_size=batch_size)

    #Model (single if/elif chain so an unsupported name fails loudly)
    if model_name=="MLP":
        model = models.MLP(embedding_matrix, embedding_dim, vocab_size, int(hidden_dim), dropout, output_dim)
    elif model_name=="MLP_Features":
        #NOTE(review): 14 is hard-coded; presumably the number of hand-crafted features -- confirm
        model = models.MLP_Features(embedding_matrix, embedding_dim, vocab_size, int(hidden_dim), 14, dropout, output_dim)
    elif model_name=="CNN":
        model = models.CNN(embedding_matrix, embedding_dim, vocab_size, dropout, filter_sizes, output_dim)
    elif model_name=="LSTM":
        model = models.LSTM(embedding_matrix, embedding_dim, vocab_size, int(hidden_dim), dropout, int(num_layers), bidirectional, output_dim)
    elif model_name=="LSTMAttention":
        model = models.LSTMAttention(embedding_matrix, embedding_dim, vocab_size, int(hidden_dim), dropout, int(num_layers), bidirectional, output_dim)
    elif model_name=="BERT":
        model = BertForSequenceClassification.from_pretrained("bert-base-uncased", output_dim)
    elif model_name=="BERTLinear":
        model = models.BertLinear(hidden_dim, dropout, output_dim)
    elif model_name=="BERTLSTM":
        model = models.BertLSTM(hidden_dim, dropout, output_dim)
    else:
        #Previously an unknown name surfaced later as a NameError on `model`
        raise ValueError("Unsupported model_name: {!r}".format(model_name))

    #Print the architecture for every model (MLP used to be the only one skipped)
    print(model)

    model = model.to(device)

    #Loss and optimizer
    #optimizer = optim.Adam([{'params': model.parameters(), 'weight_decay': 0.1}], lr=learning_rate)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = F.cross_entropy

    #Scheduler
    #scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[5, 50], gamma=0.1)

    #Training and evaluation
    print('Training and evaluation for {} epochs...'.format(num_epochs))
    train_metrics, val_metrics = train_and_evaluate(num_epochs, model, optimizer, loss_fn, train_dataloader, val_dataloader, early_stopping_criteria, directory, use_bert, use_mongo)
    train_metrics.to_csv(directory+"train_metrics.csv")
    val_metrics.to_csv(directory+"val_metrics.csv")

    #Test
    print('Testing...')
    #Restore the best checkpoint written during training into `model`
    load_checkpoint(directory+"best_model.pth.tar", model)

    #Add artifacts
    #ex.add_artifact(directory+"best_model.pth.tar")
    #ex.add_artifact(directory+"last_model.pth.tar")

    test_metrics = evaluate_model(model, optimizer, loss_fn, test_dataloader, device, use_bert)
    if use_mongo:
        log_scalars(test_metrics,"Test")

    #Two index rows, "NOT" and "OFF" -- presumably one per class label; confirm
    test_metrics_df = pd.DataFrame(test_metrics, index=["NOT","OFF"])
    print(test_metrics)
    test_metrics_df.to_csv(directory+"test_metrics.csv")

    run_id = f'{_run._id}'

    results = {
        'id': run_id,
        #Mean of the validation losses recorded in val_metrics, rounded to 4 decimals
        'loss': np.round(np.mean(val_metrics['loss']), 4),
        'accuracy': test_metrics['accuracy'],
        'recall': test_metrics['recall'],
        'precision': test_metrics['precision'],
        'f1': test_metrics['f1'],
        'learning_rate': learning_rate,
        'hidden_dim': hidden_dim,
        'status': 'ok'
    }

    return results