In [1]:
from sklearn.model_selection import LeaveOneOut,KFold, StratifiedKFold
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import plot_precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
from matplotlib import pyplot as plt
#from google.colab import drive
from keras import backend as K
from random import shuffle
import itertools as tools
import tensorflow as tf
import pickle as pkl
from os import walk
import pandas as pd
import numpy as np
import ast
import gc

%matplotlib inline
#drive.mount('/content/gdrive', force_remount=True)
# Root directory for the notebook's artefacts. Other cells build paths by plain
# string concatenation (dataset CSV, tokenizer pickle, GloVe vectors, saved
# models), so the trailing slash is required.
dataset_path = './public/processed data/csv data/'

Using TensorFlow backend.


### Helper functions 


Pre-processor: includes all functions that handle the tokenization and word-embedding parts of preprocessing.



In [None]:
class Preprocessor:
    '''
        Text pre-processing helper: owns the Keras tokenizer, the pre-trained
        embedding matrix built for the tokenizer vocabulary, and the padding
        settings used to turn sentences into fixed-length integer sequences.
    '''
    
    def __init__(self, embidding_dims= 100, max_sequence_length = 100, padding_type='post'):
        '''
            -- inputs:
                embidding_dims: length of each word vector (must match the embeddings file)
                max_sequence_length: length every tokenized sentence is padded/truncated to
                padding_type: 'pre' or 'post', forwarded to pad_sequences
        '''
        self.tokenizer = None
        self.embeddings_matrix = None
        self.embedding_dim = embidding_dims
        self.max_sequence_length = max_sequence_length
        self.padding_type = padding_type
        
    def __make_tokenizer(self, text, oov_token='<OOV>'):
        '''
            make tokenizer from sentences and store it on the instance
            
            -- inputs:
                text: texts to fit the tokenizer on
                oov_token: out of vocabulary token 
            -- returns:
                None (the fitted tokenizer is stored in self.tokenizer)
        '''
        self.tokenizer = Tokenizer(oov_token=oov_token)
        self.tokenizer.fit_on_texts(text)    
    
    def load_tokenizer(self, file_path):
        '''
            load tokenizer from pickle file
            
            -- inputs:
                file_path: path to the file of tokenizer 
            -- returns:
                None (on failure a message is printed and self.tokenizer is left unchanged)
        '''
        # NOTE: unpickling executes arbitrary code; only load tokenizer files you created yourself.
        try:
            with open(file_path, 'rb') as tokenizer_file:
                self.tokenizer = pkl.load(tokenizer_file)
        except  Exception  as e :
            print(f'could not load tokenizer from path: {file_path}\n{e}')
    
    
    def make_tokenizer(self, dataset_path, save_tokenizer=False, tokenizer_file_path=None):
        '''
            fit a tokenizer on the 'data' column of a CSV dataset
            
            -- inputs:
                dataset_path: path to the CSV file holding a 'data' column
                save_tokenizer: when True, pickle the fitted tokenizer
                tokenizer_file_path: destination of the pickle file (used only when save_tokenizer is True)
            -- returns:
                None
        '''
        df = pd.read_csv(f'{dataset_path}', sep=',')
        
        # __make_tokenizer stores the fitted tokenizer on self and returns None,
        # so its result must not be assigned back to self.tokenizer.
        self.__make_tokenizer(np.array(df['data']))

        if save_tokenizer == True:
            try:
                with open(f'{tokenizer_file_path}', 'wb') as tokenizer_file:
                    pkl.dump(self.tokenizer, tokenizer_file)
            except Exception  as e :
                print(f'could not save tokenizer to path: {tokenizer_file_path}\n{e}')
    
    def make_embeddings(self, path_to_embeddings= dataset_path + 'glove.6B.100d.txt'):
        '''
            build the embedding matrix for the tokenizer vocabulary
            
            -- inputs:
                path_to_embeddings: text file with one "<word> <v1> ... <vn>" entry per line
            -- returns:
                None (the matrix is stored in self.embeddings_matrix; row i holds the
                vector of the word with tokenizer index i, unknown words stay all-zero)
        '''
        if self.tokenizer is None:
            print('could not create embeddings matrix from empty tokenizer')
            return

        word_index = self.tokenizer.word_index
        # Row 0 is reserved for padding, hence vocab_size + 1 rows.
        embeddings_matrix = np.zeros((len(word_index) + 1, self.embedding_dim))

        # Fill the matrix while streaming the file so that only vocabulary words
        # are kept in memory instead of the whole embeddings table.
        with open(f'{path_to_embeddings}', encoding="utf8") as f:
            for line in f:
                values = line.split()
                if not values:
                    continue
                row = word_index.get(values[0])
                if row is not None:
                    embeddings_matrix[row] = np.asarray(values[1:], dtype='float32')

        self.embeddings_matrix = embeddings_matrix
        
    def tokenize_data(self, paragraphs):          
        '''
            convert texts to padded integer sequences
            
            -- inputs:
                paragraphs: list of texts to tokenize
            -- returns:
                array with one row of max_sequence_length token ids per input text
        '''
        sequence = self.tokenizer.texts_to_sequences(paragraphs)
        padded_sequence = pad_sequences(sequence, padding=self.padding_type,maxlen=self.max_sequence_length)
        return padded_sequence        
        
        

### Data transformer: includes all functions that handle dataset manipulations


*   generate n permutations of a file
*   remove non-ASCII characters (plus periods, commas and newlines) from a text line
*   read a file from a directory
*   generate paragraph-label pairs from the data directory
*   generate paragraphs (cliques) of n sentences from a file
*   write every labeled clique to its own file
*   build a CSV dataset from the separate clique files








In [None]:
class DataTransformer:
    '''
        Dataset manipulation helpers: read and sanitize documents, create
        shuffled (non-coherent) variants, cut documents into cliques of
        consecutive sentences and export the result as separate files / CSV.
    '''
    
    def __init__(self):
        pass
    
    def __permute_file(self, file_lines, size=10):
        '''
            returns the original ordering followed by random re-orderings of a file

            -- inputs:
                file_lines: list of file lines
                size: number of permutations to return, other than the original one
            -- returns:
                list whose first element is the original ordering, followed by up to
                `size` distinct orderings that all differ from the original
        '''
        from math import factorial

        file_lines = np.array(file_lines)
        line_count = file_lines.shape[0]
        original_order = tuple(range(line_count))

        # n lines only have n! - 1 orderings that differ from the original;
        # asking for more would make the sampling loop below spin forever.
        size = min(size, factorial(line_count) - 1)

        indeces = list(original_order)
        shuffled_set = set() 
        while len(shuffled_set) < size:
            shuffle(indeces)
            candidate = tuple(indeces)
            # the untouched ordering must never be emitted as a "permutation",
            # otherwise a coherent document gets labeled as non-coherent
            if candidate != original_order:
                shuffled_set.add(candidate)

        permuted_lines = [file_lines]
        for idx in shuffled_set:
            permuted_lines.append(file_lines[list(idx)])

        return permuted_lines

    def __remove_non_ascii(self, text):
        '''
            drops every non-ASCII character as well as '.', ',' and newlines

            -- inputs:
                text: string to sanitize
            -- returns:
                sanitized string ('.' and ',' are removed because they are later
                used as sentence and CSV separators)
        '''
        return ''.join([i if ord(i) < 128 and i not in ['.', '\n', ','] else '' for i in text])

    def __read_file(self, file_path, skip_first_token=False):
        '''
            reads the lines from file

            -- inputs: 
                file_path: full path to the file to be permuted
                skip_first_token: boolean used to sanitize the inputs from the DUC dataset
                    (drops everything up to and including the first space of each line)
            -- returns:
                list of the sanitized file lines
        '''
        with open(file_path, 'r') as file:
            if skip_first_token == True:
                return [ self.__remove_non_ascii(line[line.find(' ') + 1:]) for line in file ]
            return [ self.__remove_non_ascii(line) for line in file ]

    def __get_file_labeled_permutations(self, file_path, permutations=20):
        '''
            generate permutations of the file, label the original one as coherent, all other permutations are non-coherent

            -- inputs: 
                file_path: full path to the file to be permuted
                permutations: number of shuffled variants requested per file
            -- returns: 
                iterator of tuples of structure: (lines permutation, label: being 1 for coherent, 0 for non-coherent)
        '''
        file_lines = self.__read_file(file_path, skip_first_token=True)
        permuted_lines = self.__permute_file(file_lines, size=permutations)
        labels = [1] + [0] * (len(permuted_lines) - 1)
        return zip(permuted_lines, labels)

    @staticmethod
    def generate_file_cliques(file_lines, size=3):

        '''
        divides the file into cliques (sliding windows) of `size` consecutive lines

        -- inputs: 
            file_lines: list of file lines to be split
            size: size of cliques 
        -- returns:
            list of cliques generated; empty when the file has fewer than `size` lines
        '''
        # "+ 1" so that the window ending on the last line is included as well
        return [
            list(file_lines[start:start + size])
            for start in range(len(file_lines) - size + 1)
        ]
    
    def generate_separate_files_method(self, dataset_path, clique_size = 3, output_dir='./processed data/separate-files/'):
        '''
            writes every labeled clique of every document under dataset_path to its own file

            file structure is: a single line "<label>. <sentence>. <sentence>. ..." where
                label is 1 for coherent documents, 0 for non coherent

            -- inputs:
                dataset_path: directory holding the raw documents
                clique_size: number of consecutive sentences per clique
                output_dir: directory receiving the generated "<counter>.txt" files
            -- returns:
                None
        '''
        import os

        os.makedirs(output_dir, exist_ok=True)
        count = 0
        print(f'reading files from {dataset_path}')
        for (dir_path, _, file_names) in walk(dataset_path):
            for file_name in file_names:
                # join with the directory being walked so nested files resolve correctly
                file_path = os.path.join(dir_path, file_name)
                for file_lines, label in self.__get_file_labeled_permutations(file_path):
                    for clique in DataTransformer.generate_file_cliques(file_lines, size=clique_size):
                        with open(os.path.join(output_dir, f'{count}.txt'),'w') as file:
                            file.write(f'{label}. ')
                            for line in clique:
                                file.write(f'{line}. ')
                        count += 1
                        
    def generate_csv_dataset_from_separate_files (self, dataset_path, file_name):
        '''
            collects the separate clique files into one CSV with columns data,label

            -- inputs:
                dataset_path: directory holding the clique files; the CSV is written here too
                file_name: name of the CSV file to create (an existing file is overwritten)
            -- returns:
                None (names of files that do not hold a label plus three sentences are printed)
        '''
        import os

        csv_path = f'{dataset_path}{file_name}'
        # 'w' instead of 'a': appending would repeat the header row and all rows on re-runs
        with open(csv_path, 'w') as csv_file:
            csv_file.write('data,label\n')

            for (dir_path, _, clique_file_names) in walk(dataset_path):
                for clique_file_name in clique_file_names:
                    clique_path = os.path.join(dir_path, clique_file_name)
                    # the CSV lives in the walked directory; never read it back as input
                    if os.path.abspath(clique_path) == os.path.abspath(csv_path):
                        continue
                    with open(clique_path, 'r') as clique_file:
                        file_lines = clique_file.readline().split('.')[:-1]
                    if len(file_lines) != 4:
                        print(clique_file_name)
                    else:
                        for line in file_lines[1:]:
                            csv_file.write(f'{line}. ')
                        csv_file.write(f',{file_lines[0]}\n')


### Model: includes all model related functionalities



Similarity matrix:
A trainable matrix M that captures the similarity between two sentences according to the equation $\mathrm{sim}(X_f, X_s) = X_f^{T} M X_s$:

 ![similarity function](https://drive.google.com/uc?id=1y_ojFiDHkrbOwi7LEXrGo2UTXo4Qbv5o)

where: 
*   Xf: first sentence
*   Xs: second sentence
*   M: similarity matrix (trainable weights)






In [None]:
class SimilarityMatrix(tf.keras.layers.Layer):
    '''
        Bilinear similarity layer: for a pair of sentence feature tensors
        (xf, xs) it computes xf . M . xs^T with a trainable matrix M.
    '''

    def __init__(self,dims, **kwargs):
        '''
            -- inputs:
                dims: (length, width) of M; length must equal the feature size of the
                      first sentence, width the feature size of the second sentence
        '''
        super(SimilarityMatrix, self).__init__(**kwargs)
        self.dims_length, self.dims_width = dims

    def build(self, input_shape):
        
        # Create a trainable weight variable for this layer.
        self._m = self.add_weight(name='M', 
                                    shape=(self.dims_length,self.dims_width),
                                    initializer='uniform',
                                    trainable=True)
        super(SimilarityMatrix, self).build(input_shape)  # Be sure to call this at the end

    def call(self, y): 
        '''
            -- inputs:
                y: pair [xf, xs] of rank-3 tensors shaped (batch, rows, features)
            -- returns:
                tensor shaped (batch, rows of xf, rows of xs) holding the similarity scores
        '''
        xf, xs = y
        sim1 = tf.matmul(xf, self._m)
        # transpose_b swaps only the last two axes of xs, so every sample is
        # compared with its own second sentence. A full-axis transpose followed by
        # a reshape would mix feature values across the samples of a batch.
        return tf.matmul(sim1, xs, transpose_b=True)

    def compute_output_shape(self, input_shape):
        first_shape, second_shape = input_shape
        return (first_shape[0], first_shape[1], second_shape[1])

    def get_config(self):
        # 'dims' mirrors the constructor argument so the layer can be rebuilt from its config
        config = super().get_config().copy()
        config.update({
            'dims': [self.dims_length, self.dims_width]
        })
        return config

    @classmethod
    def from_config(cls, config):
        config = dict(config)
        # accept configs saved with the legacy 'dims_length' / 'dims_width' keys
        if 'dims' not in config:
            config['dims'] = (config.pop('dims_length'), config.pop('dims_width'))
        return cls(**config)

Model Helper: includes all training related functions

In [None]:
class ModelHelper:
    '''
        Stateless training / evaluation utilities shared by the notebook:
        loss function, ROC and precision-recall plots, compilation and the
        k-fold / single-run training loops.
    '''
  
    @staticmethod
    def negative_log_likelihood(y_true, y_pred):
        
        '''
          Calculates negative log likelihood

          -- inputs: 
              y_true: ground truth  values
              y_pred: predicted probabilities
          -- returns:
              binary cross-entropy summed over the last axis
        '''
        return K.sum(K.binary_crossentropy(y_true, y_pred), axis=-1)

    @staticmethod
    def plot_ROC(y_true, y_predictions, title=''):

        '''
          Plot ROC curve
          
          -- inputs: 
              y_true: ground truth  values
              y_predictions: non categorical predicted values
              title: (optional) title for the plot
          -- returns:
              None
        '''

        ## calculate the FPR, TPR, Thresholds and AUC value
        false_pos_rate, true_pos_rate, thresholds = roc_curve(y_true, y_predictions)
        auc_val = auc(false_pos_rate, true_pos_rate)

        ## plot ROC curve
        plt.figure(1)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.plot(false_pos_rate, true_pos_rate, label=f'{title}' +' (area = {:.3f})'.format(auc_val))
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.title('ROC curve')
        plt.legend(loc='best')
        plt.show()


    @staticmethod
    def plot_precision_recall(model, X_data, y_true):
      
        '''
          Plot precision-recall curve

          -- inputs: 
              model: model for which curve is plotted
              X_data: features to predict 
              y_true: ground truth  values
          -- returns:
              None
        '''
        pred = model.predict(X_data).ravel()

        average_precision = average_precision_score(y_true, pred)

        # The curve is computed from the predicted scores directly:
        # sklearn's plot_precision_recall_curve only accepts scikit-learn
        # estimators (not Keras models) and was removed in scikit-learn 1.2.
        precision, recall, _ = precision_recall_curve(np.ravel(y_true), pred)

        plt.figure()
        plt.step(recall, precision, where='post')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('binary Precision-Recall curve: ' + 'AP={0:0.2f}'.format(average_precision))
        plt.show()

    @staticmethod
    def compile_model(model , loss_func, monitor_metrics = None, optimizer='adam'):
      
        '''
            Compile model
            
            -- inputs: 
                model: model to compile
                loss_func: loss function to be used
                monitor_metrics: (optional) metrics to be monitored, ['acc'] when omitted
                optimizer: (optional) optimizer to use, adam is the default
            -- returns:
                None
        '''
        # None instead of a list default: a mutable default would be shared between calls
        if monitor_metrics is None:
            monitor_metrics = ['acc']
        model.compile(optimizer=optimizer, loss=loss_func, metrics=monitor_metrics)   

    @staticmethod
    def train_model_kfolds(data, model_class, loss_func, num_of_folds, verbose=2, batch_size=128, plot_roc = False, plot_prec_recall = False ):
        
        '''
            Train and evaluate a fresh model on every fold of a stratified k-fold split

            -- inputs: 
                data: tuple (features, labels)
                model_class: class exposing make_model() and a .model attribute
                loss_func: loss function used to compile every fold's model
                num_of_folds: number of folds
                verbose: (optional) verbosity forwarded to model.fit
                batch_size: (optional) batch size for fit / evaluate
                plot_roc: (optional) plot the ROC curve of every fold
                plot_prec_recall: (optional) plot the precision-recall curve of every fold
            -- returns:
                None (per-fold loss, accuracy and average precision are printed)
        '''
        # NOTE(review): early stopping monitors the same fold that is reported as
        # test data below, so the printed scores are optimistic.
        model_callbacks = [
            tf.keras.callbacks.EarlyStopping(patience=2),
        ]
        
        X_data, y_data = data[0].astype(np.float32), data[1].astype(np.float32)
        
        splitter = StratifiedKFold(n_splits=num_of_folds, shuffle=True, random_state=42)

        # labels are stored as a column vector; the splitter expects a flat array
        for count, (train_index, test_index) in enumerate(splitter.split(X_data, y_data.ravel())):
            
            X_train, X_test = X_data[train_index], X_data[test_index]
            
            y_train, y_test = y_data[train_index], y_data[test_index]
            
            model = model_class()
            model.make_model()
            model = model.model

            # honour the caller's loss instead of a hard-coded one
            ModelHelper.compile_model(model, loss_func)

            model.fit(X_train,y_train,validation_data=(X_test,y_test),verbose=verbose,epochs=20, batch_size=batch_size, callbacks=model_callbacks)
            

            pred = model.predict(X_test).ravel()
            
            loss, acc = model.evaluate(X_test, y_test, batch_size=batch_size)

            print(f'fold #{count+1} test loss: {loss}, test acc: {acc}')

            average_precision = average_precision_score(y_test, pred)

            print('Average precision-recall score: {0:0.2f}'.format(average_precision))
            
            if plot_roc:
                ModelHelper.plot_ROC(y_test, pred, 'test data')
            if plot_prec_recall:
                ModelHelper.plot_precision_recall(model, X_test, y_test)  

            # release the memory of the fold's model before the next one is built
            gc.collect()

    @staticmethod
    def train_model(data, model, loss_func, epoches=100, verbose=1, batch_size=128, early_stop = False):
        
        '''
            Compile and train a model, checkpointing it after every epoch

            -- inputs: 
                data: tuple (features, labels)
                model: keras model to train
                loss_func: loss function to be used
                epoches: (optional) maximum number of epochs
                verbose: (optional) verbosity forwarded to model.fit
                batch_size: (optional) batch size
                early_stop: (optional) stop when the validation loss stalls for 2 epochs
            -- returns:
                None
        '''
        model_callbacks = [
            tf.keras.callbacks.ModelCheckpoint(filepath=dataset_path+'models/model.{epoch:03d}-{val_loss:.4f}.h5'),
        ]

        if early_stop:
            model_callbacks.append( 
                tf.keras.callbacks.EarlyStopping(patience=2)
            )
        
        X_data, y_data = data[0].astype(np.float32), data[1].astype(np.float32)
        

        ModelHelper.compile_model(model, loss_func)

        # the last 15% of the samples are held out for validation
        model.fit(X_data,y_data,verbose=verbose,epochs=epoches, batch_size=batch_size, callbacks=model_callbacks, validation_split=0.15)




Model: includes all the CNN related functions
The `make_model` function builds a convolutional neural network following the architecture suggested by the paper, as shown in the figure below.

![cnn model architecture](https://drive.google.com/uc?id=118Olwuh9VL5_Rt_IBg6G5JfuT2KEl-Z5)

In [None]:
class Model:
    '''
        Wraps the three-sentence coherence CNN: owns the preprocessor, the
        loaded data and the keras model built by make_model().
    '''
    
    def __init__(self):
      
        '''
          Set up empty data/model slots and a ready-to-use preprocessor
          (tokenizer loaded from disk, embedding matrix built from it)
            
          -- inputs: 
              None
          -- returns:
              None
        '''
        self.num_of_folds = 5
        self.dataset = None
        self.data = None
        self.model = None
        self.test_data = None
        
        self.preprocessor = Preprocessor()
        self.preprocessor.load_tokenizer(file_path = dataset_path + 'tokenizer1.pkl')
        self.preprocessor.make_embeddings()
    
    def make_dataset(self):

        '''
          Make tensorflow eager dataset object from the loaded data to model

          -- inputs: 
              None
          -- returns:
              None
        '''

        # identity check: self.data may hold arrays, for which "== None" is not a plain boolean
        if self.data is None:
            print('cannot create dataset from empty data object, please load data first then create the dataset iterator')
        
        else:
            X_data, y_data = self.data[0], self.data[1]
            
            def generator():
                for train_index, test_index in KFold(n_splits=self.num_of_folds).split(X_data):
                    X_train, X_test = X_data[train_index], X_data[test_index]
                    y_train, y_test = y_data[train_index], y_data[test_index]
                    yield X_train,y_train,X_test,y_test

            # NOTE(review): features are declared as tf.string, while load_data_from_csv
            # stores padded integer sequences - confirm the intended element types.
            self.dataset =  tf.data.Dataset.from_generator(generator, (tf.string,tf.int64,tf.string,tf.int64))
    
    

    def make_model(self):

        '''
          Make keras CNN model and store it in self.model

          -- inputs: 
              None
          -- returns:
              None
        '''

        if self.preprocessor.tokenizer is None or self.preprocessor.embeddings_matrix is None:
            raise RuntimeError('tokenizer / embeddings are not loaded, cannot build the model')

        sequence_length = self.preprocessor.max_sequence_length
        kernel_size = 3
        feature_maps = 100
        # a "valid" convolution shortens the sequence by kernel_size - 1 steps;
        # pooling over the whole remainder keeps one value per feature map
        pool_size = sequence_length - kernel_size + 1

        X_input =  tf.keras.Input(shape=(3, sequence_length), name="input-sentences")
        
        
        embedding_layer = tf.keras.layers.Embedding(input_dim= len(self.preprocessor.tokenizer.word_index)+1, 
                                                    output_dim=self.preprocessor.embedding_dim, 
                                                    input_length=sequence_length,
                                                    trainable = False,
                                                    name='glove-embedding-layer')
        # the layer has to be built before the pre-trained weights can be assigned
        embedding_layer.build((None,))
        embedding_layer.set_weights([self.preprocessor.embeddings_matrix])
        
        first_sentence =  embedding_layer(X_input[:,0,:])
        second_sentence =  embedding_layer(X_input[:,1,:])
        third_sentence =  embedding_layer(X_input[:,2,:])
        
        # one shared convolution so that all three sentences use the same filters
        convolutional_filters_map = tf.keras.layers.Conv1D(feature_maps,kernel_size=kernel_size, activation='relu', use_bias=True, name='features-map')
        
        Xf = convolutional_filters_map(first_sentence)
        Xs = convolutional_filters_map(second_sentence)         
        Xt = convolutional_filters_map(third_sentence)   

        Xf = tf.keras.layers.MaxPool1D(pool_size, name='first-sentence-pool')(Xf)
        Xs = tf.keras.layers.MaxPool1D(pool_size, name='second-sentence-pool')(Xs)
        Xt = tf.keras.layers.MaxPool1D(pool_size, name='third-sentence-pool')(Xt)

        # one shared similarity matrix for both adjacent sentence pairs
        similarity_fnc = SimilarityMatrix((feature_maps,feature_maps))

        sim_fs = similarity_fnc([Xf, Xs])
        sim_st = similarity_fnc([Xs, Xt])

        X = tf.keras.layers.concatenate([Xf, sim_fs, Xs, sim_st, Xt])

        ## TODO: this architecture requires grid search hyper-parameters tuning
        X = tf.keras.layers.Dense(256, activation='relu', name='fc1', use_bias=True)(X)
        X = tf.keras.layers.Dropout(0.333)(X)

        X = tf.keras.layers.Dense(512, activation='relu', name='fc2', use_bias=True)(X)
        X = tf.keras.layers.Dropout(0.333)(X)

        X = tf.keras.layers.Dense(512, activation='relu', name='fc3', use_bias=True)(X)
        X = tf.keras.layers.Dropout(0.333)(X)

        X = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(X)

        self.model = tf.keras.Model(inputs=[X_input], outputs=[X])
    
    def load_data_from_csv(self, data_path, separator=',', split_train_test=False, make_balanced=False):
        
        '''
            load data from CSV file and store it as (features, labels) arrays in self.data
            
            -- inputs:
                data_path: path to file where data is saved (columns: data, label)
                separator (optional): value seprator to the file, default is comma
                split_train_test (optional): keep 20% of the samples aside in self.test_data
                make_balanced (optional): replicate the coherent (label 1) samples until
                    they are about as frequent as the non-coherent ones
            -- returns:
                None
        '''
        
        frame = pd.read_csv(f'{data_path}', sep=separator)
        # every sample holds three '.'-separated sentences; each one is tokenized and padded
        frame['data'] = frame['data'].apply(
            lambda text: self.preprocessor.tokenize_data(text.strip().split('.')[:3])
        )

        if make_balanced:
            label_counts = frame['label'].value_counts()
            coherent_count = int(label_counts.get(1, 0))
            incoherent_count = int(label_counts.get(0, 0))
            # the original rows already count once, hence the "- 1"
            extra_copies = incoherent_count // coherent_count - 1 if coherent_count else 0

            # NOTE(review): oversampling happens before any train/test or k-fold split,
            # so copies of one sample can land on both sides of the split.
            if extra_copies > 0:
                df_coherent = frame.loc[frame['label'] == 1]
                frame = pd.concat([df_coherent] * extra_copies + [frame], ignore_index=True)
        
        features = np.array(frame['data'].values.tolist())
        labels = np.array(frame['label'].values.tolist()).reshape(-1,1)

        if split_train_test:
            X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
            self.data = (X_train, y_train)
            self.test_data = ( X_test, y_test)  
        else:
            self.data = (features, labels)

      
    

### Training over the imbalanced dataset




In [None]:
# Building the wrapper loads the pickled tokenizer and creates the GloVe
# embedding matrix (see Model.__init__); no data is loaded yet.
m = Model()

In [None]:
# Load the dataset as-is (no oversampling, no train/test split) into m.data.
m.load_data_from_csv(dataset_path+'dataset.csv')

In [None]:
# Label distribution: first row lists the label values, second row their counts.
unique_elements, counts_elements = np.unique(m.data[1], return_counts=True)
print(np.asarray((unique_elements, counts_elements)))

In [None]:
# Stratified k-fold cross-validation on the imbalanced data; a fresh Model is built per fold.
ModelHelper.train_model_kfolds(m.data, Model, ModelHelper.negative_log_likelihood, m.num_of_folds, plot_roc=True)

In [None]:
# Reload the dataset with the coherent samples replicated (make_balanced=True)
# and print the resulting label distribution.
m.load_data_from_csv(dataset_path+'dataset.csv', make_balanced=True)
unique_elements, counts_elements = np.unique(m.data[1], return_counts=True)
print(np.asarray((unique_elements, counts_elements)))

In [None]:
# Same cross-validation as before, now on the data currently held in m.data (the balanced reload).
ModelHelper.train_model_kfolds(m.data, Model, ModelHelper.negative_log_likelihood, m.num_of_folds, plot_roc=True)

### Train the actual model

In [None]:
# Build the CNN and store it in m.model.
m.make_model()

In [None]:
# Train on everything in m.data (15% is held out for validation inside train_model);
# a checkpoint is written under dataset_path + 'models/' after every epoch.
ModelHelper.train_model(m.data, m.model, ModelHelper.negative_log_likelihood, batch_size=256)

In [None]:
# Persist the trained model in HDF5 format.
saveloc = dataset_path + "tmp/model_3_sentence_clique_1.h5"
m.model.save(saveloc)

In [None]:
# Cross-validation run on m.data; each fold trains a freshly built Model, so
# the model trained above (m.model) is not involved.
ModelHelper.train_model_kfolds(m.data, Model, ModelHelper.negative_log_likelihood, m.num_of_folds, plot_roc=True)

In [None]:
# Print the layer-by-layer summary of the model.
m.model.summary()

In [None]:
# Render the model graph (with tensor shapes) to a PNG file.
tf.keras.utils.plot_model(m.model, 'multi_input_and_output_model.png', show_shapes=True)