In [1]:
# Required Python utilities
import numpy as np
import pandas as pd

from collections import Counter
import re
from langdetect import detect
from bs4 import BeautifulSoup
from markdown import markdown
from lxml import etree
import os
import random
import tqdm
import itertools 
import pickle

from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

## Deep Learning imports for the classifiers
os.environ['KERAS_BACKEND']='theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Concatenate
from keras.models import Model

##Â ML required imports (for clustering)
from sklearn import metrics
from sklearn.decomposition import PCA, LatentDirichletAllocation
from sklearn.preprocessing import scale, StandardScaler
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN


# Topic modeling imports
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

##Â NLP related imports
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer


# visualization imports
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import base64
import io
%matplotlib inline
sns.set() 

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
imported_notebook = False ## Set this flag to true if importing this code from another notebook

In [3]:
GLOVE_DIR = "Data/Iterative-models-building/Training data/"
GATHERED_DATA_FOLDER = "Data/Iterative-models-building/Gathered_data/Conventions/"

In [4]:
## Classificaiton NETWORKs Configuration parameters
MAX_SEQUENCE_LENGTH = 40
MAX_NB_WORDS = 10000
EMBEDDING_DIM = 100 ## 100, 200 or 300
VALIDATION_SPLIT = 0.2

NUM_EPOCHS = 20

In [5]:
def create_tokenizer(df, max_words=MAX_NB_WORDS):
    texts = df['text'].values
    
    _tokenizer = Tokenizer(num_words=max_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
    _tokenizer.fit_on_texts(texts)
    
    return _tokenizer 

In [6]:
def read_glove_embeddings():
    ## Reading GLOVE (precalculated word embeddings)
    embeddings_index = {}
    f = open(os.path.join(GLOVE_DIR, 'glove.6B.{}d.txt'.format(EMBEDDING_DIM)))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    
    return embeddings_index

embeddings_index = read_glove_embeddings()
    


In [7]:
## Calculate AUC
## As explained here: https://stackoverflow.com/questions/41032551/how-to-compute-receiving-operating-characteristic-roc-and-auc-in-keras
class roc_callback(Callback):
    def __init__(self,validation_data):#training_data,validation_data):
        #self.x = training_data[0]
        #self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]


    def on_train_begin(self, logs={}):
        return

    def on_train_end(self, logs={}):
        return

    def on_epoch_begin(self, epoch, logs={}):
        return

    def on_epoch_end(self, epoch, logs={}):
        #y_pred = self.model.predict(self.x)
        #roc = roc_auc_score(self.y, y_pred)
        y_pred_val = self.model.predict(self.x_val)
        roc_val = roc_auc_score(self.y_val, y_pred_val)
        #print('\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc,4)),str(round(roc_val,4))),end=100*' '+'\n')
        print("Roc-AUC Validation: {}".format(str(round(roc_val,4))))
        return

    def on_batch_begin(self, batch, logs={}):
        return

    def on_batch_end(self, batch, logs={}):
        return

In [8]:
def get_model_matches_proba(sequences, model):
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    preds = model.predict(data)

    return preds[:,1]


def calculate_matches(sentences, _models, _tokenizers):
    _repos_matches = {}

    ## Getting classification confidence per model for each repo
    for model_key in _models.keys():
            
        tokenized_sentences = _tokenizers[model_key].texts_to_sequences(sentences)
        preds1 = get_model_matches_proba(tokenized_sentences, _models[model_key])
        
        """
        preds2 = get_model_matches_proba(tokenized_sentences, _models[model_key])
        
        data = pad_sequences(tokenized_sentences, maxlen=MAX_SEQUENCE_LENGTH)
        preds2 = _models[model_key].predict(data)

        _repos_matches[model_key] = [preds1, preds2]
        """
        _repos_matches[model_key] = preds1
    return _repos_matches

In [9]:
def train_DL_model(x_train, y_train, x_val, y_val, tokenizer, num_epochs=NUM_EPOCHS):
    
    word_index = tokenizer.word_index
    
    print('\nNumber of elements from each class in traing and validation set ')
    print(y_train.sum(axis=0))
    print(y_val.sum(axis=0))


    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    

    convs = []
    filter_sizes = [3,4,5]

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    for fsz in filter_sizes:
        l_conv = Conv1D(nb_filter=128,filter_length=fsz,activation='relu')(embedded_sequences)
        l_pool = MaxPooling1D(5)(l_conv)
        convs.append(l_pool)

    l_merge = Concatenate(axis=1)(convs)#

    l_cov1= Conv1D(128, 5, activation='relu')(l_merge)
    l_pool1 = MaxPooling1D(5)(l_cov1)

    l_flat = Flatten()(l_pool1)
    l_dense = Dense(128, activation='relu')(l_flat)
    preds = Dense(2, activation='softmax')(l_dense)
    
    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])


    """
    # train a 1D convnet with global maxpooling
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    x = Conv1D(128, 5, activation='relu')(embedded_sequences)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(128, activation='relu')(x)
    preds = Dense(len(labels_index), activation='softmax')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    """
    
    print("model fitting - more complex convolutional neural network")
    model.summary()
    train_history = model.fit(x_train, y_train, validation_data=(x_val, y_val),epochs=num_epochs, batch_size=50, callbacks=[roc_callback(validation_data=(x_val, y_val))])

    return model,tokenizer,x_val,y_val, train_history


In [10]:
def train_DL_model_not_validation(x_train, y_train, tokenizer, num_epochs=NUM_EPOCHS):
    
    word_index = tokenizer.word_index
    
    print('\nNumber of elements from each class in traing and validation set ')
    print(y_train.sum(axis=0))
    print(y_val.sum(axis=0))

    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)


    convs = []
    filter_sizes = [3,4,5]

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    for fsz in filter_sizes:
        l_conv = Conv1D(nb_filter=128,filter_length=fsz,activation='relu')(embedded_sequences)
        l_pool = MaxPooling1D(5)(l_conv)
        convs.append(l_pool)

    l_merge = Concatenate(axis=1)(convs)#

    l_cov1= Conv1D(128, 5, activation='relu')(l_merge)
    l_pool1 = MaxPooling1D(5)(l_cov1)

    l_flat = Flatten()(l_pool1)
    l_dense = Dense(128, activation='relu')(l_flat)
    preds = Dense(2, activation='softmax')(l_dense)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])

    print("model fitting - more complex convolutional neural network")
    model.summary()
    train_history = model.fit(x_train, y_train,epochs=num_epochs, batch_size=50)
    return model,tokenizer,train_history


In [11]:
def train_DL_models(df_train,
                    data_class_column="convention", 
                    data_label_column="label",
                   df_val=None,
                    tokenizer=None,
                   random_seed=None,
                   use_validation=True):
    
    
    _models = {}
    _tokenizers = {}
    _data_val_x = {}
    _data_val_y = {}
    _train_histories = {}
    if tokenizer is None:        
        if df_val is None:
            tokenizer = create_tokenizer(df_train)
        else:
            tokenizer = create_tokenizer(pd.concat([df_train, df_val]))

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    
    for convention in df[data_class_column].unique():
        
        print("----------------------------------------------------------------")
        print("            {}                  ".format(convention))
        print("----------------------------------------------------------------")
        
        tmp_df_train = df_train[df_train[data_class_column] == convention]
        if use_validation:
            if df_val is None:

                texts = tmp_df_train['text'].values
                labels = tmp_df_train[data_label_column].values

                sequences = tokenizer.texts_to_sequences(texts)


                data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

                labels = to_categorical(np.asarray(labels))
                print('Shape of data tensor:', data.shape)
                print('Shape of label tensor:', labels.shape)

                indices = np.arange(data.shape[0])
                if random_seed is not None:
                    np.random.seed(random_seed)
                np.random.shuffle(indices)
                data = data[indices]
                labels = labels[indices]
                nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])



                x_train = data[:-nb_validation_samples]
                y_train = labels[:-nb_validation_samples]
                x_val = data[-nb_validation_samples:]
                y_val = labels[-nb_validation_samples:]


                _model, _tokenizer, _x_val, _y_val, _train_h = train_DL_model(x_train, y_train, x_val, y_val, tokenizer)
            else:

                tmp_df_val = df_val[df_val[data_class_column] == convention]

                train_texts = tmp_df_train['text'].values
                train_labels = tmp_df_train[data_label_column].values

                val_texts = tmp_df_val['text'].values
                val_labels = tmp_df_val[data_label_column].values

                train_sequences = tokenizer.texts_to_sequences(train_texts)
                val_sequences = tokenizer.texts_to_sequences(val_texts)



                x_train = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
                x_val = pad_sequences(val_sequences, maxlen=MAX_SEQUENCE_LENGTH)

                y_train = to_categorical(np.asarray(train_labels))
                y_val = to_categorical(np.asarray(val_labels))

                _model, _tokenizer, _x_val, _y_val, _train_h = train_DL_model(x_train, y_train, x_val, y_val, tokenizer)
        
            _models[convention] = _model
            _tokenizers[convention] = _tokenizer
            _data_val_x[convention] = _x_val
            _data_val_y[convention] = _y_val
            _train_histories[convention] = _train_h
        else:
            texts = tmp_df_train['text'].values
            labels = tmp_df_train[data_label_column].values

            sequences = tokenizer.texts_to_sequences(texts)


            data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

            labels = to_categorical(np.asarray(labels))
            print('Shape of data tensor:', data.shape)
            print('Shape of label tensor:', labels.shape)

            
            x_train = data
            y_train = labels
        
            _model, _tokenizer, _train_h = train_DL_model_not_validation(x_train, y_train, tokenizer)

            _models[convention] = _model
            _tokenizers[convention] = _tokenizer
            _train_histories[convention] = _train_h
        

        print("\n\n\n")
        
    return (_models, _tokenizers, _data_val_x, _data_val_y, _train_histories)

In [12]:
if not imported_notebook:
    df = pd.read_csv("Data/Iterative-models-building/Training data/Conventions/training_aggregated_conventions.tsv", sep="\t")
    df = df[df['convention'] =='Industrial']
    
    def remove_line_breaks(text_str):
        text_str = text_str.replace("\\n", " ")
        return text_str
    
    df['text'] = df['text'].apply(remove_line_breaks)

In [13]:
df.head()

Unnamed: 0,label,text,provenance,convention
3224,1,"also, you can install drivers for various vm p...",Manually_gathered,Industrial
3225,1,will return a bar plot comparing the models on...,Manually_gathered,Industrial
3226,1,* expected result: 2.63% test error rate with ...,Manually_gathered,Industrial
3227,1,"if you know your terminal size, you can contro...",Manually_gathered,Industrial
3228,1,- to provide a range of non-functional feature...,Manually_gathered,Industrial


In [14]:
gathered_data_files = [f for f in os.listdir(GATHERED_DATA_FOLDER) 
                              if (os.path.isfile(os.path.join(GATHERED_DATA_FOLDER, f)) 
                                  and not f.startswith( '.' ) 
                                  and "gathered_" in f)]
gathered_data_files

['gathered_green_test.tsv',
 'gathered_s2_17-19_ki.tsv',
 'gathered_news_sentences.tsv',
 '_gathered_github_sentences.tsv',
 'gathered_github_sentences.tsv',
 'gathered_s2_17-19_ki_kw.tsv']

In [15]:
gathered_dfs = []

for f_name in gathered_data_files:
    tmp_df = pd.read_csv(os.path.join(GATHERED_DATA_FOLDER, f_name), sep="\t")
    tmp_df = tmp_df.rename(columns={"sentence": "text"})
    
    gathered_dfs.append(tmp_df)

gathered_dfs.append(df)##Adding training data sentences

In [16]:
extended_tokenizer = create_tokenizer(pd.concat(gathered_dfs))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [17]:
if not imported_notebook:
    models = train_DL_models(df,
                data_class_column="convention", 
                data_label_column="label",
                df_val=None,
                tokenizer=extended_tokenizer,
                random_seed=0, 
                use_validation=True)

Found 85688 unique tokens.
----------------------------------------------------------------
            Industrial                  
----------------------------------------------------------------
Shape of data tensor: (1738, 40)
Shape of label tensor: (1738, 2)

Number of elements from each class in traing and validation set 
[910. 481.]
[231. 116.]







model fitting - more complex convolutional neural network
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 40, 100)      8568900     input_1[0][0]                    
_____________________________________________________________________



Train on 1391 samples, validate on 347 samples
Epoch 1/20
Roc-AUC Validation: 0.8314
Epoch 2/20
Roc-AUC Validation: 0.9092
Epoch 3/20
Roc-AUC Validation: 0.9266
Epoch 4/20
Roc-AUC Validation: 0.9462
Epoch 5/20
Roc-AUC Validation: 0.9335
Epoch 6/20
Roc-AUC Validation: 0.9605
Epoch 7/20
Roc-AUC Validation: 0.9596
Epoch 8/20
Roc-AUC Validation: 0.9639
Epoch 9/20
Roc-AUC Validation: 0.9643
Epoch 10/20
Roc-AUC Validation: 0.9585
Epoch 11/20
Roc-AUC Validation: 0.9658
Epoch 12/20
Roc-AUC Validation: 0.9683
Epoch 13/20
Roc-AUC Validation: 0.9614
Epoch 14/20
Roc-AUC Validation: 0.9396
Epoch 15/20
Roc-AUC Validation: 0.9629
Epoch 16/20
Roc-AUC Validation: 0.9669
Epoch 17/20
Roc-AUC Validation: 0.9667
Epoch 18/20
Roc-AUC Validation: 0.9612
Epoch 19/20
Roc-AUC Validation: 0.9663
Epoch 20/20
Roc-AUC Validation: 0.9671






In [18]:
if not imported_notebook:
    texts = df['text'].values
    labels = df['label'].values

    indices = np.arange(len(texts))
    np.random.seed(0)
    np.random.shuffle(indices)
    texts = texts[indices]
    labels = labels[indices]
    nb_validation_samples = int(VALIDATION_SPLIT * len(texts))

    x_train = texts[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = texts[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]

    df_train = pd.DataFrame(columns=['text', 'label', 'convention'])
    df_train['text'] = x_train
    df_train['label'] = y_train
    df_train['convention'] = 'Industrial'


    df_val = pd.DataFrame(columns=['text', 'label', 'convention'])
    df_val['text'] = x_val
    df_val['label'] = y_val
    df_val['convention'] = 'Industrial'

    print("\n\n===========================")
    print("WITH VALIDATION!")
    print("===========================\n\n")
    models2 = train_DL_models(df_train,
                    data_class_column="convention", 
                    data_label_column="label",
                    df_val=df_val,
                    tokenizer=extended_tokenizer,
                    random_seed=0, use_validation=True)




WITH VALIDATION!


Found 85688 unique tokens.
----------------------------------------------------------------
            Industrial                  
----------------------------------------------------------------

Number of elements from each class in traing and validation set 
[910. 481.]
[231. 116.]




model fitting - more complex convolutional neural network
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 40, 100)      8568900     input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 38, 128)      38528       embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 37, 128)      51328       embedding_2[0][0]                
__________________________________

In [19]:
if not imported_notebook:
    models2_tokenizer = models2[1]['Industrial']
    val_seq = models2_tokenizer.texts_to_sequences(x_val)
    val_seq = pad_sequences(val_seq, maxlen=MAX_SEQUENCE_LENGTH)

    preds2 = models2[0]['Industrial'].predict(val_seq)
    print("Real label for samples classified with value 0")
    display(y_val[preds2[:,0]>preds2[:,1]])
    print("Real label for samples classified with value 1")
    display(y_val[preds2[:,0]<preds2[:,1]])

Real label for samples classified with value 0


array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

Real label for samples classified with value 1


array([1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0])

In [20]:
if not imported_notebook:
    print("\n\n===========================")    
    print("WITHOUT VALIDATION!")
    print("===========================\n\n")
    models3 = train_DL_models(df_train,
                    data_class_column="convention", 
                    data_label_column="label",
                    df_val=None,
                    tokenizer = models2_tokenizer,
                    random_seed=0, use_validation=False)




WITHOUT VALIDATION!


Found 85688 unique tokens.
----------------------------------------------------------------
            Industrial                  
----------------------------------------------------------------
Shape of data tensor: (1391, 40)
Shape of label tensor: (1391, 2)

Number of elements from each class in traing and validation set 
[910. 481.]
116




model fitting - more complex convolutional neural network
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 40, 100)      8568900     input_3[0][0]                    
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, 38, 128)      38528       embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_10 (Conv1D)              (None, 37, 128)      51328       embedding_3[0][0]                
__________________________________

In [21]:
if not imported_notebook:
    val_seq = models2_tokenizer.texts_to_sequences(x_val)
    val_seq = pad_sequences(val_seq, maxlen=MAX_SEQUENCE_LENGTH)

    preds2 = models2[0]['Industrial'].predict(val_seq)
    print("Real label for samples classified with value 0")
    display(y_val[preds2[:,0]>preds2[:,1]])
    print("Real label for samples classified with value 1")
    display(y_val[preds2[:,0]<preds2[:,1]])

Real label for samples classified with value 0


array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

Real label for samples classified with value 1


array([1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0])

In [22]:
if not imported_notebook:
    val_seq = models2_tokenizer.texts_to_sequences(x_val)
    val_seq = pad_sequences(val_seq, maxlen=MAX_SEQUENCE_LENGTH)

    preds2 = models2[0]['Industrial'].predict(val_seq)
    print("Real label for samples classified with value 0")
    display(y_val[preds2[:,0]>preds2[:,1]])
    print("Real label for samples classified with value 1")
    display(y_val[preds2[:,0]<preds2[:,1]])

Real label for samples classified with value 0


array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

Real label for samples classified with value 1


array([1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0])