In [None]:
# Build a model based on keyword similarity
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# French stop words that are skipped when ranking the most frequent terms
# of each category. Order is preserved from the original hand-written list;
# the last six entries were appended after the alphabetical part.
STOP_WORDS = [
    "alors", "au", "aucuns", "aussi", "autre", "avant", "avec", "avoir",
    "bon", "car", "ce", "cela", "ces", "ceux", "chaque", "ci",
    "comme", "comment", "dans", "des", "du", "dedans", "dehors", "depuis",
    "devrait", "doit", "donc", "dos", "début", "elle", "elles", "en",
    "encore", "essai", "est", "et", "eu", "fait", "faites", "fois",
    "font", "hors", "ici", "il", "ils", "je", "juste", "la",
    "le", "les", "leur", "là", "ma", "maintenant", "mais", "mes",
    "mine", "moins", "mon", "mot", "même", "ni", "nommés", "notre",
    "nous", "ou", "où", "par", "parce", "pas", "peut", "peu",
    "plupart", "pour", "pourquoi", "quand", "que", "quel", "quelle", "quelles",
    "quels", "qui", "sa", "sans", "ses", "seulement", "si", "sien",
    "son", "sont", "sous", "soyez", "sujet", "sur", "ta", "tandis",
    "tellement", "tels", "tes", "ton", "tous", "tout", "trop", "très",
    "tu", "voient", "vont", "votre", "vous", "vu", "ça", "étaient",
    "état", "étions", "été", "être",
    "de", "un", "une", "ai", "ne", "on",
]

def vectorizeVocabulary(corpus, verbose=False, density=False):
    """Tokenize ``corpus`` and return its vocabulary ranked by usage.

    Args:
        corpus: Iterable of raw text documents, passed as-is to
            ``CountVectorizer.fit_transform``.
        verbose: When true, print the shapes of the intermediate count
            structures and the five top-ranked entries.
        density: Forwarded unchanged to ``parseWord``.
            NOTE(review): presumably switches between raw counts and
            relative frequencies -- confirm against ``parseWord``.

    Returns:
        The list of values produced by ``parseWord`` (one per vocabulary
        term), sorted in descending order of their element at index 1.
    """
    # Generate word tokens
    countVectorizer = CountVectorizer(input='content')
    countVector = countVectorizer.fit_transform(corpus)

    # Sum over documents (axis 0): one total per vocabulary column.
    wordCount = np.sum(countVector, axis=0)
    totalWordCount = np.sum(wordCount)

    # ``vocabulary_`` maps each term to its column index; every
    # (term, index) pair is handed to parseWord together with the totals.
    # A comprehension is used instead of map(partial(...)) so the function
    # does not depend on ``functools.partial`` being imported elsewhere.
    vocabulary = [
        parseWord(wordCount, totalWordCount, density, item)
        for item in countVectorizer.vocabulary_.items()
    ]

    # Sort words by usage
    sortedVocabulary = sorted(vocabulary, key=lambda entry: entry[1],
                              reverse=True)

    if verbose:
        print("countVector.shape: {}".format(str(countVector.shape)))
        print("wordCount.shape: {}".format(str(wordCount.shape)))
        print(sortedVocabulary[:5])

    return sortedVocabulary

def selectCat(x, y, cat_index):
    """Select the entries of ``x`` whose label in ``y`` equals ``cat_index``.

    ``x`` and ``y`` are walked in lockstep. For every pair whose label row
    has ``cat_index`` at position 1, the value at position 0 of the matching
    ``x`` row is collected; the collected values are then used to index
    ``x``.

    NOTE(review): this assumes position 0 of each ``x`` row holds that row's
    own integer position in ``x`` -- confirm against the data loader.

    Args:
        x: Indexable collection of rows (NumPy-style fancy indexing is used).
        y: Collection of label rows, aligned with ``x``.
        cat_index: Category value to look for at ``y[i][1]``.

    Returns:
        ``x`` indexed by the collected positions, or an empty slice of ``x``
        when no label matches.
    """
    matching = [row[0] for row, label in zip(x, y) if label[1] == cat_index]

    if not matching:
        # np.array([]) is float64 and cannot be used as an index, so an
        # empty category would raise IndexError; return an empty view.
        return x[:0]

    return x[np.array(matching)]

def extract_top_k(x, y, k, stop_words = STOP_WORDS):
    """Collect, for each category, the ``k`` top-ranked non-stop-word entries.

    Categories are enumerated as ``0 .. n-1`` where ``n`` is the number of
    distinct values in column 1 of ``y``.
    NOTE(review): this assumes category ids are contiguous and start at 0.

    Args:
        x: Question data, forwarded to ``selectCat``.
        y: Label array; column 1 holds the category of each row.
        k: Maximum number of vocabulary entries kept per category.
        stop_words: Words to leave out of the ranking.

    Returns:
        A list with one sub-list per category. Each sub-list holds at most
        ``k`` entries as produced by ``vectorizeVocabulary``, in ranking
        order, whose first element is not a stop word.
    """
    # Hash-based membership instead of scanning the list for every word.
    excluded = frozenset(stop_words)

    best_vocab = []
    nb_cat = len(set(y[:, 1]))
    for cat in np.arange(nb_cat):
        questions = selectCat(x, y, cat)[0][:, 1]
        cat_vocab = vectorizeVocabulary(corpus=questions)

        cat_best_vocab = []
        for w in cat_vocab:
            if len(cat_best_vocab) >= k:
                # Quota reached: no need to walk the rest of the vocabulary.
                break
            if w[0] not in excluded:
                cat_best_vocab.append(w)
        best_vocab.append(cat_best_vocab)
    return best_vocab

In [None]:
class CustomModel:
    '''Generic workflow class.

    Bundles text preprocessing, label encoding, keyword extraction and thin
    wrappers around ``self.model`` (fit / evaluate / predict).

    ``self.model`` is not created here; it must be assigned (for instance by
    a subclass) before ``train``, ``evaluate`` or ``predict`` is called.
    '''

    def __init__(self, **kwargs):
        '''Store the configuration and create the tokenizer.

        Keyword Args:
            nbCategories: Number of target categories (required).
            verbose: Verbosity flag, defaults to False.
            trainable: Stored as ``self.trainable``, defaults to False.
        '''
        self.verbose = kwargs.get('verbose', False)

        self.nbCategories = kwargs['nbCategories']
        self.paddingLength = PADDING
        # A vocabulary size is a count: keep it an int rather than the
        # float literal 1e5.
        self.maxNumberWords = int(1e5)
        self.trainable = kwargs.get('trainable', False)

        self.tokenizer = text.Tokenizer(
            num_words=self.maxNumberWords,
            filters="!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'´’™©®«»",
            split=" "
        )

    def preprocess(self, x, fit=True):
        '''Turns sentences into padded word sequences.

        Args:
            x: Texts to convert.
            fit: When true (default, the historical behaviour) the tokenizer
                is fitted on ``x`` before converting. Pass ``fit=False`` for
                validation / inference data so the word index learned on the
                training texts is not modified.

        Returns:
            The sequences produced by the tokenizer, padded with
            ``self.paddingLength``.
        '''
        if fit:
            self.tokenizer.fit_on_texts(x)
        sequences = self.tokenizer.texts_to_sequences(x)
        return sequence.pad_sequences(sequences, self.paddingLength)

    def preprocessLabels(self, labels):
        '''Encode ``labels`` with ``to_categorical`` over ``self.nbCategories`` classes.'''
        return to_categorical(labels, num_classes=self.nbCategories)

    def extract_top_k(self, x, y, k, stop_words = STOP_WORDS):
        '''Return, per category, the ``k`` top-ranked non-stop-word entries.

        Delegates to the module-level ``extract_top_k`` so both entry points
        share one implementation, and caches the result on
        ``self.best_vocab``.

        Returns:
            A list with one sub-list per category, each holding at most
            ``k`` vocabulary entries.
        '''
        # Inside a method the bare name resolves to the module-level
        # function, not to this method.
        self.best_vocab = extract_top_k(x, y, k, stop_words)
        return self.best_vocab

    def train(self, x, y, epochs= 10, batch_size=32, validation_data=None,
              callback=None):
        '''Fit ``self.model`` on ``(x, y)``.

        Args:
            x: Training inputs.
            y: Training targets.
            epochs: Number of epochs.
            batch_size: Batch size.
            validation_data: Forwarded to ``fit``.
            callback: When truthy, a ``ModelCheckpoint`` monitoring
                ``val_acc`` is attached. When left to ``None`` the
                module-level name ``callback`` is used if it exists,
                otherwise checkpointing is disabled.
        '''
        if callback is None:
            # The flag used to be read from an implicit global, which raised
            # NameError when it was missing; keep honouring it if present.
            callback = globals().get('callback', False)

        callbacks_list = None
        if callback:
            filepath= 'models_checkpoints/weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5'
            # NOTE(review): the file name and the monitor both rely on a
            # ``val_acc`` metric being reported -- confirm validation data
            # is always supplied when checkpointing is enabled.
            checkpoint = ModelCheckpoint(filepath, monitor='val_acc',
                                         verbose=1, save_best_only=True, mode='max')
            callbacks_list = [checkpoint]

        self.model.fit(x, y, shuffle='batch', epochs=epochs,
                       batch_size=batch_size, validation_data=validation_data,
                       callbacks=callbacks_list)

    def evaluate(self, x, y):
        '''Return ``self.model.evaluate(x, y)``.'''
        return self.model.evaluate(x, y)

    def predict(self, x):
        '''Return ``self.model.predict(x)``.'''
        return self.model.predict(x)