In [1]:
import numpy as np
import pandas as pd
import os
from key_words_extract import *

In [2]:
# Global configuration for the notebook.
NB_CATEGORIES = 51
PADDING = 100  # used as the padded sequence length by CustomModel

# Locations of the training inputs and of the expected answers.
dataFolder = './challenge_data'
xPath = os.path.join(dataFolder, 'input_train.csv')
yPath = os.path.join(
    dataFolder,
    'challenge_output_data_training_file_predict_the_expected_answer.csv',
)

# Both files are ';'-separated; only the column at index 1 is kept and
# flattened into a 1-D array.
x, y = (
    pd.read_csv(csvPath, sep=';', usecols=[1]).values.ravel()
    for csvPath in (xPath, yPath)
)

In [3]:
# Extract keywords and their weights with k=10 (presumably 10 per category).
# Pass toVector=False to get a dictionary {cat: [keywords]} instead.
keywords, key_weights = keyWordsExtract(x, y, k=10, toVector=True)

In [8]:
# Display the weights returned by keyWordsExtract (a flat list of floats).
key_weights

[0.20588235294117646,
 0.18137254901960784,
 0.17647058823529413,
 0.13235294117647059,
 0.13235294117647059,
 0.12745098039215685,
 0.10294117647058823,
 0.088235294117647065,
 0.083333333333333329,
 0.078431372549019607,
 0.17391304347826086,
 0.13043478260869565,
 0.086956521739130432,
 0.086956521739130432,
 0.086956521739130432,
 0.086956521739130432,
 0.086956521739130432,
 0.086956521739130432,
 0.043478260869565216,
 0.043478260869565216,
 0.36363636363636365,
 0.22727272727272727,
 0.18181818181818182,
 0.18181818181818182,
 0.18181818181818182,
 0.13636363636363635,
 0.13636363636363635,
 0.13636363636363635,
 0.090909090909090912,
 0.090909090909090912,
 0.375,
 0.375,
 0.25,
 0.25,
 0.125,
 0.125,
 0.125,
 0.125,
 0.125,
 0.125,
 0.14473684210526316,
 0.11842105263157894,
 0.078947368421052627,
 0.078947368421052627,
 0.065789473684210523,
 0.065789473684210523,
 0.065789473684210523,
 0.052631578947368418,
 0.039473684210526314,
 0.039473684210526314,
 0.15686274509803921,

In [7]:
class CustomModel():
    '''Generic workflow class.

    Bundles text tokenization/padding, label encoding, keyword extraction
    and thin wrappers around ``self.model`` (fit / evaluate / predict).

    ``self.model`` is not created by this class: it must be assigned
    (e.g. by a subclass) before ``train``, ``evaluate`` or ``predict``
    is called.
    '''

    def __init__(self, **kwargs):
        '''Reads the configuration from keyword arguments and builds the tokenizer.

        Keyword arguments:
            nbCategories: number of target classes (required, KeyError if absent).
            verbose: stored on the instance, defaults to False.
            trainable: stored on the instance, defaults to False.
        '''
        self.verbose = kwargs.get('verbose', False)

        self.nbCategories = kwargs['nbCategories']
        self.paddingLength = PADDING
        # A vocabulary size is a count, so keep it an int (was the float 1e5).
        self.maxNumberWords = 100000
        self.trainable = kwargs.get('trainable', False)

        self.tokenizer = text.Tokenizer(
            num_words=self.maxNumberWords,
            filters="!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'´’™©®«»",
            split=" "
        )

    def preprocess(self, x, fit=True):
        '''Turns sentences into padded word sequences.

        Args:
            x: collection of texts to encode.
            fit: when True (default, the historical behaviour) the tokenizer
                is fitted on ``x`` before encoding. Pass False to encode
                with the vocabulary learnt by a previous call, e.g. for
                validation or test data.

        Returns:
            The sequences padded to ``self.paddingLength``.
        '''
        if fit:
            self.tokenizer.fit_on_texts(x)
        sequences = self.tokenizer.texts_to_sequences(x)
        return sequence.pad_sequences(sequences, self.paddingLength)

    def preprocessLabels(self, labels):
        '''One-hot encodes ``labels`` over ``self.nbCategories`` classes.'''
        return to_categorical(labels, num_classes=self.nbCategories)

    def extract_top_k(self, x, y, k, stop_words=None):
        '''Collects, for every category, the first ``k`` non stop-word entries.

        Args:
            x: questions, forwarded to ``selectCat``.
            y: labels; the categories are counted from ``y[:, 1]``, so a
                2-D array is expected.
            k: maximum number of vocabulary entries kept per category.
            stop_words: words to ignore. Defaults to the module-level
                ``STOP_WORDS``, looked up at call time.

        Returns:
            A list with one sublist per category, each holding at most ``k``
            entries of that category's vocabulary (per the original note:
            (keyword, nb_occurences) tuples). The result is also stored in
            ``self.best_vocab``.
        '''
        if stop_words is None:
            stop_words = STOP_WORDS

        best_vocab = []
        nb_cat = len(set(y[:, 1]))
        for cat in np.arange(nb_cat):
            questions = selectCat(x, y, cat)[0][:, 1]
            cat_vocab = vectorizeVocabulary(corpus=questions)
            cat_best_vocab = []
            for w in cat_vocab:
                # Stop scanning as soon as enough words have been collected.
                if len(cat_best_vocab) >= k:
                    break
                if w[0] not in stop_words:
                    cat_best_vocab.append(w)
            best_vocab.append(cat_best_vocab)

        # Assigned after the loop so the attribute exists even when there
        # are no categories at all.
        self.best_vocab = best_vocab
        return best_vocab

    def train(self, x, y, epochs=10, batch_size=32, validation_data=None,
              callback=False):
        '''Fits ``self.model`` on ``(x, y)``.

        Args:
            x, y: training inputs and targets, passed straight to ``fit``.
            epochs: number of epochs.
            batch_size: batch size.
            validation_data: forwarded to ``fit``.
            callback: when truthy, a ``ModelCheckpoint`` saving the best
                weights under ``models_checkpoints/`` is attached. The
                checkpoint monitors ``val_acc``, so ``validation_data``
                should be supplied in that case. Defaults to False
                (previously this was read from an undefined name).
        '''
        fit_kwargs = dict(shuffle='batch', epochs=epochs,
                          batch_size=batch_size,
                          validation_data=validation_data)

        if callback:
            filepath = 'models_checkpoints/weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5'
            checkpoint = ModelCheckpoint(filepath, monitor='val_acc',
                                         verbose=1, save_best_only=True, mode='max')
            fit_kwargs['callbacks'] = [checkpoint]

        self.model.fit(x, y, **fit_kwargs)

    def evaluate(self, x, y):
        '''Returns ``self.model.evaluate(x, y)``.'''
        return self.model.evaluate(x, y)

    def predict(self, x):
        '''Returns ``self.model.predict(x)``.'''
        return self.model.predict(x)