In [None]:
import pandas as pd
import os
# Load the train/test tables (paths are relative to the notebook's working directory).
train = pd.read_csv('data-full/train.csv')
test = pd.read_csv('data-full/test.csv')

# Replace missing descriptions with the literal string "None" so that no row is NaN.
# The filled Series is assigned back to the column: calling fillna(inplace=True) on an
# attribute-accessed column is chained assignment, which is deprecated in pandas 2.x
# and leaves the parent frame unchanged under Copy-on-Write.
train['Description'] = train['Description'].fillna("None")
test['Description'] = test['Description'].fillna("None")



In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

import numpy as np
np.random.seed(2018)


def preprocess(text):
    """Tokenize ``text`` and drop uninformative tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess``. A token is
    kept only when it is not in gensim's ``STOPWORDS`` collection and is longer
    than three characters. No stemming or lemmatization is applied.

    Returns:
        list: the surviving tokens, in the order the tokenizer produced them.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if token not in stopwords and len(token) > 3
    ]


In [None]:
# Show one description before and after preprocessing as a sanity check.
doc_sample = train.Description[0]

print('original document: ')
# Splitting on single spaces already yields the word list; no manual append loop needed.
words = doc_sample.split(' ')
print(words)
# preprocess() only tokenizes and filters tokens; it performs no lemmatization,
# so the label must not claim the output is lemmatized.
print('\n\n tokenized and filtered document: ')
print(preprocess(doc_sample))

In [None]:
# Make sure no description is missing before tokenizing. The filled Series is
# assigned back to the column because fillna(inplace=True) on an attribute-accessed
# column is deprecated chained assignment that leaves the frame untouched under
# pandas Copy-on-Write.
train['Description'] = train['Description'].fillna("None")
test['Description'] = test['Description'].fillna("None")

# 'desc' holds, for every row, the list of tokens returned by preprocess().
train['desc'] = train['Description'].map(preprocess)
test['desc'] = test['Description'].map(preprocess)

In [None]:
import warnings
# Silence all warnings from here on (this runs before seaborn is imported).
warnings.filterwarnings("ignore")
import seaborn as sns

# Distribution of the number of kept tokens per training description.
# NOTE(review): distplot is deprecated in seaborn >= 0.11; consider histplot/displot
# when the environment is upgraded.
len_stat = train['desc'].map(len)
sns.distplot(len_stat)

In [None]:
# Distribution of the number of kept tokens per test description.
len_stat = test['desc'].map(len)
sns.distplot(len_stat)

In [None]:
# Stack the train and test token lists row-wise into one Series (train rows first).
text_data = pd.concat([train['desc'], test['desc']])


In [1]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers import Dropout, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras import Input
from keras import Model


Using TensorFlow backend.


In [2]:
## model1: text encoder mapping a padded token-id sequence to a 30-unit feature vector

# 486 is the padded sequence length and 10000 the embedding vocabulary size; they
# must match the `maxlen` given to pad_sequences and the Tokenizer's `num_words`.
model1_inputs = Input(shape=(486,))
net1 = Embedding(10000, 60, input_length=486)(model1_inputs)
net1 = SpatialDropout1D(0.2)(net1)
# The first LSTM returns the whole sequence so the second LSTM can consume it;
# the second returns only its last output.
net1 = LSTM(30, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)(net1)
net1 = LSTM(30, dropout=0.2, recurrent_dropout=0.2, return_sequences=False)(net1)
net1 = Dense(30, activation='relu')(net1)
# The model output is simply the last layer's tensor; no separate Input
# placeholder is needed for it.
model1_outputs = net1

model1 = Model(inputs=model1_inputs, outputs=model1_outputs, name='model1')

# Inspect the architecture (fitting is done later through model_full).
model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 486)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 486, 60)           600000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 486, 60)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 486, 30)           10920     
_________________________________________________________________
lstm_2 (LSTM)                (None, 30)                7320      
_________________________________________________________________
dense_1 (Dense)              (None, 30)                930       
Total params: 619,170
Trainable params: 619,170
Non-trainable params: 0
_________________________________________________________________


In [3]:
# model2: classification head mapping the 30-unit features to 5 softmax outputs
model2_inputs = Input(shape=(30,))

net2 = Dropout(0.2)(model2_inputs)
net2 = Dense(5, activation='softmax')(net2)
# The model output is the softmax tensor itself; no separate Input placeholder
# is needed for it.
model2_outputs = net2

model2 = Model(inputs=model2_inputs, outputs=model2_outputs, name='model2')

# Inspect the architecture (fitting is done later through model_full).
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 30)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 155       
Total params: 155
Trainable params: 155
Non-trainable params: 0
_________________________________________________________________


In [4]:
# model-full: model1 (text encoder) chained into model2 (softmax head)
model_full_inputs = Input(shape=(486,))

# Calling a Model on a tensor reuses it as a layer, so both sub-models share
# their weights with the combined model.
first = model1(model_full_inputs)
second = model2(first)
model_full_outputs = second

model_full = Model(inputs=model_full_inputs, outputs=model_full_outputs, name='model_full')

# Compilation is deferred to the training cell; here we only inspect the graph.
model_full.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 486)               0         
_________________________________________________________________
model1 (Model)               (None, 30)                619170    
_________________________________________________________________
model2 (Model)               (None, 5)                 155       
Total params: 619,325
Trainable params: 619,325
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Cap the vocabulary used when encoding texts via the Tokenizer's `num_words`.
vocabulary_size = 10000
tokenizer = Tokenizer(num_words=vocabulary_size)
# The tokenizer is fitted on the combined train + test token lists.
tokenizer.fit_on_texts(text_data)

# Encode every document as integer ids, then pad/truncate each to 486 entries
# (the input length the embedding model is built with).
sequences = tokenizer.texts_to_sequences(text_data)
data = pad_sequences(sequences, maxlen=486)
print(data.shape)

In [None]:
# text_data was built as train rows followed by test rows, so the first
# len(train) rows of `data` belong to the training set. Deriving the boundary
# from the frame keeps the split correct if the number of training rows changes,
# unlike a hard-coded row count.
n_train = len(train)
training_text = data[:n_train]
testing_text = data[n_train:]

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): `target` is not defined in any cell visible here — presumably it is
# the label column of `train`; it must be defined before this cell runs.
# random_state pins the stratified 80/20 split so that re-running the cell (in any
# order) yields the same partition; 2018 mirrors the value given to np.random.seed.
X_train, X_val, y_train, y_val = train_test_split(
    training_text,
    target,
    stratify=target,
    test_size=0.2,
    random_state=2018,
)
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

In [None]:
import tensorflow as tf

# One-hot encode the integer labels into 5 classes. The ndim guards make the cell
# safe to re-run: without them a second execution would encode the already
# one-hot (n, 5) arrays again and produce (n, 5, 5) targets.
if np.ndim(y_train) == 1:
    y_train = tf.keras.utils.to_categorical(y_train, 5)
if np.ndim(y_val) == 1:
    y_val = tf.keras.utils.to_categorical(y_val, 5)
y_train.shape, y_val.shape

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop once the validation loss has not improved for 2 consecutive epochs.
earlystopper = EarlyStopping(monitor='val_loss', patience=2, verbose=1)
# Check for a new best validation loss after every epoch. Checking only every
# second epoch (period=2) together with save_best_only could skip the best epoch
# entirely, leaving a worse model (or none) in 'text_features.h5'.
checkpointer = ModelCheckpoint('text_features.h5', monitor='val_loss', verbose=1,
                               save_best_only=True,
                               mode='auto')

# 5-way softmax output with one-hot targets -> categorical cross-entropy.
model_full.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

# epochs=50 is only an upper bound; early stopping normally ends training sooner.
history = model_full.fit(X_train, y_train,
                         batch_size=32,
                         epochs=50,
                         validation_data=(X_val, y_val),
                         verbose=1,
                         callbacks=[earlystopper, checkpointer])
