In [None]:
## python 3.6
!pip install tensorflow==1.13.1
!pip install git+https://www.github.com/keras-team/keras-contrib.git
!pip install keras

In [3]:
import warnings
# Silence every warning for the rest of the session (no category filter is
# given). NOTE(review): this also hides deprecation warnings from the pinned
# TensorFlow/Keras stack -- remove it when debugging version problems.
warnings.filterwarnings("ignore")

In [39]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from itertools import chain
import nltk
#nltk.download('punkt')
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras import backend as K
from keras import initializers
from keras import regularizers
from keras import constraints
from keras.engine import Layer, InputSpec
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.core import Dropout
from keras.layers.wrappers import Bidirectional
from keras.layers.wrappers import TimeDistributed
from keras.models import Sequential
from keras_contrib.layers import CRF

In [14]:
def labeling(label, string_chunk):
    """Tokenize ``string_chunk`` and attach ``label`` to every token.

    Returns a list of ``(token, label)`` tuples in token order; the list is
    empty when the tokenizer yields no tokens.
    """
    tagged_tokens = []
    for token in word_tokenize(string_chunk):
        tagged_tokens.append((token, label))
    return tagged_tokens

def one_hot_encoding(x, y, target = 'oem'):
    """One-hot encode per-token tag sequences against a fixed tag vocabulary.

    Args:
        x: sequence of per-sentence feature sequences. Only the length of the
            longest entry is used; ``x`` itself is returned unchanged.
        y: sequence of per-sentence tag sequences. Every tag must be one of
            ``'0'``, ``target`` or ``'NIL'``.
        target: name of the positive tag.

    Returns:
        ``(x, y_encoded)`` where ``y_encoded`` has shape
        ``(n_sentences, max_sentence_len, n_tags)``.

    Raises:
        KeyError: if ``y`` contains a tag outside the vocabulary.
    """
    # The vocabulary is fixed so class indices do not depend on which tags
    # happen to occur in this particular data set.
    TAGS_MAP = {'0': 0, target: 1, 'NIL': 2}
    # Size the one-hot axis from the map itself (largest index + 1), not from
    # the tags seen in the data, so it always matches the indices produced.
    NUM_TAGS = max(TAGS_MAP.values()) + 1

    max_sentence_len = max(map(len, x))
    print(max_sentence_len)

    y = [[TAGS_MAP[tag] for tag in tags] for tags in y]
    # Left-pad with the NIL index: pad_sequences' default fill value 0 is the
    # index of the real '0' tag, not of the padding tag.
    y = pad_sequences(y, max_sentence_len, padding='pre', value=TAGS_MAP['NIL'])
    y = np.array([to_categorical(t, NUM_TAGS) for t in y])
    return x, y

In [90]:
class DataPreparation:
    """Turn a table of titles into padded word-vector / tag tensors.

    Expected call order: ``load_glove`` -> ``read_data`` -> ``prepare_data``
    -> ``train_test_split``.
    """

    def __init__(self, titles_filepath = None, default_n_classes=3):
        # Both values are only stored; no method of this class reads them.
        self.default_n_classes = default_n_classes
        self.titles_filepath = titles_filepath

    def load_glove(self, glove_path = "glove/glove.6B.50d.txt"):
        """Read a GloVe-style text file into ``self.word_to_ix_map``.

        Every non-empty line is split on whitespace: the first field is the
        word, the remaining fields are parsed as its float32 vector.
        """
        # The context manager closes the file even when a line fails to parse.
        with open(glove_path, encoding='utf-8') as wordvecs:
            # Attribute kept for backward compatibility; it is closed on exit.
            self.wordvecs = wordvecs
            self.word_to_ix_map = {}
            for line in wordvecs:
                values = line.split()
                if not values:
                    # A blank line has no word field; skip it.
                    continue
                self.word_to_ix_map[values[0]] = np.asarray(values[1:], dtype='float32')

    ## TODO: check for word separation of special characters
    def target_tagging(self, df):
        """Tokenize one row's title and tag the tokens of the target attribute.

        ``df`` is a single row (this method is applied with ``axis=1``). The
        first case-insensitive occurrence of ``df[self.target]`` inside
        ``df.title`` is tagged ``self.target``; everything else is tagged
        ``'0'``. Returns a list of ``(token, tag)`` tuples.
        """
        t, attr = df.title, df[self.target]
        start = (t.lower()).find(attr.lower())
        if start < 0:
            # Attribute text does not occur in the title. Using find()'s -1 as
            # a slice index would drop and duplicate parts of the title, so
            # tag the whole title as background instead.
            return labeling('0', t)
        end = start + len(attr)
        return (labeling('0', t[:start])
                + labeling(self.target, t[start:end])
                + labeling('0', t[end:]))

    def read_data(self, df, target = 'oem'):
        """Tag every title in ``df`` and cache the result in ``self.raw_data``.

        ``df`` is modified in place: the untouched title is copied to
        ``origin_title`` and ``title`` is replaced by the tagged token list,
        so the same frame cannot be passed through this method twice.
        ``self.raw_data`` becomes a list of ``(words, tags)`` tuple pairs, one
        per row.
        """
        self.target = target
        df['origin_title'] = df['title'].values
        df['title'] = df.apply(self.target_tagging, axis=1)
        self.df = df
        self.raw_data = [
            (tuple(word for word, _ in row), tuple(tag for _, tag in row))
            for row in self.df.title
        ]

    def prepare_data(self, d = 50, max_sentence_len=49): #d is dimension of word vectors
        """Encode ``self.raw_data`` as fixed-length vector and tag sequences.

        Args:
            d: dimension of the word vectors in ``self.word_to_ix_map``.
            max_sentence_len: number of tokens per encoded sentence. Shorter
                sentences are left-padded with zero vectors tagged ``'NIL'``;
                longer ones keep only their first ``max_sentence_len`` tokens.

        Returns:
            ``(all_x, all_y)``: ``all_x`` is an array of shape
            ``(n_sentences, max_sentence_len, d)`` and ``all_y`` is whatever
            ``one_hot_encoding`` returns for the tag sequences.
        """
        nil_x = np.zeros(d)
        nil_y = 'NIL'
        unknown_x = np.ones(d)  # stand-in vector for out-of-vocabulary words
        all_x, all_y = [], []
        for words, tags in self.raw_data:
            if len(words) > max_sentence_len:
                # An over-long sentence would make the batch ragged and
                # unusable as a model input, so cut it to the fixed length.
                print('Truncating sentence of {} tokens to {}'.format(
                    len(words), max_sentence_len))
                words, tags = words[:max_sentence_len], tags[:max_sentence_len]
            encoded_words = [self.word_to_ix_map.get(w.lower(), unknown_x) for w in words]
            encoded_tags = list(tags)
            pad_length = max_sentence_len - len(encoded_words)
            all_x.append(np.array(pad_length * [nil_x] + encoded_words))
            all_y.append(np.array(pad_length * [nil_y] + encoded_tags))
        all_x, all_y = one_hot_encoding(all_x, all_y, self.target)
        all_x = np.array(all_x)
        return all_x, all_y

    def train_test_split(self, all_x, all_y, test_size=0.2, random_state=None):
        """Split the encoded data and return the matching original test titles.

        Args:
            all_x, all_y: outputs of ``prepare_data`` (row ``i`` corresponds
                to the ``i``-th row of ``self.df``).
            test_size: forwarded to scikit-learn's ``train_test_split``.
            random_state: optional seed, forwarded as well, for a reproducible
                split.

        Returns:
            ``(x_train, x_test, y_train, y_test, test_df)`` where ``test_df``
            is the ``origin_title`` series of the test rows, in the same order
            as ``x_test`` and re-indexed from 0.
        """
        x_train, x_test, y_train, y_test, _, ind_test = train_test_split(
            all_x, all_y, range(len(all_x)),
            test_size=test_size, random_state=random_state)
        # ind_test holds row positions, so select by position; this stays
        # correct even when the frame's index is not 0..n-1.
        test_df = self.df['origin_title'].iloc[ind_test].reset_index(drop=True)
        return x_train, x_test, y_train, y_test, test_df

In [122]:
## TODO: train hyperparameter, fine tuning
## TODO: augment train dataset with EM algorithm?
class LstmAttributeDetector:
    """Stacked bidirectional-LSTM tagger with a CRF output layer.

    ``create_model`` must be called before any other method; until then
    ``self.model`` is ``None``.
    """

    def __init__(self):
        self.model = None

    def create_model(self, dropout=0.5, units=150, max_sentence_len=49,
                     embedding_dim=50, n_tags=3):
        """Build and compile the network into ``self.model``.

        Args:
            dropout: rate used by each of the three Dropout layers.
            units: hidden size of each LSTM direction.
            max_sentence_len: number of tokens per input sentence.
            embedding_dim: dimension of each token vector.
            n_tags: number of output classes. The default of 3 matches the
                '0' / target / 'NIL' tag vocabulary used elsewhere in this
                file.
        """
        self.model = Sequential()
        self.model.add(Bidirectional(LSTM(units, return_sequences=True),
                                     input_shape=(max_sentence_len, embedding_dim)))
        self.model.add(Dropout(dropout))
        self.model.add(Bidirectional(LSTM(units, return_sequences=True)))
        self.model.add(Dropout(dropout))
        self.model.add(TimeDistributed(Dense(n_tags)))
        self.model.add(Dropout(dropout))
        crf = CRF(n_tags)  # CRF layer over all tags, padding tag included
        self.model.add(crf)
        self.model.compile(loss=crf.loss_function, optimizer='nadam',
                           metrics=['categorical_accuracy'])

    def fit(self, train_x, train_y, epochs=5, batch=100):
        """Train the model and return whatever the underlying ``fit`` returns."""
        return self.model.fit(train_x, train_y, epochs=epochs, batch_size=batch)

    def save(self, filepath):
        """Save the model to ``filepath``."""
        self.model.save(filepath)

    def print_summary(self):
        """Print the layer summary of the model."""
        # summary() prints by itself and returns None; wrapping it in print()
        # would emit an extra "None" line.
        self.model.summary()

    def predict(self, test_x):
        """Return the predicted class index of every token (argmax over the last axis)."""
        return self.model.predict(test_x, verbose=0).argmax(axis=-1)

    def evaluate(self, test_x, test_y):  ## TODO look further into different train test split
        """Return the fraction of sentences whose tag sequence is predicted exactly.

        A sentence counts as correct only if every position, padding
        included, matches the argmax of ``test_y``.

        Raises:
            ZeroDivisionError: if ``test_x`` yields no predictions.
        """
        y_pred = self.predict(test_x)
        y_test = test_y.argmax(axis=-1)
        matches = [np.array_equal(pred, truth) for pred, truth in zip(y_pred, y_test)]
        return matches.count(True) / len(y_pred)
    
#     def predict_output(self, test_x, test_df): ## TODO fix dimension not fit problem
#         token_df = test_df.apply(word_tokenize)
#         ind = self.model.predict(test_x, verbose=0).argmax(axis=-1)
#         ind = [[z for z in obs if z!=2] for obs in ind]
#         ind = [[False if elem == 0 else True for elem in obs] for obs in ind]
#         output = [' '.join(np.array(token_df[i])[np.array(ind[i])]) for i in range(len(ind))]
#         preds = pd.concat([test_df, pd.DataFrame(output, columns=['predictions'])], axis=1)
#         return preds

In [146]:
# Load the listings (backmarket and uptradeit), drop incomplete rows and
# renumber the remaining rows from 0.
data = pd.read_csv("phone.csv").dropna().reset_index()

# Build the token-level training tensors for the 'model_name' attribute.
prep = DataPreparation()
print('Loading word embeddings...')
prep.load_glove()
print('Reading data...')
prep.read_data(data, 'model_name')
print('Data preparing..')
x, y = prep.prepare_data()
x_train, x_test, y_train, y_test, test_df = prep.train_test_split(x,y)

# Train the tagger and report exact-match accuracy on the held-out titles.
model = LstmAttributeDetector()
print('Model fitting...')
model.create_model()
model.fit(x_train, y_train, epochs=8)
print('Accuracy for whole titles: {}'.format(model.evaluate(x_test, y_test)))


Loading word embeddings...
Reading data...
Data preparing..
49
Model fitting...
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Accuracy for whole titles: 0.8633540372670807
