In [1]:
import pandas as pd
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'

from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from keras.utils import to_categorical
import numpy as np
from utils import f1
from sklearn.metrics import classification_report

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Load the data

In [2]:
df = pd.read_table('../data/processed/SemEval2014/restaurants_train.tsv').drop('word_id', axis=1)
df_test = pd.read_table('../data/processed/SemEval2014/restaurants_gold.tsv').drop('word_id', axis=1).iloc[:, 1:]
df_comb = pd.concat([df, df_test])

In [3]:
df_comb.

Unnamed: 0,review_id,word,lemma,upos,xpos,head,deprel,lemma_index,label
0,0,But,but,CCONJ,CC,6,cc,but_0,O
1,0,the,the,DET,DT,3,det,the_1,O
2,0,staff,staff,NOUN,NN,6,nsubj,staff_2,B
3,0,was,be,AUX,VBD,6,cop,be_3,O
4,0,so,so,ADV,RB,6,advmod,so_4,O
...,...,...,...,...,...,...,...,...,...
12747,799,when,when,ADV,WRB,26,mark,when_22,O
12748,799,on,on,ADP,IN,26,case,on_23,O
12749,799,warm,warm,ADJ,JJ,26,amod,warm_24,B
12750,799,pitas,pita,NOUN,NNS,21,obl,pita_25,I


In [4]:
def get_lookup_dicts(series):
    uniq_tokens = series.unique()
    n = len(uniq_tokens)
    w2id = dict(zip(uniq_tokens, range(1, n+1)))
    id2w = {i: w for i, w in w2id.items()}
    
    return w2id, id2w

In [5]:
def create_X(df, col, w2id=None, lower=False, subset='train'):
    df = df.copy()
    if lower:
        df[col] = df[col].str.lower()
        
    
    if w2id is None:
        w2id, id2w = get_lookup_dicts(df[col])
    
    df['wid'] = df[col].map(lambda x: w2id.get(x, np.nan))
    df = df.dropna(subset=['wid'], axis=0)
    
    token_seq = df.groupby('review_id').wid.apply(lambda x: list(x)).tolist()
    X = pad_sequences(token_seq, maxlen=83, padding='post')
    
    if subset == 'train':
    
        return X, w2id, id2w
    else:
        return X

In [13]:
X_train, w2id, id2w = create_X(df, 'word', lower=True)
X_test = create_X(df_test, col='word', w2id=w2id, lower=True, subset='test')

In [24]:
pos_train, pos2id, id2pos = create_X(df, 'xpos')
pos_test = create_X(df_test, col='xpos', w2id=pos2id,  subset='test')

In [15]:
label2id = {'B': 1, 'I': 2, 'O': 0}

def one_hot(x):
    arr = [0, 0, 0]
    i = label2id.get(x, 0)
    arr[i] = 1
    return arr

def create_y(df):
    
    df = df.copy()
    df['y'] = df.label.apply(one_hot)

    y = df.groupby('review_id').y.apply(lambda x: list(x)).tolist()
    y = pad_sequences(y, maxlen=83, padding='post')
    
    return y

In [25]:
y_train = create_y(df_comb)
y_test = create_y(df_test)

# Load word vector

In [18]:
from dl_utils import create_embedding_matrix

general_embedding = create_embedding_matrix(w2id, embed_dim=200, embed_type='glove.twitter', 
                                            concat_pos_tag=False)
domain_embedding = create_embedding_matrix(w2id, embed_type='restaurants', 
                                            concat_pos_tag=False)

embedding_matrix = np.hstack([general_embedding, domain_embedding])

Word vectors found for 91.21% of vocabulary
4815 nan
Word vectors found for 99.98% of vocabulary


# CNN architecture

![](../reports/DoubleCNN.png)

In [19]:
from keras.layers import Input, Embedding, Conv1D, Dropout, Dense, Masking, Multiply, Concatenate
from keras.models import Model
from keras.optimizers import Adam
import keras.backend as K

In [26]:
kernel_size = 5
embedding_size = embedding_matrix.shape[1]

In [48]:
sent = Input(shape=(83, ), name='sentence')
# pos = Input(shape=(83, ), name='pos')


# Embedding layer
x = Embedding(input_dim=len(w2id) + 1,
                              output_dim=embedding_size,
                              input_length=83,
                              weights=[embedding_matrix],
                              mask_zero=False,
                              trainable=False)(sent)

# pos_emb = Embedding(input_dim=len(pos2id) + 1,
#                               output_dim=20,
#                               input_length=83,
#                               mask_zero=False,
#                               trainable=True)(pos)


# x = Concatenate()([x, pos_emb])

filter_sizes = [3,5]
convs = []
for filter_size in filter_sizes:
    conv = Conv1D(filters=128, kernel_size= kernel_size, padding='same', activation='relu')(x)
    convs.append(conv)
    
    
merged = Concatenate(axis=-1)(convs)

x = Dropout(0.55)(merged)
x = Conv1D(filters = 256, kernel_size = kernel_size, padding='same', activation='relu')(x)
x = Conv1D(filters = 256, kernel_size = kernel_size, padding='same')(x)
x = Conv1D(filters = 256, kernel_size = kernel_size, padding='same')(x)


# Finally compute the probabilities
preds = Dense(3, activation='softmax')(x)

# Specify the input and the output
model = Model(sent, preds)
model.compile(loss='categorical_crossentropy', 
              optimizer=Adam(lr=0.0001), metrics=['acc', f1])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sentence (InputLayer)           (None, 83)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 83, 300)      1562700     sentence[0][0]                   
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 83, 128)      192128      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 83, 128)      192128      embedding_1[0][0]                
__________________________________________________________________________________________________
concatenat

In [49]:
from keras.utils.vis_utils import plot_model
from keras import backend as K
K.clear_session()

plot_model(model, to_file="DoubleCNN_keras.png", show_shapes=True, show_layer_names=True)

In [41]:
model.fit(X_train, y_train, batch_size=128, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f1932276a90>

In [42]:
y_pred = model.predict(X_test)

In [43]:
def flatten_3d_label(y):
    return y.reshape((y.shape[0]*y.shape[1], y.shape[2]))

In [44]:
def drop_padded(y_test_flat, y_pred_flat):
    idx = np.where(y_test_flat.any(axis=1))[0]
    
    return y_test_flat[idx, :], y_pred_flat[idx, :]

In [45]:
def evaluate(y_test, y_pred, print_report=True):
    y_test_flat = flatten_3d_label(y_test)
    y_pred_flat = flatten_3d_label(y_pred)
    
    y_test_eval, y_pred_eval = drop_padded(y_test_flat, y_pred_flat)
    y_test_eval, y_pred_eval = y_test_eval.argmax(axis=1), y_pred_eval.argmax(axis=1)
    
    if print_report:
        print(classification_report(y_test_eval, y_pred_eval, 
                            labels=[1, 2, 0],target_names=['B', 'I', 'O']))
        
    
    return y_test_eval, y_pred_eval

    

In [46]:
evaluate(y_test, y_pred)

              precision    recall  f1-score   support

           B       0.75      0.87      0.81      1132
           I       0.78      0.67      0.72       571
           O       0.98      0.97      0.98     11049

    accuracy                           0.95     12752
   macro avg       0.84      0.83      0.83     12752
weighted avg       0.95      0.95      0.95     12752



(array([0, 1, 0, ..., 1, 2, 0]), array([0, 1, 0, ..., 0, 1, 0]))