In [129]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

In [108]:
df = pd.read_csv('data/yelp_labelled.txt', names=['sentence', 'label'], sep='\t')

In [109]:
X = df['sentence'].values
y = df['label'].values

In [110]:
train_X, test_X, train_y, test_y = train_test_split(X,y, test_size=0.25, random_state=100)

## Logistic

In [96]:
vectorizer = TfidfVectorizer()
train_X = vectorizer.fit_transform(train_X)
test_X = vectorizer.transform(test_X)

In [97]:
classifier = LogisticRegression()
classifier.fit(train_X, train_y)
score = classifier.score(test_X, test_y)
score

0.82

## Keras

In [98]:
input_dim = train_X.shape[1]

In [99]:
model  = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 10)                17250     
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 11        
Total params: 17,261
Trainable params: 17,261
Non-trainable params: 0
_________________________________________________________________


In [100]:
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensor(indices, coo.data, coo.shape)

train_X = convert_sparse_matrix_to_sparse_tensor(train_X)
test_X = convert_sparse_matrix_to_sparse_tensor(test_X)
train_X = tf.sparse.reorder(train_X)
test_X  = tf.sparse.reorder(test_X)
train_y = tf.convert_to_tensor(train_y)
test_y = tf.convert_to_tensor(test_y)

In [101]:
history = model.fit(train_X, train_y,
                   epochs=50,
                   verbose=False,
                   validation_data=(test_X, test_y),
                   batch_size=10)

In [105]:
loss, accuracy = model.evaluate(train_X, train_y, verbose=False)
print(f'Training accuracy {accuracy}')

Training accuracy 1.0


In [106]:
loss, accuracy = model.evaluate(test_X, test_y, verbose=False)
print(f'Training accuracy {accuracy}')

Training accuracy 0.7839999794960022


## Embeddings

In [111]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)

In [112]:
X_train = tokenizer.texts_to_sequences(train_X)
X_test = tokenizer.texts_to_sequences(test_X)

In [114]:
vocab_size = len(tokenizer.word_index) + 1

In [115]:
print(X[2])
print(X_train[2])

Not tasty and the texture was just nasty.
[3, 105, 349, 549, 1, 158, 29, 132, 876, 124, 9, 4, 5, 400, 171]


In [117]:
maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [118]:
X_train[2]

array([  3, 105, 349, 549,   1, 158,  29, 132, 876, 124,   9,   4,   5,
       400, 171,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0])

In [119]:
embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size,
         output_dim=embedding_dim,
         input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy',
             metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 50)           103600    
_________________________________________________________________
flatten (Flatten)            (None, 5000)              0         
_________________________________________________________________
dense_9 (Dense)              (None, 10)                50010     
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 11        
Total params: 153,621
Trainable params: 153,621
Non-trainable params: 0
_________________________________________________________________


In [121]:
history = model.fit(X_train, train_y,
                    epochs=20,
                    verbose=False,
                    validation_data=(X_test, test_y),
                    batch_size=10)

In [123]:
loss, accuracy = model.evaluate(X_train, train_y, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, test_y, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 1.0000
Testing Accuracy:  0.7920


#### with max pooling

In [124]:
embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size,
         output_dim=embedding_dim,
         input_length=maxlen))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy',
             metrics=['accuracy'])
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 50)           103600    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 50)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 10)                510       
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 11        
Total params: 104,121
Trainable params: 104,121
Non-trainable params: 0
_________________________________________________________________


In [125]:
history = model.fit(X_train, train_y,
                    epochs=20,
                    verbose=False,
                    validation_data=(X_test, test_y),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, train_y, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, test_y, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 1.0000
Testing Accuracy:  0.8360


### Convolution NN

In [126]:
embedding_dim = 100

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim,input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['accuracy'])
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 100)          207200    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 96, 128)           64128     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 10)                1290      
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 11        
Total params: 272,629
Trainable params: 272,629
Non-trainable params: 0
_________________________________________________________________


In [127]:
history = model.fit(X_train, train_y,
                    epochs=10,
                    verbose=False,
                    validation_data=(X_test, test_y),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, train_y, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, test_y, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 1.0000
Testing Accuracy:  0.8080


### Hyperparameter tuning

In [131]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
    model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [132]:
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[5000], 
                  embedding_dim=[50],
                  maxlen=[100])

In [133]:
model = KerasClassifier(build_fn=create_model,
                       epochs=epochs,
                       batch_size=10,
                       verbose=False)
grid = RandomizedSearchCV(estimator=model, 
                    param_distributions=param_grid,
                        cv=4,
                         verbose=1, n_iter=5)
grid_result = grid.fit(X_train, train_y)
test_accuracy = grid.score(X_test, test_y)

Fitting 4 folds for each of 5 candidates, totalling 20 fits


In [134]:
grid_result.best_score_

0.7906758338212967

In [135]:
grid_result.best_params_

{'vocab_size': 5000,
 'num_filters': 32,
 'maxlen': 100,
 'kernel_size': 3,
 'embedding_dim': 50}

In [136]:
test_accuracy

0.8240000009536743