In [50]:
import pandas as pd, numpy as np, tensorflow as tf, gensim as gs, string, nltk, re, operator
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import model_from_json
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from matplotlib import pyplot
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adamnoack/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load the training and test data

In [22]:
# Labelled training phrases; every later cell works from this frame.
data = pd.read_csv('all/train.tsv', sep='\t')
# Kaggle ships the test split without a Sentiment column, so it cannot be used for scoring.
test = pd.read_csv('all/test.tsv', sep='\t')
# Preview the first rows of the training data.
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


## Clean data
Remove sentences with neutral sentiment (class 2), then merge output classes 0 and 1 into a single negative class and output classes 3 and 4 into a single positive class to make the classification task binary.
Remove punctuation, make all lowercase. 

In [None]:
# Remove sentences with neutral sentiment. The explicit .copy() makes the result an
# independent frame, so the .loc assignments below write to `data` itself instead of
# to a slice of the original frame (which triggers SettingWithCopyWarning).
data = data[data.Sentiment != 2].copy()

# Make the classification task binary: classes 0 and 1 become 0 (negative),
# classes 3 and 4 become 1 (positive).
data.loc[data.Sentiment < 2, 'Sentiment'] = 0
data.loc[data.Sentiment > 2, 'Sentiment'] = 1

# Report the class balance after the merge.
print("Total samples: {}".format(data.index.size))
print("Positive {}".format(data[data['Sentiment'] == 1].index.size))
print("Negative {}".format(data[data['Sentiment'] == 0].index.size))

In [26]:
# Lowercase every phrase, then drop everything except letters, digits and whitespace.
# The character class is a raw string and uses A-Z: the previous 'A-z' range also
# spans the ASCII symbols [ \ ] ^ _ ` and therefore let them through unfiltered.
data['Phrase'] = data['Phrase'].apply(lambda x: x.lower())
data['Phrase'] = data['Phrase'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

## Tokenize the data and split into train and test sets

In [30]:
# max_f is handed to the Tokenizer as num_words: it caps the vocabulary by word
# frequency, so rarer words are dropped when phrases are turned into sequences.
max_f = 2000
tokenizer = Tokenizer(num_words=max_f, split=' ')

# Fit the vocabulary on the cleaned phrases, then encode each phrase as a list of
# word indices and pad all of them to a common length.
phrases = data['Phrase'].values
tokenizer.fit_on_texts(phrases)
X = pad_sequences(tokenizer.texts_to_sequences(phrases))

In [44]:
# the lower the value that the tokenizer assigns to a word, the more often that word appears in the corpus
# test_sent = ['hello hello hello hello', 'there', 'adam adam', 'world world world', 'is is is']
# tk = Tokenizer(split=' ')
# tk.fit_on_texts(test_sent)
# tk.texts_to_sequences(test_sent)
# hello:1, world:2, is:3, adam:4, there:5

In [139]:
# One-hot encode the sentiment labels: one column per class.
Y = pd.get_dummies(data['Sentiment']).values

# Hold out 20% of the samples; the fixed random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

In [140]:
# Carve the last `validation_size` held-out samples off as a validation set and keep
# the remainder as the test set. Both right-hand sides are evaluated before the names
# are rebound, so the slices are taken from the same, un-shrunk arrays.
# NOTE: X_test / Y_test are rebound from themselves, so running this cell twice
# shrinks the test set again.
validation_size = 8000
X_test, X_validate = X_test[:-validation_size], X_test[-validation_size:]
Y_test, Y_validate = Y_test[:-validation_size], Y_test[-validation_size:]

# Sanity check: sample counts and shapes of the three splits.
print("Training data shape: {}".format((X_train.shape,Y_train.shape)))
print("Validation data shape: {}".format((X_validate.shape,Y_validate.shape)))
print("Testing data shape: {}".format((X_test.shape,Y_test.shape)))

Training data shape: ((61182, 43), (61182, 2))
Validation data shape: ((8000, 43), (8000, 2))
Testing data shape: ((7296, 43), (7296, 2))


## Build the model

In [33]:
# Width of the learned word embeddings and of the LSTM hidden state.
embed_dim = 128
lstm_out = 196

model = Sequential()
# Instead of embedding the words in the data preprocessing stage, learn the embedding
# as the first layer: max_f vocabulary slots, sequences of length X.shape[1].
model.add(Embedding(max_f, embed_dim, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# Two softmax outputs, matching the two one-hot label columns.
model.add(Dense(2, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 43, 128)           256000    
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 43, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


## Train the model

In [34]:
# Mini-batch size; reused by the evaluation cell further down.
batch_size = 32

# Train for 11 epochs, reporting loss/accuracy on the validation split after each one.
model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=11,
    verbose=1,
    validation_data=(X_validate, Y_validate),
)

Train on 61182 samples, validate on 8000 samples
Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


<keras.callbacks.History at 0x1a3e7f1978>

## Save model architecture to a JSON file and model weights to an HDF5 file

In [35]:
# Persist the trained weights as HDF5.
model.save_weights("lstm_model.h5")
# Persist the architecture separately as JSON.
model_json = model.to_json()
with open("lstm_model.json", "w") as json_file:
    json_file.write(model_json)
print("Saved model to disk")

Saved model to disk


## Load model

In [38]:
# Load the architecture from JSON; the context manager closes the file even if
# read() raises, which the manual open()/close() pair did not guarantee.
with open('lstm_model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# Load the saved weights into the rebuilt model.
loaded_model.load_weights("lstm_model.h5")
print("Loaded model from disk")

Loaded model from disk


## Measure performance of model on test data

In [41]:
# The reloaded model has to be compiled before it can be evaluated; use the same
# loss/optimizer/metric configuration as the in-memory model.
loaded_model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])

# Evaluate the in-memory model first, then the reloaded one; the two pairs of printed
# numbers should match if the save/load round trip preserved the model.
for candidate in (model, loaded_model):
    score, acc = candidate.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
    print("score: %.2f" % (score))
    print("acc: %.2f" % (acc))

score: 0.32
acc: 0.86
score: 0.32
acc: 0.86


## Anchors

Anchors are chosen based on their precision, or their ability to predict a particular output class, and their coverage: $cov(A) = E _{D(z)}[A(z)]$.

Their precision can be estimated by finding sample inputs $x_{i}$ that contain the anchor, i.e. $A(x_{i}) = 1$, and measuring the fraction of those samples for which $f(x_{i}) = c_{i}$ for a particular output class $c_{i}$.

The index that the Tokenizer has assigned to a particular word can serve as a proxy for the coverage.

In [106]:
# Candidate word indices, most frequent first. With Tokenizer(num_words=max_f) only
# indices strictly below max_f are ever emitted by texts_to_sequences, so the range
# stops at max_f - 1 (the previous upper bound included an index that never occurs).
words = list(range(1, max_f))

# Map the English stopwords onto tokenizer indices, skipping stopwords the corpus
# never contained (explicit membership test instead of swallowing KeyError).
# NOTE(review): NLTK stopwords such as "don't" contain apostrophes, while the cleaning
# step strips punctuation from the phrases, so those entries are presumably never
# found in word_index -- confirm whether they should be normalised first.
stopwords = nltk.corpus.stopwords.words('english')
stopwords_nums = [
    tokenizer.word_index[stopword]
    for stopword in stopwords
    if stopword in tokenizer.word_index
]

# A set makes the membership test below O(1) per candidate index.
stopword_index_set = set(stopwords_nums)
most_used_words = [word for word in words if word not in stopword_index_set]

In [161]:
def generate_candidates(A, hi_freq_wds, index):
    """Extend every anchor in ``A`` by one word and return ``A``.

    The anchors are extended in place: the first anchor receives
    ``hi_freq_wds[index]``, the second ``hi_freq_wds[index + 1]`` and so on, so
    each anchor gets a different word.

    Args:
        A: list of anchors, each a list of word indices (mutated in place).
        hi_freq_wds: sequence of candidate word indices to draw from.
        index: position in ``hi_freq_wds`` of the word given to the first anchor.

    Returns:
        The same list object ``A`` that was passed in.

    Raises:
        IndexError: if ``hi_freq_wds`` has fewer than ``index + len(A)`` entries.
    """
    for slot, word_pos in enumerate(range(index, index + len(A))):
        A[slot] += [hi_freq_wds[word_pos]]
    return A

def test_candidates(A, data_x, data_y):
    """Score every candidate anchor and return the best-scoring one.

    For each anchor, the rows of ``data_x`` that contain all of the anchor's
    words are looked up with ``get_samples_w_anchor``; the notebook-level
    ``model`` predicts on those rows and the anchor is scored as
    ``1 - sum((pred - true) ** 2) / n_rows``. Anchors that match no rows are
    given the sentinel score -1.

    Args:
        A: non-empty list of candidate anchors (each a list of word indices).
        data_x: array of padded word-index sequences, indexable by a list of rows.
        data_y: labels aligned with ``data_x``; must have the same shape as the
            model's predictions so the element-wise difference is defined.

    Returns:
        The element of ``A`` with the highest score (the first one on ties).

    Raises:
        ValueError: if ``A`` is empty.
    """
    if len(A) == 0:
        raise ValueError("A must contain at least one candidate anchor")

    prec = []
    for a in A:
        indices = get_samples_w_anchor(a, data_x)
        if len(indices) > 0:
            pred = model.predict(data_x[indices])
            true = data_y[indices]
            prec.append(1 - np.sum(np.square(pred - true)) / pred.shape[0])
        else:
            # No sample contains this anchor, so it cannot be scored.
            prec.append(-1)

    # Look up the position of the best score and index A with it. The previous
    # A.index(max(prec)) searched the anchor list for a float and raised ValueError.
    return A[prec.index(max(prec))]
        
def get_samples_w_anchor(a, data):
    """Return the row indices of ``data`` that contain every word of anchor ``a``.

    Args:
        a: iterable of word indices making up the anchor. An empty anchor
            matches every row.
        data: array whose first dimension enumerates samples (``data.shape[0]``
            is read) and whose rows support the ``in`` operator.

    Returns:
        List of integer row indices, in ascending order.
    """
    # all() stops at the first missing word; the previous loop used `continue`
    # where `break` was intended and kept checking the remaining words.
    return [
        row
        for row in range(data.shape[0])
        if all(word in data[row] for word in a)
    ]

## Test functions out

In [165]:
# Scratch check of get_samples_w_anchor on ragged rows. The rows have different
# lengths, so the array must be built with dtype=object; current NumPy releases
# raise ValueError for ragged input otherwise.
arr = np.array([[1,2,3,4], [5,6,7], [8,2,4,6], [4,6], [3,8,9]], dtype=object)
arr[get_samples_w_anchor([3], arr)]

array([list([1, 2, 3, 4]), list([3, 8, 9])], dtype=object)

In [179]:
# Scratch check of the scoring formula used in test_candidates: two identical
# label vectors have zero squared error, so the score should come out as 1.0.
a = np.array([1, 0, 0, 0, 1, 1])
b = np.array([1, 0, 0, 0, 1, 1])
1 - np.square(b - a).sum() / len(a)

1.0

In [160]:
# Display the padded test sequences to eyeball the word-index layout.
X_test

array([[   0,    0,    0, ...,    0,    0,  156],
       [   0,    0,    0, ...,   34,   13, 1005],
       [   0,    0,    0, ...,    0,  330,  821],
       ...,
       [   0,    0,    0, ...,  945,    3, 1151],
       [   0,    0,    0, ...,    4,  326,  269],
       [   0,    0,    0, ...,    9,  204,  471]], dtype=int32)

In [148]:
# Scratch check of nested-loop control flow: on each of the three outer passes
# only the element equal to 1 is printed; every other element is skipped.
l = [1,2,3,4]
for z in range(3):
    for i in l:
        if i == 1:
            print(i)

1
1
1


In [182]:
# Scratch check: position of the largest element of l.
l.index(max(l))

5

In [181]:
# Scratch: mutates l in place, so every re-run of this cell appends another 9.
l.append(9)

In [150]:
# Display the current contents of l.
l

[1, 2, 3, 4, 5]

In [133]:
# Display the padded test sequences again before wrapping them in a DataFrame.
X_test

array([[   0,    0,    0, ...,    0,    0,  156],
       [   0,    0,    0, ...,   34,   13, 1005],
       [   0,    0,    0, ...,    0,  330,  821],
       ...,
       [   0,    0,    0, ...,  945,    3, 1151],
       [   0,    0,    0, ...,    4,  326,  269],
       [   0,    0,    0, ...,    9,  204,  471]], dtype=int32)

In [134]:
# Wrap the padded test sequences in a DataFrame (one column per sequence position).
df_test = pd.DataFrame(X_test)

In [135]:
# Collapse the one-hot label rows into integer class ids. The ndim guard makes the
# cell safe to re-run: after the first run Y_test is already 1-D and argmax over
# axis=1 would raise.
# NOTE: this rebinds Y_test, so cells that need the one-hot labels (model.evaluate,
# test_candidates) must run before this one or recreate the split.
if Y_test.ndim == 2:
    Y_test = np.argmax(Y_test, axis=1)

In [138]:
# Attach the class id of every test sample as an extra column.
df_test['sentiment'] = Y_test
# Show a bounded preview instead of dumping the whole frame.
df_test.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,sentiment
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,156,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,34,13,1005,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,330,821,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,13,41,1
4,0,0,0,0,0,0,0,0,0,0,...,552,64,3,1,1,345,4,43,119,0
5,0,0,0,0,0,0,0,0,0,0,...,95,1,170,470,9,510,124,2,415,1
6,0,0,0,0,0,0,0,0,0,0,...,110,1424,36,1847,35,1,6,75,64,1
7,0,0,0,0,0,0,0,0,0,0,...,11,2,655,228,441,36,2,850,12,0
8,0,0,0,0,0,0,0,0,0,0,...,14,8,7,13,1230,594,3,1334,254,1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,6,5,1,109,1
