In [1]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
import json

Using TensorFlow backend.


In [27]:
# Start with empty containers for the corpus texts and their class labels.
docs, labels = [], []

In [28]:
def load_data(path1,path2):
    """Read two JSON corpora and return their texts with binary labels.

    Each file must contain a JSON array of objects that have a 'body' key.

    Args:
        path1: JSON file whose documents are labelled 1 (human rights).
        path2: JSON file whose documents are labelled 0 (non human rights).

    Returns:
        A ``(docs, labels)`` pair: the 'body' values of ``path1`` followed by
        those of ``path2``, and a parallel list of 1s then 0s.

    Side effects:
        Prints the number of documents read from each file.
    """
    def _read_bodies(path):
        # Pull the 'body' field out of every record in the JSON array.
        with open(path) as handle:
            return [record['body'] for record in json.load(handle)]

    positive_docs = _read_bodies(path1)
    print('number of human rights docs is: '+str(len(positive_docs)))

    negative_docs = _read_bodies(path2)
    print('number of non human rights docs is: '+str(len(negative_docs)))

    all_docs = positive_docs + negative_docs
    all_labels = [1]*len(positive_docs) + [0]*len(negative_docs)
    return all_docs,all_labels

In [29]:
# Positive (human rights) corpus first, negative corpus second; load_data
# labels them 1 and 0 respectively.
# NOTE(review): these are absolute, machine-specific paths.
human_rights_path = '/home/tigermlt/CS341/github_repo/CS341/parsed_data.json'
non_human_rights_path = '/home/tigermlt/CS341/github_repo/CS341/data_non_human_rights.json'
docs,labels = load_data(human_rights_path, non_human_rights_path)

number of human rights docs is: 65630
number of non human rights docs is: 12992


In [30]:
# Sanity check: the text list and the label list must have the same length.
for collection in (docs, labels):
    print(len(collection))

78622
78622


In [31]:
import random

In [32]:
# Pair every document with its label so both are shuffled together.
c = [(doc, label) for doc, label in zip(docs, labels)]

In [33]:
# Shuffle the (doc, label) pairs in place. A dedicated, seeded generator is
# used so that a fresh "Restart & Run All" always produces the same order
# (and therefore the same train/validation split) without touching the
# global random state.
random.Random(42).shuffle(c)

In [34]:
# Split the shuffled (doc, label) pairs back into two parallel sequences.
docs, labels = zip(*c)

In [35]:
# zip(*...) yields tuples; turn both back into lists.
docs, labels = list(docs), list(labels)

In [36]:
# Build the vocabulary from the full corpus.
t = Tokenizer()
t.fit_on_texts(docs)
# Turn every document into its sequence of integer word ids.
encoded_docs = t.texts_to_sequences(docs)
# One extra slot on top of the number of known words (row 0 of the embedding
# matrix is never assigned a word vector below).
vocab_size = 1 + len(t.word_index)

In [37]:
# Bring every encoded document to one fixed length, padding at the end
# ('post'). The length is hard-coded; per the original note it was obtained
# by measuring the longest document.
max_length = 20512
padded_docs = pad_sequences(encoded_docs, padding='post', maxlen=max_length)

In [38]:
# Load the whole GloVe embedding into memory as {word: float32 vector}.
embeddings_index = dict()
# `with` guarantees the file is closed even if parsing fails part-way, and the
# explicit encoding avoids depending on the platform's default locale.
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline at end of file).
            continue
        # First token is the word, the remaining tokens are its coefficients.
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [39]:
# Build the embedding weight matrix: row i holds the GloVe vector of the word
# whose tokenizer index is i. Words without a GloVe entry keep an all-zero row.
embedding_matrix = zeros((vocab_size, 300))
for word, i in t.word_index.items():
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

In [53]:
def build_model():
    """Assemble the (uncompiled) binary text classifier.

    The network is a frozen embedding layer initialised from the notebook-level
    ``embedding_matrix``, followed by a flatten layer and a single sigmoid
    unit. It reads ``vocab_size``, ``embedding_matrix`` and ``max_length`` from
    the notebook namespace.

    Returns:
        A Keras ``Sequential`` model that still has to be compiled.
    """
    glove_embedding = Embedding(
        vocab_size,
        300,
        weights=[embedding_matrix],
        input_length=max_length,
        trainable=False,  # keep the pre-trained vectors fixed
    )
    classifier = Sequential()
    for layer in (glove_embedding, Flatten(), Dense(1, activation='sigmoid')):
        classifier.add(layer)
    return classifier

In [54]:
# Instantiate the network defined by build_model(); it is compiled in the next cell.
model = build_model()

In [55]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that emits a stray "None" line after the table).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 20512, 300)        65799300  
_________________________________________________________________
flatten_5 (Flatten)          (None, 6153600)           0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 6153601   
Total params: 71,952,901
Trainable params: 6,153,601
Non-trainable params: 65,799,300
_________________________________________________________________
None


In [56]:
# fit the model
# Keras documents the targets as a NumPy array, so the label list is converted
# explicitly instead of relying on implicit list handling (which newer Keras
# releases reject, in particular together with validation_split).
model.fit(padded_docs, array(labels), epochs=1, validation_split=0.05)

Train on 74690 samples, validate on 3932 samples
Epoch 1/1


<keras.callbacks.History at 0x7f92130ac940>

In [None]:
# Empty containers for the evaluation texts and their labels.
docs_test, labels_test = [], []

In [None]:
# Load the evaluation documents; every one of them is labelled 1.
# NOTE(review): absolute, machine-specific path.
with open('/home/tigermlt/CS341/github_repo/CS341/data/10000.json') as json_file:
    data = json.load(json_file)
# Rebuild the list from scratch (instead of appending) so that re-running this
# cell does not duplicate documents; records whose 'content' is None are skipped.
docs_test = [x['content'] for x in data if x['content'] is not None]
# Count the documents actually kept. The previous code measured `temp_docs`,
# which only exists inside load_data and is undefined at notebook scope.
len1 = len(docs_test)
print('number of human rights docs is: '+str(len1))
labels_test = [1]*len1

In [None]:
# Sanity check: evaluation texts and labels must have the same length.
for collection in (docs_test, labels_test):
    print(len(collection))

In [None]:
# Encode the evaluation documents with the tokenizer `t` that was already
# fitted on the training corpus. The embedding matrix rows are indexed by that
# tokenizer's word_index, so it must not be replaced or re-fitted here.
encoded_docs_test = t.texts_to_sequences(docs_test)
# Pad to the same fixed length the model was built with (`max_length` from the
# training cells) rather than repeating the constant.
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')

In [None]:
# evaluate the model
# Targets are passed as a NumPy array, which is what Keras documents for `y`;
# a plain Python list is not accepted by every Keras release.
loss, accuracy = model.evaluate(padded_docs_test, array(labels_test))
print(loss)
print(accuracy)

In [None]:
# Persist the trained model to an HDF5 file (relative path, so it lands in the
# current working directory).
model.save('binary_classification2.h5')

In [None]:
from keras.models import load_model
# Rebuild the architecture and restore previously saved weights into it.
def load_trained_model(path):
    """Return a fresh build_model() network with weights loaded from ``path``.

    Args:
        path: Location of the saved weights file.

    Returns:
        The model with the stored weights applied; it is not compiled here.
    """
    restored = build_model()
    restored.load_weights(path)
    return restored

In [None]:
# NOTE(review): this restores 'binary_classification.h5', not the
# 'binary_classification2.h5' file written by the save cell above; confirm
# that evaluating the older checkpoint is intended.
model_test = load_trained_model('/home/tigermlt/CS341/wordEmbedding_keras/binary_classification.h5')
# compile the model
model_test.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# evaluate the model
# Targets are passed as a NumPy array, which is what Keras documents for `y`;
# a plain Python list is not accepted by every Keras release.
loss, accuracy = model_test.evaluate(padded_docs_test, array(labels_test))
print(loss)
print(accuracy)