In [67]:
import os
# Configure CUDA device selection in one step (see issue #152 for the ordering flag).
# NOTE(review): an empty CUDA_VISIBLE_DEVICES presumably hides every GPU from this
# process, yet the training cell requests '/device:GPU:0' — confirm this is intended.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "",
})

In [35]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
import json
import tensorflow as tf
from keras.callbacks import CSVLogger
from keras.optimizers import SGD,Adam

In [2]:
from tensorflow.python.client import device_lib

In [3]:
# Sanity check: print every compute device (CPU/GPU) TensorFlow reports for this process.
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12977240672257284296
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 15771998618
locality {
  bus_id: 1
}
incarnation: 12971880801161676139
physical_device_desc: "device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0"
]


In [4]:
# Start with empty corpus/label containers; both are replaced when the data is loaded.
docs, labels = [], []

In [5]:
def _read_bodies(path):
    """Return the 'body' field of every record in the JSON array stored at `path`."""
    # JSON text is UTF-8 by specification; do not rely on the platform default encoding.
    with open(path, encoding='utf-8') as json_file:
        return [record['body'] for record in json.load(json_file)]


def load_data(path1,path2):
    """Load the positive and negative corpora and build matching labels.

    Args:
        path1: JSON file holding a list of records for human-rights documents
            (labelled 1). Each record must have a 'body' key.
        path2: JSON file in the same format for non-human-rights documents
            (labelled 0).

    Returns:
        (docs, labels): `docs` is the list of bodies from `path1` followed by
        those from `path2`; `labels` is the parallel list of 1s then 0s.

    Raises:
        KeyError: if a record has no 'body' key.
        OSError / ValueError: if a file cannot be opened or is not valid JSON.
    """
    positive_docs = _read_bodies(path1)
    print('number of human rights docs is: '+str(len(positive_docs)))

    negative_docs = _read_bodies(path2)
    print('number of non human rights docs is: '+str(len(negative_docs)))

    labels = [1]*len(positive_docs) + [0]*len(negative_docs)
    return positive_docs + negative_docs, labels

In [6]:
# Read both corpora: human-rights documents first (label 1), then the rest (label 0).
docs, labels = load_data(
    '/home/tigermlt/CS341/github_repo/CS341/parsed_data.json',
    '/home/tigermlt/CS341/data/data_non_human_rights2.json',
)

number of human rights docs is: 65630
number of non human rights docs is: 63183


In [7]:
# Documents and labels must have the same length; print both counts, one per line.
print(len(docs), len(labels), sep='\n')

128813
128813


In [8]:
import random

In [9]:
# Pair every document with its label so both can be shuffled together.
c = [pair for pair in zip(docs, labels)]

In [10]:
# Shuffle with a dedicated, seeded generator so the ordering is reproducible across
# kernel restarts. The fit cell uses validation_split, so the ordering decides which
# samples end up in the validation set; an unseeded shuffle makes metrics vary per run.
random.Random(42).shuffle(c)

In [11]:
# Unzip the shuffled (document, label) pairs back into two parallel tuples.
docs, labels = zip(*c)

In [12]:
# zip(*...) yields tuples; convert both back to lists for the downstream cells.
docs, labels = list(docs), list(labels)

In [13]:
# prepare tokenizer: learn the word -> integer index from the (shuffled) training corpus
t = Tokenizer()
t.fit_on_texts(docs)
# The word indices are later used directly as row numbers of the embedding matrix,
# so the vocabulary size is one more than the number of words
# (Keras reserves index 0 — see the Tokenizer documentation).
vocab_size = len(t.word_index) + 1
# integer encode the documents: each text becomes a list of word indices
encoded_docs = t.texts_to_sequences(docs)

In [14]:
# Pad every encoded document to a fixed length of max_length tokens;
# padding='post' puts the padding after the document's own tokens.
# NOTE(review): 20512 is hard-coded here, not computed. It is presumably the length of
# the longest encoded training document — confirm, e.g. max(len(s) for s in encoded_docs).
max_length = 20512
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [15]:
# Load the whole GloVe embedding file into memory as {word: float32 vector}.
# Each line is "<word> <v1> <v2> ...": the first field is the token, the rest its vector.
# The `with` block closes the file even if a line fails to parse, and the explicit
# UTF-8 encoding avoids decode errors on platforms whose default encoding differs.
embeddings_index = dict()
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [16]:
# Build the embedding weight matrix: row i receives the pre-trained vector of the word
# whose tokenizer index is i. Words without a pre-trained vector keep an all-zero row.
embedding_matrix = zeros((vocab_size, 300))
for word, i in t.word_index.items():
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

In [17]:
def build_model():
    """Assemble the uncompiled binary classifier.

    Reads the notebook globals `vocab_size`, `embedding_matrix` and `max_length`.

    Returns:
        A Sequential model: frozen pre-trained Embedding -> Flatten ->
        Dense(1, sigmoid).
    """
    # The embedding weights come from the pre-built matrix and are not trained.
    frozen_embedding = Embedding(
        vocab_size,
        300,
        weights=[embedding_matrix],
        input_length=max_length,
        trainable=False,
    )
    return Sequential([
        frozen_embedding,
        Flatten(),
        Dense(1, activation='sigmoid'),
    ])

In [40]:
# Create a fresh, not-yet-compiled model instance.
model = build_model()

In [41]:
# Log training metrics to 'bnonb_v1.csv' using ';' as the field separator;
# append=True adds to an existing log file rather than overwriting it.
csv_logger = CSVLogger(
    'bnonb_v1.csv',
    separator=';',
    append=True,
)

In [42]:
# compile the model: Adam optimizer, binary cross-entropy loss, accuracy as the metric
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['acc'])
# summarize the model; summary() prints the table itself and returns None,
# so wrapping it in print() only added a stray "None" line to the output
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 20512, 300)        107652300 
_________________________________________________________________
flatten_5 (Flatten)          (None, 6153600)           0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 6153601   
Total params: 113,805,901
Trainable params: 6,153,601
Non-trainable params: 107,652,300
_________________________________________________________________
None


In [44]:
with tf.device('/device:GPU:0'):
    # fit the model; the last 15% of the samples are held out for validation.
    # `labels` is a plain Python list of ints: convert it to a NumPy array so that
    # Keras treats it as one target vector (newer Keras versions reject a list of scalars).
    model.fit(padded_docs, array(labels), epochs=10, validation_split = 0.15,batch_size = 128,callbacks=[csv_logger])

Train on 109491 samples, validate on 19322 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [45]:
# Empty containers for the evaluation documents and their labels.
docs_test, labels_test = [], []

In [46]:
# Load the evaluation documents; every one of them is a human-rights article (label 1).
# docs_test is rebuilt from scratch instead of appended to, so re-running this cell
# no longer duplicates documents. The explicit UTF-8 encoding avoids depending on
# the platform default when reading the JSON file.
with open('/home/tigermlt/CS341/github_repo/CS341/data/10000.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
# records whose 'content' is null carry no text and are skipped
docs_test = [x['content'] for x in data if x['content'] is not None]
len1 = len(docs_test)
print('number of human rights docs is: '+str(len1))
labels_test = [1]*len1

number of human rights docs is: 7940


In [47]:
# Evaluation documents and labels must line up; print both counts, one per line.
print(len(docs_test), len(labels_test), sep='\n')

7940
7940


In [51]:
# Integer-encode the evaluation documents with the tokenizer `t` that was already fitted
# on the training corpus. Re-fitting a fresh Tokenizer here repeated an expensive pass
# over all training documents and, had `docs` changed in between, would have produced
# word indices that no longer match the rows of the embedding matrix.
encoded_docs_test = t.texts_to_sequences(docs_test)
# pad to the same max_length the model was built with
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')

In [55]:
# evaluate the model on the evaluation set.
# Every label in labels_test is 1, so the reported accuracy is the fraction of these
# documents that the model classifies as human-rights.
# labels_test is a plain Python list: convert it to a NumPy array so Keras treats it
# as one target vector (newer Keras versions reject a list of scalars).
loss, accuracy = model.evaluate(padded_docs_test, array(labels_test),batch_size = 128)
print(loss)
print(accuracy)

0.231481399817
0.939773299748


In [75]:
# Persist the trained model to 'binary_classification2.h5' in the current working directory.
# NOTE(review): the last cell of this notebook loads 'binary_classification.h5' from a
# different directory — confirm which file is meant to be evaluated.
model.save('binary_classification2.h5')

In [None]:
from keras.models import load_model
# load saved model weights
def load_trained_model(path):
    """Recreate the network with build_model() and load stored weights into it.

    Args:
        path: location of the saved weights file.

    Returns:
        The model with the weights loaded. It is not compiled here; the caller
        must compile it before evaluating.
    """
    restored = build_model()
    restored.load_weights(path)
    return restored

In [None]:
# Rebuild the network and load previously saved weights.
# NOTE(review): this path differs from the file written by model.save() earlier in the
# notebook ('binary_classification2.h5') — confirm the intended checkpoint.
model_test = load_trained_model('/home/tigermlt/CS341/wordEmbedding_keras/binary_classification.h5')
# compile the model (needed before evaluate, since load_trained_model returns it uncompiled)
model_test.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# evaluate the model; labels_test is a plain Python list, so convert it to a NumPy array
# for Keras to treat it as one target vector (newer Keras versions reject a list of scalars)
loss, accuracy = model_test.evaluate(padded_docs_test, array(labels_test))
print(loss)
print(accuracy)