In [1]:
import os
import re
from collections import namedtuple, Counter
from multiprocessing.pool import Pool

import numpy as np
import pandas as pd
import spacy
import tflearn
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from script import *

hdf5 is not supported on this machine (please install/reinstall h5py for optimal experience)


In [2]:
print('loading data')
train_df = load_data('train.csv')

loading data
cleaned text data


In [3]:
tf_idf = TfidfVectorizer(
    stop_words=None,
    tokenizer=tokenize,
    preprocessor=None,
    sublinear_tf=True,
    use_idf=False,
    lowercase=True,
    min_df=4
)
documents = tf_idf.fit_transform(train_df['comment_text'])
labels = train_df.drop(['id', 'comment_text'], axis=1)
train_x, test_x, train_y, test_y = train_test_split(documents, labels, test_size=0.3)
test_x = np.array(test_x.toarray())
vocab_size = len(tf_idf.vocabulary_)

learning_rate = 0.01
epochs = 10
nclasses = 6


In [37]:
def batchify(x, y, batch_size=64):
    nsamples = x.shape[0]
    index = 0
    for offset in range(0, nsamples, batch_size):
        index += 1
        yield index, np.array(x[offset: batch_size + offset].toarray()), y[offset: batch_size + offset]
        
batch_size = 128

In [38]:
graph = tf.Graph()
with graph.as_default():
    initializer = xavier_initializer(dtype=tf.float32)
    x = tf.placeholder(tf.float32, [None, vocab_size], 'input')
    y = tf.placeholder(tf.float32, [None, nclasses], 'input_labels')
    logits = tf.contrib.layers.fully_connected(x, nclasses, tf.nn.sigmoid)
    (_, auc_update_op) = tf.metrics.auc(y, logits, curve='ROC')
    loss = tf.losses.sigmoid_cross_entropy(y, logits)
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('roc-auc', auc_update_op)
    
    optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss)
    
    with tf.Session(graph=graph) as sess:
        merged = tf.summary.merge_all()
        batch_writer = tf.summary.FileWriter('./logs/batch')
        epoch_writer = tf.summary.FileWriter('./logs/epoch')
        sess.run(tf.initialize_all_variables())
        total = int(train_x.shape[0] / batch_size)
        for epoch in range(epochs):
            accumulated_loss = 0
            sess.run(tf.initialize_local_variables())
            for batch_index, input_x, input_y in batchify(train_x, train_y, batch_size=batch_size):
                feed_dict = {
                    x: np.array(input_x),
                    y: np.array(input_y, dtype=np.float32)
                }
                summaries, _, total_loss_, train_auc = sess.run([merged, optimizer, loss, auc_update_op], feed_dict=feed_dict)
                print("epoch: {0}/{1}".format(epoch + 1, epochs))
                print("batch loss {0}".format(total_loss_))
                print("batch auc {0}".format(train_auc))
                print("batch: {0}/{1}".format(batch_index + 1, total))
                print("-----------------")
                batch_writer.add_summary(summaries, epoch * total + batch_index)
                sys.stdout.flush()

            summaries, epoch_test_loss, test_auc = sess.run(
                [merged, loss, auc_update_op], feed_dict={
                    x: test_x,
                    y: test_y
                }
            )
            epoch_writer.add_summary(summaries, (epoch + 1) * total)
            print("epoch: {0}/{1}".format(epoch + 1, epochs))
            print("val loss: {}".format(epoch_test_loss))
            print("val auc: {}".format(test_auc))
            print("-----------------")
            sys.stdout.flush()
            print("Epoch finished: {}".format(datetime.datetime.now().time()))
            print("=================")

epoch: 1/10
batch loss 0.9616785049438477
batch auc 0.549645185470581
batch: 2/872
-----------------
epoch: 1/10
batch loss 0.9536888599395752
batch auc 0.5258209705352783
batch: 3/872
-----------------
epoch: 1/10
batch loss 0.9536622166633606
batch auc 0.5080333352088928
batch: 4/872
-----------------
epoch: 1/10
batch loss 0.9534808993339539
batch auc 0.5054898858070374
batch: 5/872
-----------------
epoch: 1/10
batch loss 0.9546634554862976
batch auc 0.514611005783081
batch: 6/872
-----------------
epoch: 1/10
batch loss 0.9435660243034363
batch auc 0.5192984342575073
batch: 7/872
-----------------
epoch: 1/10
batch loss 0.9550978541374207
batch auc 0.5248149037361145
batch: 8/872
-----------------
epoch: 1/10
batch loss 0.9530219435691833
batch auc 0.534184455871582
batch: 9/872
-----------------
epoch: 1/10
batch loss 0.9464821219444275
batch auc 0.5335862636566162
batch: 10/872
-----------------
epoch: 1/10
batch loss 0.9547416567802429
batch auc 0.5346458554267883
batch: 11/872