In [1]:
import os
import re
from collections import namedtuple, Counter
from multiprocessing.pool import Pool

import numpy as np
import pandas as pd
import spacy
import tflearn
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from script import *

hdf5 is not supported on this machine (please install/reinstall h5py for optimal experience)


In [2]:
print('loading data')
train_df = load_data('train.csv')

loading data
cleaned text data


In [3]:
tf_idf = TfidfVectorizer(
    stop_words=None,
    tokenizer=tokenize,
    preprocessor=None,
    sublinear_tf=True,
    use_idf=False,
    lowercase=True,
    min_df=4
)
documents = tf_idf.fit_transform(train_df['comment_text'])
labels = train_df.drop(['id', 'comment_text'], axis=1)
train_x, test_x, train_y, test_y = train_test_split(documents, labels, test_size=0.3)
test_x = np.array(test_x.toarray())
vocab_size = len(tf_idf.vocabulary_)

learning_rate = 0.01
epochs = 10
nclasses = 6


In [24]:
def batchify(x, y, batch_size=64):
    nsamples = x.shape[0]
    index = 0
    for offset in range(0, nsamples, batch_size):
        index += 1
        yield index, np.array(x[offset: batch_size + offset].toarray()), y[offset: batch_size + offset]
        
batch_size = 512

In [None]:
graph = tf.Graph()
with graph.as_default():
    initializer = xavier_initializer(dtype=tf.float32)
    x = tf.placeholder(tf.float32, [None, vocab_size], 'input')
    y = tf.placeholder(tf.float32, [None, nclasses], 'input_labels')
    logits = tf.layers.dense(inputs=x, units=nclasses)
    loss = tf.losses.sigmoid_cross_entropy(y, logits)
    tf.summary.scalar('loss', loss)
    optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss)
    
    with tf.Session(graph=graph) as sess:
        merged = tf.summary.merge_all()
        batch_writer = tf.summary.FileWriter('./logs/batch')
        epoch_writer = tf.summary.FileWriter('./logs/epoch')
        tf.initialize_all_variables().run()
        total = int(train_x.shape[0] / batch_size)
        for epoch in range(epochs):
            accumulated_loss = 0
            for batch_index, input_x, input_y in batchify(train_x, train_y, batch_size=batch_size):
                feed_dict = {
                    x: np.array(input_x),
                    y: np.array(input_y)
                }
                summaries, _, total_loss_, = sess.run([merged, optimizer, loss], feed_dict=feed_dict)
                print("epoch: {0}/{1}".format(epoch + 1, epochs))
                print("batch loss {0}".format(total_loss_))
                print("batch: {0}/{1}".format(batch_index + 1, total))
                print("-----------------")
                batch_writer.add_summary(summaries, epoch * total + batch_index)
                sys.stdout.flush()

            summaries, epoch_test_loss, = sess.run(
                [merged, loss], feed_dict={
                    x: test_x,
                    y: test_y
                }
            )
            epoch_writer.add_summary(summaries, epoch)
            print("epoch: {0}/{1}".format(epoch + 1, epochs))
            print("val loss: {}".format(epoch_test_loss))
            print("-----------------")
            sys.stdout.flush()
            print("Epoch finished: {}".format(datetime.datetime.now().time()))
            print("=================")

epoch: 1/10
batch loss 0.6928598880767822
batch: 2/218
-----------------
epoch: 1/10
batch loss 0.6916782855987549
batch: 3/218
-----------------
epoch: 1/10
batch loss 0.6906533241271973
batch: 4/218
-----------------
epoch: 1/10
batch loss 0.689608097076416
batch: 5/218
-----------------
epoch: 1/10
batch loss 0.6884792447090149
batch: 6/218
-----------------
epoch: 1/10
batch loss 0.6872709393501282
batch: 7/218
-----------------
epoch: 1/10
batch loss 0.6862638592720032
batch: 8/218
-----------------
epoch: 1/10
batch loss 0.6855015754699707
batch: 9/218
-----------------
epoch: 1/10
batch loss 0.6844035983085632
batch: 10/218
-----------------
epoch: 1/10
batch loss 0.6834738850593567
batch: 11/218
-----------------
epoch: 1/10
batch loss 0.6824584007263184
batch: 12/218
-----------------
epoch: 1/10
batch loss 0.6816620230674744
batch: 13/218
-----------------
epoch: 1/10
batch loss 0.6808914542198181
batch: 14/218
-----------------
epoch: 1/10
batch loss 0.6797425746917725
batch

epoch: 1/10
batch loss 0.6240835785865784
batch: 113/218
-----------------
epoch: 1/10
batch loss 0.6257393956184387
batch: 114/218
-----------------
epoch: 1/10
batch loss 0.6234648823738098
batch: 115/218
-----------------
epoch: 1/10
batch loss 0.6237145662307739
batch: 116/218
-----------------
epoch: 1/10
batch loss 0.6234729290008545
batch: 117/218
-----------------
epoch: 1/10
batch loss 0.6212695240974426
batch: 118/218
-----------------
epoch: 1/10
batch loss 0.6219875812530518
batch: 119/218
-----------------
epoch: 1/10
batch loss 0.6220650672912598
batch: 120/218
-----------------
epoch: 1/10
batch loss 0.6200240254402161
batch: 121/218
-----------------
epoch: 1/10
batch loss 0.6195588111877441
batch: 122/218
-----------------
epoch: 1/10
batch loss 0.6203122735023499
batch: 123/218
-----------------
epoch: 1/10
batch loss 0.6216853260993958
batch: 124/218
-----------------
epoch: 1/10
batch loss 0.6193850636482239
batch: 125/218
-----------------
epoch: 1/10
batch loss 0.

epoch: 2/10
batch loss 0.5893001556396484
batch: 2/218
-----------------
epoch: 2/10
batch loss 0.5878915190696716
batch: 3/218
-----------------
epoch: 2/10
batch loss 0.5868710875511169
batch: 4/218
-----------------
epoch: 2/10
batch loss 0.58766108751297
batch: 5/218
-----------------
epoch: 2/10
batch loss 0.5891085267066956
batch: 6/218
-----------------
epoch: 2/10
batch loss 0.5833492279052734
batch: 7/218
-----------------
epoch: 2/10
batch loss 0.5860101580619812
batch: 8/218
-----------------
epoch: 2/10
batch loss 0.5876363515853882
batch: 9/218
-----------------
epoch: 2/10
batch loss 0.5851202607154846
batch: 10/218
-----------------
epoch: 2/10
batch loss 0.5856632590293884
batch: 11/218
-----------------
epoch: 2/10
batch loss 0.5842646956443787
batch: 12/218
-----------------
epoch: 2/10
batch loss 0.5856585502624512
batch: 13/218
-----------------
epoch: 2/10
batch loss 0.5858132839202881
batch: 14/218
-----------------
epoch: 2/10
batch loss 0.5840883851051331
batch:

epoch: 2/10
batch loss 0.5578833222389221
batch: 113/218
-----------------
epoch: 2/10
batch loss 0.5620038509368896
batch: 114/218
-----------------
epoch: 2/10
batch loss 0.5579463243484497
batch: 115/218
-----------------
epoch: 2/10
batch loss 0.5589377880096436
batch: 116/218
-----------------
epoch: 2/10
batch loss 0.5591040849685669
batch: 117/218
-----------------
epoch: 2/10
batch loss 0.5550990104675293
batch: 118/218
-----------------
epoch: 2/10
batch loss 0.5572088360786438
batch: 119/218
-----------------
epoch: 2/10
batch loss 0.5580549240112305
batch: 120/218
-----------------
epoch: 2/10
batch loss 0.5544746518135071
batch: 121/218
-----------------
epoch: 2/10
batch loss 0.5542860627174377
batch: 122/218
-----------------
epoch: 2/10
batch loss 0.5562064051628113
batch: 123/218
-----------------
epoch: 2/10
batch loss 0.5594092011451721
batch: 124/218
-----------------
epoch: 2/10
batch loss 0.5554588437080383
batch: 125/218
-----------------
epoch: 2/10
batch loss 0.

epoch: 3/10
batch loss 0.538675844669342
batch: 2/218
-----------------
epoch: 3/10
batch loss 0.5366336703300476
batch: 3/218
-----------------
epoch: 3/10
batch loss 0.5353325009346008
batch: 4/218
-----------------
epoch: 3/10
batch loss 0.5368154644966125
batch: 5/218
-----------------
epoch: 3/10
batch loss 0.5393785834312439
batch: 6/218
-----------------
epoch: 3/10
batch loss 0.5307703018188477
batch: 7/218
-----------------
epoch: 3/10
batch loss 0.5351234078407288
batch: 8/218
-----------------
epoch: 3/10
batch loss 0.5377225279808044
batch: 9/218
-----------------
epoch: 3/10
batch loss 0.5342082381248474
batch: 10/218
-----------------
epoch: 3/10
batch loss 0.5352749228477478
batch: 11/218
-----------------
epoch: 3/10
batch loss 0.533447802066803
batch: 12/218
-----------------
epoch: 3/10
batch loss 0.535696804523468
batch: 13/218
-----------------
epoch: 3/10
batch loss 0.5361594557762146
batch: 14/218
-----------------
epoch: 3/10
batch loss 0.5337929725646973
batch: 

epoch: 3/10
batch loss 0.5144199728965759
batch: 113/218
-----------------
epoch: 3/10
batch loss 0.5202513337135315
batch: 114/218
-----------------
epoch: 3/10
batch loss 0.5148318409919739
batch: 115/218
-----------------
epoch: 3/10
batch loss 0.5162703990936279
batch: 116/218
-----------------
epoch: 3/10
batch loss 0.516668438911438
batch: 117/218
-----------------
epoch: 3/10
batch loss 0.5112972259521484
batch: 118/218
-----------------
epoch: 3/10
batch loss 0.5143654942512512
batch: 119/218
-----------------
epoch: 3/10
batch loss 0.5157000422477722
batch: 120/218
-----------------
epoch: 3/10
batch loss 0.5109592080116272
batch: 121/218
-----------------
epoch: 3/10
batch loss 0.5109348297119141
batch: 122/218
-----------------
epoch: 3/10
batch loss 0.5136187076568604
batch: 123/218
-----------------
epoch: 3/10
batch loss 0.5180529356002808
batch: 124/218
-----------------
epoch: 3/10
batch loss 0.5128712058067322
batch: 125/218
-----------------
epoch: 3/10
batch loss 0.5

epoch: 4/10
batch loss 0.5021196007728577
batch: 2/218
-----------------
epoch: 4/10
batch loss 0.49954068660736084
batch: 3/218
-----------------
epoch: 4/10
batch loss 0.4980049133300781
batch: 4/218
-----------------
epoch: 4/10
batch loss 0.4999978542327881
batch: 5/218
-----------------
epoch: 4/10
batch loss 0.5034002661705017
batch: 6/218
-----------------
epoch: 4/10
batch loss 0.4925649166107178
batch: 7/218
-----------------
epoch: 4/10
batch loss 0.49819013476371765
batch: 8/218
-----------------
epoch: 4/10
batch loss 0.501498281955719
batch: 9/218
-----------------
epoch: 4/10
batch loss 0.4972020089626312
batch: 10/218
-----------------
epoch: 4/10
batch loss 0.49864181876182556
batch: 11/218
-----------------
epoch: 4/10
batch loss 0.49647703766822815
batch: 12/218
-----------------
epoch: 4/10
batch loss 0.49934086203575134
batch: 13/218
-----------------
epoch: 4/10
batch loss 0.5000172853469849
batch: 14/218
-----------------
epoch: 4/10
batch loss 0.49713134765625
ba

epoch: 4/10
batch loss 0.48116663098335266
batch: 113/218
-----------------
epoch: 4/10
batch loss 0.48838213086128235
batch: 114/218
-----------------
epoch: 4/10
batch loss 0.4818257987499237
batch: 115/218
-----------------
epoch: 4/10
batch loss 0.48361244797706604
batch: 116/218
-----------------
epoch: 4/10
batch loss 0.484188437461853
batch: 117/218
-----------------
epoch: 4/10
batch loss 0.4776923656463623
batch: 118/218
-----------------
epoch: 4/10
batch loss 0.48152264952659607
batch: 119/218
-----------------
epoch: 4/10
batch loss 0.4832368791103363
batch: 120/218
-----------------
epoch: 4/10
batch loss 0.4775411784648895
batch: 121/218
-----------------
epoch: 4/10
batch loss 0.477642297744751
batch: 122/218
-----------------
epoch: 4/10
batch loss 0.4809230864048004
batch: 123/218
-----------------
epoch: 4/10
batch loss 0.4863344430923462
batch: 124/218
-----------------
epoch: 4/10
batch loss 0.48014989495277405
batch: 125/218
-----------------
epoch: 4/10
batch loss

Epoch finished: 15:09:00.407509
epoch: 5/10
batch loss 0.4731987416744232
batch: 2/218
-----------------
epoch: 5/10
batch loss 0.47016096115112305
batch: 3/218
-----------------
epoch: 5/10
batch loss 0.4684186279773712
batch: 4/218
-----------------
epoch: 5/10
batch loss 0.4708259105682373
batch: 5/218
-----------------
epoch: 5/10
batch loss 0.4749206006526947
batch: 6/218
-----------------
epoch: 5/10
batch loss 0.4622136056423187
batch: 7/218
-----------------
epoch: 5/10
batch loss 0.4688901901245117
batch: 8/218
-----------------
epoch: 5/10
batch loss 0.47277721762657166
batch: 9/218
-----------------
epoch: 5/10
batch loss 0.46781912446022034
batch: 10/218
-----------------
epoch: 5/10
batch loss 0.46955880522727966
batch: 11/218
-----------------
epoch: 5/10
batch loss 0.46711182594299316
batch: 12/218
-----------------
epoch: 5/10
batch loss 0.4704795777797699
batch: 13/218
-----------------
epoch: 5/10
batch loss 0.4713209569454193
batch: 14/218
-----------------
epoch: 5/

epoch: 5/10
batch loss 0.4611947536468506
batch: 112/218
-----------------
epoch: 5/10
batch loss 0.45411697030067444
batch: 113/218
-----------------
epoch: 5/10
batch loss 0.46251606941223145
batch: 114/218
-----------------
epoch: 5/10
batch loss 0.4549715518951416
batch: 115/218
-----------------
epoch: 5/10
batch loss 0.45705389976501465
batch: 116/218
-----------------
epoch: 5/10
batch loss 0.4577809274196625
batch: 117/218
-----------------
epoch: 5/10
batch loss 0.4503210484981537
batch: 118/218
-----------------
epoch: 5/10
batch loss 0.45478859543800354
batch: 119/218
-----------------
epoch: 5/10
batch loss 0.4568193852901459
batch: 120/218
-----------------
epoch: 5/10
batch loss 0.4503035545349121
batch: 121/218
-----------------
epoch: 5/10
batch loss 0.45050737261772156
batch: 122/218
-----------------
epoch: 5/10
batch loss 0.4542856514453888
batch: 123/218
-----------------
epoch: 5/10
batch loss 0.4605213701725006
batch: 124/218
-----------------
epoch: 5/10
batch lo

epoch: 5/10
val loss: 0.4446912705898285
-----------------
Epoch finished: 15:10:24.055268
epoch: 6/10
batch loss 0.44926539063453674
batch: 2/218
-----------------
epoch: 6/10
batch loss 0.44583067297935486
batch: 3/218
-----------------
epoch: 6/10
batch loss 0.44389984011650085
batch: 4/218
-----------------
epoch: 6/10
batch loss 0.4466587007045746
batch: 5/218
-----------------
epoch: 6/10
batch loss 0.45134928822517395
batch: 6/218
-----------------
epoch: 6/10
batch loss 0.43701115250587463
batch: 7/218
-----------------
epoch: 6/10
batch loss 0.4445957839488983
batch: 8/218
-----------------
epoch: 6/10
batch loss 0.4489801824092865
batch: 9/218
-----------------
epoch: 6/10
batch loss 0.44343921542167664
batch: 10/218
-----------------
epoch: 6/10
batch loss 0.4454331696033478
batch: 11/218
-----------------
epoch: 6/10
batch loss 0.44274115562438965
batch: 12/218
-----------------
epoch: 6/10
batch loss 0.446548193693161
batch: 13/218
-----------------
epoch: 6/10
batch loss 

epoch: 6/10
batch loss 0.4384298324584961
batch: 111/218
-----------------
epoch: 6/10
batch loss 0.4392150640487671
batch: 112/218
-----------------
epoch: 6/10
batch loss 0.43134450912475586
batch: 113/218
-----------------
epoch: 6/10
batch loss 0.44078636169433594
batch: 114/218
-----------------
epoch: 6/10
batch loss 0.4323618412017822
batch: 115/218
-----------------
epoch: 6/10
batch loss 0.43470725417137146
batch: 116/218
-----------------
epoch: 6/10
batch loss 0.43556854128837585
batch: 117/218
-----------------
epoch: 6/10
batch loss 0.427262544631958
batch: 118/218
-----------------
epoch: 6/10
batch loss 0.43227827548980713
batch: 119/218
-----------------
epoch: 6/10
batch loss 0.43458202481269836
batch: 120/218
-----------------
epoch: 6/10
batch loss 0.4273446500301361
batch: 121/218
-----------------
epoch: 6/10
batch loss 0.4276345670223236
batch: 122/218
-----------------
epoch: 6/10
batch loss 0.431841641664505
batch: 123/218
-----------------
epoch: 6/10
batch los