goal:
=====

takes the latent factor logistic a bit farther in the direction of deep learning, adding biases and nonlinearites at each step.

In [1]:
# import this stuff
import time
import sys
from pylab import *
from scipy import sparse
import numpy as np

import tensorflow as tf

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import datasets
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

In [None]:
def factorize(observed_features,
              labels,
              observed_features_validation,
              labels_validation,
              rank,
              max_iter=100,
              batch_size = 100,
              verbose=False,
              lambda_v=0,
              lambda_u=0,
              lambda_constants=0,
              epsilon=0.001,
              optimizer=tf.train.AdamOptimizer(),
              seed=12345):

    # Extract info about shapes etc from the training data
    num_items = observed_features.shape[0]
    num_features = observed_features.shape[1]
    num_classes = labels.shape[1]
 
    U = tf.Variable(tf.truncated_normal([rank, num_features], stddev=0.2, mean=0, seed=seed), name="item_explainers")
    v_prime = tf.Variable(tf.truncated_normal([num_classes, rank], stddev=0.2, mean=0, seed=seed), name="hyperplane")
    b_one = tf.Variable(tf.truncated_normal([rank, 1], stddev=0.2, mean=0, seed=seed), name="b_one")
    b_two = tf.Variable(tf.truncated_normal([1, num_classes], stddev=0.2, mean=0, seed=seed), name="b_two")
   

    x = tf.placeholder(tf.float32, [None, num_features])
    y = tf.placeholder(tf.float32, [None, num_classes])
                           
    pred = tf.nn.softmax(b_two + tf.transpose(tf.matmul(v_prime, tf.nn.tanh(tf.matmul(U, tf.transpose(tf.nn.l2_normalize(x, dim=0)))) + b_one)))
    
    cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred + 0.0000000001), reduction_indices=1) + # this was causing nans if pred == 0
                          lambda_v*tf.nn.l2_loss(v_prime)  + # regularization for v
                          lambda_u*tf.nn.l2_loss(U) + # regularization for U
                          lambda_constants*(tf.nn.l2_loss(b_one) + tf.nn.l2_loss(b_two)))
    norm = tf.nn.l2_loss(v_prime)
    optimize = optimizer.minimize(cost)
    
    init = tf.initialize_all_variables()
    
    with tf.Session() as sess:
        sess.run(init)
        last_cost = 1000000
        for iter in range(0, max_iter):
            avg_cost = 0
            batches = int(np.ceil(num_items/batch_size))
            xs = np.array_split(observed_features, batches)
            ys = np.array_split(labels, batches)
            
            for i in range(batches):
                _, c, n = sess.run([optimize, cost, norm],
                                   feed_dict={x:xs[i], y:ys[i]})
                avg_cost += c / xs[i].shape[0]
            if verbose:
                print("epoch: %s, cost: %s, norm: %s" % (iter+1, avg_cost, n))

            # check for convergence
            if abs(avg_cost-last_cost)/avg_cost < epsilon:
                break
                
            last_cost = avg_cost
            
        if verbose:
            print("optimization finished")
        # test prediction
        predictions, test_costs, norm = sess.run([pred, cost, norm], feed_dict={x:observed_features_validation, y:labels_validation})
        return predictions, test_costs, norm

In [3]:
# use this data for now

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

ng = datasets.fetch_20newsgroups (categories=categories, shuffle=True)
encoder = OneHotEncoder(sparse=False)
labels = encoder.fit_transform(ng.target.reshape(-1,1))

tfidf = TfidfVectorizer(decode_error=False, min_df=5)

X_train, X_test, y_train, y_test = train_test_split(ng.data, labels, test_size=.3)
X_train = tfidf.fit_transform(X_train).todense()
X_test = tfidf.transform(X_test).todense()


In [6]:
r = 10
predictions, test_costs, norm = factorize(X_train, y_train, X_test, y_test, r, verbose=True, lambda_v=0.5, max_iter=300)
print("rank: %s, cost: %s, norm: %s") % (r, test_costs, norm)
for i in range(y_train.shape[1]):
    print("class %s AUC: %s") % (i, roc_auc_score(y_test[:,i], predictions[:,i]))
print("overall AUC: %s") % roc_auc_score(y_test, predictions, average="weighted")

epoch: 1, cost: 0.251515544714, norm: 0.571562
epoch: 2, cost: 0.224982089099, norm: 0.50757
epoch: 3, cost: 0.206132943847, norm: 0.459181
epoch: 4, cost: 0.191209653403, norm: 0.426653
epoch: 5, cost: 0.178604317076, norm: 0.408867
epoch: 6, cost: 0.167633029543, norm: 0.403772
epoch: 7, cost: 0.158005029011, norm: 0.40868
epoch: 8, cost: 0.149565987274, norm: 0.42065
epoch: 9, cost: 0.142213617952, norm: 0.436864
epoch: 10, cost: 0.13585215016, norm: 0.454891
epoch: 11, cost: 0.130408114779, norm: 0.472859
epoch: 12, cost: 0.125762103392, norm: 0.489517
epoch: 13, cost: 0.121801529044, norm: 0.504194
epoch: 14, cost: 0.118422362473, norm: 0.516653
epoch: 15, cost: 0.11552977887, norm: 0.526939
epoch: 16, cost: 0.113039929489, norm: 0.53525
epoch: 17, cost: 0.110885585399, norm: 0.541849
epoch: 18, cost: 0.109014151419, norm: 0.547007
epoch: 19, cost: 0.107373186718, norm: 0.550973
epoch: 20, cost: 0.105921770219, norm: 0.553978
epoch: 21, cost: 0.104630345037, norm: 0.55623
epoch: 2

## 