goal:
=====

Build a model that takes a sparse matrix X = x_{i,j}, with i in [0, n] and j in [0, m], and y, an n-dimensional label vector. We then build a rank-k latent representation of the i's and j's such that we minimize ||y_i - \sum_j u_i * v_j||, i.e. the loss between an example's label and an inner product between the item's embedding and the embedding induced by all item factors.

In [2]:
# import this stuff
import time
import sys
from pylab import *
from scipy import sparse
import numpy as np

import tensorflow as tf

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import datasets
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

In [127]:
def factorize(observed_features,
              labels,
              observed_features_validation,
              labels_validation,
              rank,
              max_iter=100,
              verbose=False,
              lambda_v=0,
              lambda_k=0,
              lambda_w=0,
              lambda_constants=0,
              epsilon=0.001,
              optimizer=None,
              seed=12345):
    """Train a kernelised factorization classifier and score a validation set.

    For one example row ``x`` the model computes

        pred = sigmoid(w . x' + (V x')^T K (V x') + b_one)

    where ``x'`` is ``tf.nn.l2_normalize(x, dim=0)``.  The loss is the binary
    cross-entropy between ``pred`` and the label plus L2 penalties on ``V``,
    ``K``, ``w`` and ``b_one``.  Training feeds one row per optimizer step.

    Args:
        observed_features: 2-D training data exposing ``.shape`` and row
            indexing; row ``i`` is reshaped to ``(1, num_features)``.
        labels: per-row training labels, indexed in step with the rows.
        observed_features_validation: validation rows, same column count.
        labels_validation: per-row validation labels.
        rank: number of latent dimensions (``V`` is ``rank x num_features``,
            ``K`` is ``rank x rank``).
        max_iter: maximum number of passes over the training rows.
        verbose: print the average cost after every pass.
        lambda_v: L2 penalty weight on ``V``.
        lambda_k: L2 penalty weight on ``K``.
        lambda_w: L2 penalty weight on ``w``.
        lambda_constants: L2 penalty weight on ``b_one``.
        epsilon: stop once the relative change of the average cost between
            two passes drops below this value.
        optimizer: a TensorFlow optimizer exposing ``minimize``; a fresh
            ``tf.train.AdamOptimizer()`` is created when omitted.
        seed: seed for the truncated-normal initialisers.

    Returns:
        A tuple ``(predictions, mean_validation_cost, norm)`` where
        ``predictions`` is a list with one model output per validation row,
        ``mean_validation_cost`` is the average cost over the validation rows
        (``nan`` when there are none) and ``norm`` is the one-element list
        returned by evaluating ``tf.nn.l2_loss(V)``.
    """
    # Build the default optimizer per call instead of once at definition time,
    # so separate calls do not share one optimizer instance.
    if optimizer is None:
        optimizer = tf.train.AdamOptimizer()

    # Extract info about shapes etc from the training data
    num_items = observed_features.shape[0]
    num_features = observed_features.shape[1]
    num_validation = observed_features_validation.shape[0]

    V = tf.Variable(tf.truncated_normal([rank, num_features], stddev=0.2, mean=0, seed=seed),
                    name="feature_explainers")
    K = tf.Variable(tf.truncated_normal([rank, rank], stddev=0.2, mean=0, seed=seed),
                    name="kernel_matrix")
    w = tf.Variable(tf.truncated_normal([1, num_features], stddev=0.2, mean=0, seed=seed),
                    name="hyperplane")
    b_one = tf.Variable(tf.truncated_normal([1, 1], stddev=0.2, mean=0, seed=seed),
                        name="b_one")

    x = tf.placeholder(tf.float32, [None, num_features])
    y = tf.placeholder(tf.float32)

    # NOTE(review): rows are fed one at a time, so dim=0 normalises each
    # feature over a single-row batch rather than normalising the row itself;
    # confirm whether dim=1 was intended before changing it.
    norm_x = tf.nn.l2_normalize(x, dim=0)
    Vx = tf.matmul(V, tf.transpose(norm_x))
    right_kern = tf.matmul(K, Vx)

    full_kern = tf.matmul(tf.transpose(Vx), right_kern)
    linear = tf.matmul(w, tf.transpose(norm_x))

    # With a single fed row every term is 1x1, so reduce_sum just collapses
    # the result to a scalar.
    pred = tf.reduce_sum(tf.sigmoid(linear + full_kern + b_one))

    # The small constant keeps log() finite when pred saturates at 0 or 1.
    cost = (-y*tf.log(pred + 0.0000000001) - (1-y)*tf.log((1-pred + 0.0000000001)) +
            lambda_v*tf.nn.l2_loss(V) +
            lambda_k*tf.nn.l2_loss(K) +
            lambda_w*tf.nn.l2_loss(w) +
            lambda_constants*tf.nn.l2_loss(b_one))
    optimize = optimizer.minimize(cost)
    norm = tf.nn.l2_loss(V)

    init = tf.initialize_all_variables()

    with tf.Session() as sess:
        sess.run(init)
        last_cost = 1000000
        for epoch in range(max_iter):
            avg_cost = 0.0

            for i in range(num_items):
                _, c = sess.run([optimize, cost],
                                feed_dict={x: observed_features[i].reshape(1, num_features),
                                           y: labels[i]})
                avg_cost += c / num_items
            if verbose:
                print("epoch: %s, cost: %s" % (epoch+1, avg_cost))

            # Check for convergence on the relative change of the cost.  The
            # multiplicative form avoids dividing by a zero average cost.
            if abs(avg_cost - last_cost) < epsilon * abs(avg_cost):
                break

            last_cost = avg_cost

        if verbose:
            print("optimization finished")

        predictions = []
        total_costs = 0.0
        for i in range(num_validation):
            p, c = sess.run([pred, cost],
                            feed_dict={x: observed_features_validation[i].reshape(1, num_features),
                                       y: labels_validation[i]})
            predictions.append(p)
            total_costs += c

        if num_validation:
            mean_validation_cost = total_costs / num_validation
        else:
            mean_validation_cost = float("nan")
        return predictions, mean_validation_cost, sess.run([norm])

In [128]:
# Use a four-category subset of 20 Newsgroups as a small binary task for now.

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

ng = datasets.fetch_20newsgroups(categories=categories, shuffle=True)
# Binary target: 1 for documents whose target index is 2, 0 for everything else.
# NOTE(review): presumably index 2 is 'sci.med' -- confirm against ng.target_names.
labels = [1 if target == 2 else 0 for target in ng.target]

# decode_error must be one of 'strict', 'ignore' or 'replace'; the previous
# value (False) is not a valid setting.
# NOTE(review): 'ignore' is assumed to be the intent -- confirm.
tfidf = TfidfVectorizer(decode_error='ignore', min_df=5)

X_train, X_test, y_train, y_test = train_test_split(ng.data, labels, test_size=.3)
# Fit the vocabulary on the training split only, then densify both splits
# because factorize() reshapes and feeds individual rows.
X_train = tfidf.fit_transform(X_train).todense()
X_test = tfidf.transform(X_test).todense()


In [129]:
# Train a rank-10 model on the training split and report validation metrics.
r = 10
predictions, test_costs, norm = factorize(X_train, y_train, X_test, y_test, r, verbose=True, lambda_v=0.1, max_iter=30)
# Format inside the print call so the line behaves the same whether print is
# a statement (Python 2) or a function (Python 3).
print("rank: %s, cost: %s, overall AUC: %s, norm: %s"
      % (r, test_costs, roc_auc_score(y_test, predictions, average="weighted"), norm))

epoch: 1, cost: 14.7644729747
epoch: 2, cost: 0.990823795838
epoch: 3, cost: 0.429511815442
epoch: 4, cost: 0.210448009151
epoch: 5, cost: 0.0684814785086
epoch: 6, cost: 0.0566134448914
epoch: 7, cost: 0.0190250529797
epoch: 8, cost: 0.0118537945882
epoch: 9, cost: 0.00792866116062
epoch: 10, cost: 0.00540198576812
epoch: 11, cost: 0.00374920841605
epoch: 12, cost: 0.00263545102519
epoch: 13, cost: 0.00187683762442
epoch: 14, cost: 0.00135068240621
epoch: 15, cost: 0.000980452003276
epoch: 16, cost: 0.000716852295156
epoch: 17, cost: 0.000527545649762
epoch: 18, cost: 0.000390287711188
epoch: 19, cost: 0.000290090581925
epoch: 20, cost: 0.000216563647157
epoch: 21, cost: 0.000162370521566
epoch: 22, cost: 0.000122247620128
epoch: 23, cost: 9.24290049427e-05
epoch: 24, cost: 7.01999682665e-05
epoch: 25, cost: 5.35770215421e-05
epoch: 26, cost: 4.11150387887e-05
epoch: 27, cost: 3.1751667582e-05
epoch: 28, cost: 2.46970471439e-05
epoch: 29, cost: 1.93802776262e-05
epoch: 30, cost: 1.536

## 