goal:
=====

Basically a factorization machine with cross-entropy loss, where the interaction effects come from deep, nonlinear, ReLU-activated embeddings combined with an additional "metric" kernel matrix.

TODO: add dropout. Currently the cost function has no regularization on the interaction layers; this can be handled with FTRL optimization.

In [1]:
# import this stuff
import time
import sys
from pylab import *
from scipy import sparse
import numpy as np

import tensorflow as tf

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import datasets
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

In [11]:
def make_embeddings(x, rank, num_features, depth=1, seed=12345):
    """Build a stack of ReLU layers that embeds `x` into `rank` dimensions.

    The first layer left-multiplies `x` by a [rank, num_features] weight
    matrix; each of the remaining `depth - 1` layers left-multiplies the
    previous activation by a [rank, rank] matrix.  Every layer adds a
    [rank, 1] bias and applies ReLU, so all hidden layers have width `rank`.

    Args:
        x: input tensor, laid out so that `tf.matmul(V, x)` is valid for a
            [rank, num_features] matrix `V` (i.e. features along axis 0).
        rank: width of every layer.
        num_features: size of the feature axis of `x`.
        depth: total number of layers; must be positive.
        seed: op-level seed handed to every `tf.truncated_normal` initializer.

    Returns:
        The activation tensor of the last ReLU layer.

    Raises:
        ValueError: if `depth` is not positive.
    """
    # Explicit check instead of `assert`, which disappears under `python -O`.
    if depth <= 0:
        raise ValueError("depth must be positive, got %r" % (depth,))

    def _relu_layer(inputs, fan_in, index):
        # Layers are numbered from 1 so that every variable gets a distinct
        # name (v_1/b_1, v_2/b_2, ...) instead of reusing "v_1" for two layers.
        # NOTE(review): every initializer receives the same `seed`, so
        # variables of identical shape may start from identical values --
        # confirm whether per-layer seeds are wanted.
        weights = tf.Variable(
            tf.truncated_normal([rank, fan_in], stddev=0.2, mean=0, seed=seed),
            name="v_%s" % index)
        bias = tf.Variable(
            tf.truncated_normal([rank, 1], stddev=0.2, mean=0, seed=seed),
            name="b_%s" % index)
        return tf.nn.relu(tf.matmul(weights, inputs) + bias)

    Vx = _relu_layer(x, num_features, 1)
    for index in range(2, depth + 1):
        Vx = _relu_layer(Vx, rank, index)

    return Vx

def factorize(observed_features,
              labels,
              observed_features_validation,
              labels_validation,
              rank,
              max_iter=100,
              verbose=False,
              lambda_v=0,
              lambda_k=0,
              lambda_w=0,
              lambda_constants=0,
              epsilon=0.001,
              optimizer=None,
              depth=3,
              seed=12345):
    """Train the deep factorization model and score a validation set.

    The model output for one example is
    ``sigmoid(w . x + Vx^T K Vx + b)``, where ``Vx`` is the embedding built
    by `make_embeddings`, ``K`` is a [rank, rank] "metric" matrix, ``w`` is a
    linear hyperplane and ``b`` a scalar bias.  Training feeds one example
    per step and minimizes cross entropy plus L2 penalties on K, w and b.

    Args:
        observed_features: 2-D training matrix; rows are indexed with
            `observed_features[i]` and reshaped to (1, num_features).
        labels: training targets, indexed per row.
        observed_features_validation: 2-D validation matrix.
        labels_validation: validation targets, indexed per row.
        rank: width of the embedding layers and size of K.
        max_iter: maximum number of passes over the training data.
        verbose: print the average cost after every epoch.
        lambda_v: accepted for interface compatibility but currently NOT
            used -- the embedding weights are not regularized in the cost.
        lambda_k: L2 penalty weight on K.
        lambda_w: L2 penalty weight on w.
        lambda_constants: L2 penalty weight on b.
        epsilon: relative change of the average epoch cost below which
            training stops.
        optimizer: a TensorFlow optimizer instance; a fresh
            `tf.train.AdamOptimizer()` is created when omitted.
        depth: number of embedding layers passed to `make_embeddings`.
        seed: op-level seed for all variable initializers.

    Returns:
        A tuple `(predictions, mean_validation_cost, norm)` where
        `predictions` is a list with one model output per validation row,
        `mean_validation_cost` is the average of the per-row cost, and
        `norm` is the single-element list returned by `sess.run([norm])`
        (the value of `tf.nn.l2_loss(w)`).

    Raises:
        ValueError: if the training or validation set has no rows.
    """
    # Extract info about shapes etc from the training data
    num_items = observed_features.shape[0]
    num_features = observed_features.shape[1]
    num_validation = observed_features_validation.shape[0]
    if num_items == 0 or num_validation == 0:
        # Both averages below divide by these counts.
        raise ValueError("training and validation sets must both be non-empty")

    # Build the default per call: a default created in the signature would be
    # evaluated once at definition time and shared by every call.
    if optimizer is None:
        optimizer = tf.train.AdamOptimizer()

    K = tf.Variable(tf.truncated_normal([rank, rank], stddev=0.2, mean=0, seed=seed), name="metric_matrix")

    w = tf.Variable(tf.truncated_normal([1, num_features], stddev=0.2, mean=0, seed=seed), name="hyperplane")
    b = tf.Variable(tf.truncated_normal([1, 1], stddev=0.2, mean=0, seed=seed), name="b_one")

    x = tf.placeholder(tf.float32, [None, num_features])
    y = tf.placeholder(tf.float32)

    # NOTE(review): dim=0 normalizes across the rows of the fed batch, and
    # every feed below is a single row -- confirm that per-example
    # normalization (dim=1) was not intended.
    norm_x = tf.nn.l2_normalize(x, dim=0)

    # Interaction term: Vx^T K Vx on the (transposed) normalized input.
    Vx = make_embeddings(tf.transpose(norm_x), rank, num_features, depth=depth, seed=seed)
    right_kern = tf.matmul(K, Vx)

    full_kern = tf.matmul(tf.transpose(Vx), right_kern)
    linear = tf.matmul(w, tf.transpose(norm_x))

    # reduce_sum collapses the result to a scalar prediction.
    pred = tf.reduce_sum(tf.sigmoid(linear + full_kern + b))

    # todo: dropout. currently no regularization on the interaction layers in the cost function
    # can handle with FTRL optimization
    # The 1e-10 offsets keep tf.log away from log(0).
    cost = tf.reduce_mean(-y*tf.log(pred + 0.0000000001) - (1-y)*tf.log((1-pred + 0.0000000001)) +
            lambda_k*tf.nn.l2_loss(K) +
            lambda_w*tf.nn.l2_loss(w) +
            lambda_constants*tf.nn.l2_loss(b))
    optimize = optimizer.minimize(cost)
    norm = tf.reduce_mean(tf.nn.l2_loss(w))

    init = tf.initialize_all_variables()

    with tf.Session() as sess:
        sess.run(init)
        last_cost = 1000000
        for epoch in range(max_iter):
            avg_cost = 0

            # One optimizer step per training example.
            for i in range(num_items):
                _, c = sess.run([optimize, cost],
                                feed_dict={x: observed_features[i].reshape(1, num_features), y: labels[i]})
                avg_cost += c / num_items
            if verbose:
                print("epoch: %s, cost: %s" % (epoch+1, avg_cost))

            # check for convergence: relative change of the epoch cost,
            # written without a division so a zero cost cannot blow up.
            if abs(avg_cost - last_cost) <= epsilon * abs(avg_cost):
                break

            last_cost = avg_cost

        if verbose:
            print("optimization finished")

        # Score the validation set one row at a time.
        predictions = []
        total_costs = 0
        for i in range(num_validation):
            p, c = sess.run([pred, cost],
                            feed_dict={x: observed_features_validation[i].reshape(1, num_features),
                                       y: labels_validation[i]})
            predictions.append(p)
            total_costs += c
        return predictions, total_costs/num_validation, sess.run([norm])

In [12]:
# use this data for now

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

ng = datasets.fetch_20newsgroups(categories=categories, shuffle=True)
# Binary target: 1 for documents whose class index is 2, 0 for all others.
# NOTE(review): index 2 refers to ng.target_names[2], presumably 'sci.med' -- verify.
labels = [1 if target == 2 else 0 for target in ng.target]

# decode_error only accepts 'strict', 'ignore' or 'replace'; the previous
# value `False` is not a valid option.  It only matters for bytes input.
tfidf = TfidfVectorizer(decode_error='ignore', min_df=5)

# Split the raw documents first so the vocabulary and IDF weights are fitted
# on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(ng.data, labels, test_size=.3)
X_train = tfidf.fit_transform(X_train).todense()
X_test = tfidf.transform(X_test).todense()


In [13]:
r = 10
# NOTE: lambda_v is accepted by factorize but does not appear in its cost,
# so this run is effectively unregularized.
predictions, test_costs, norm = factorize(X_train, y_train, X_test, y_test, r, verbose=True, lambda_v=0.1, max_iter=300)
auc = roc_auc_score(y_test, predictions, average="weighted")
# Format inside the print call: `print(...) % (...)` applies % to the return
# value of print, which fails on Python 3.
print("rank: %s, cost: %s, overall AUC: %s, norm: %s" % (r, test_costs, auc, norm))

epoch: 1, cost: 0.33360196478
epoch: 2, cost: 0.0401097349682
epoch: 3, cost: 0.00931769804643
epoch: 4, cost: 0.00247838839455
epoch: 5, cost: 0.000511754474805
epoch: 6, cost: 0.000126325859087
epoch: 7, cost: 4.04279771517e-05
epoch: 8, cost: 1.0098687531e-05
epoch: 9, cost: 2.85424085244e-06
epoch: 10, cost: 8.69213185562e-07
epoch: 11, cost: 2.59260408291e-07
epoch: 12, cost: 8.90110950503e-08
epoch: 13, cost: 3.41245779331e-08
epoch: 14, cost: 1.30231913253e-08
epoch: 15, cost: 5.20927401194e-09
epoch: 16, cost: 1.92516616005e-09
epoch: 17, cost: 6.79470328429e-10
epoch: 18, cost: 2.64238450556e-10
epoch: 19, cost: 7.54967001588e-11
epoch: 20, cost: 7.54967001588e-11
optimization finished
rank: 10, cost: 0.533316462163, overall AUC: 0.96627069576, norm: [111.30756]


## 

In [16]:
# with some regularization via the optimizer
r = 10
ftrl = tf.train.FtrlOptimizer(0.1, l2_regularization_strength=0.1)
predictions, test_costs, norm = factorize(X_train, y_train, X_test, y_test, r, verbose=True, max_iter=300, optimizer=ftrl)
auc = roc_auc_score(y_test, predictions, average="weighted")
# Format inside the print call: `print(...) % (...)` applies % to the return
# value of print, which fails on Python 3.
print("rank: %s, cost: %s, overall AUC: %s, norm: %s" % (r, test_costs, auc, norm))

epoch: 1, cost: 0.163082238403
epoch: 2, cost: 0.0300150069038
epoch: 3, cost: 0.020797190683
epoch: 4, cost: 0.0161998612776
epoch: 5, cost: 0.0133454948845
epoch: 6, cost: 0.0113824729583
epoch: 7, cost: 0.00994217756297
epoch: 8, cost: 0.00883659067324
epoch: 9, cost: 0.00795914143351
epoch: 10, cost: 0.0072446552217
epoch: 11, cost: 0.00665089781192
epoch: 12, cost: 0.00614922306421
epoch: 13, cost: 0.00571947609275
epoch: 14, cost: 0.00534703658665
epoch: 15, cost: 0.00502103509521
epoch: 16, cost: 0.00473320190374
epoch: 17, cost: 0.00447714238629
epoch: 18, cost: 0.00424782052873
epoch: 19, cost: 0.00404122097294
epoch: 20, cost: 0.00385409398374
epoch: 21, cost: 0.00368379288721
epoch: 22, cost: 0.00352812143286
epoch: 23, cost: 0.00338526291288
epoch: 24, cost: 0.00325368161432
epoch: 25, cost: 0.00313208631847
epoch: 26, cost: 0.00301937139572
epoch: 27, cost: 0.00291459122074
epoch: 28, cost: 0.00281693069027
epoch: 29, cost: 0.00272568343692
epoch: 30, cost: 0.0026402327833