goal:
=====

multiclass logistic regression with the inner product replaced by a more general kernel function

In [1]:
# import this stuff
import time
import sys
from pylab import *
from scipy import sparse
import numpy as np

import tensorflow as tf

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import datasets
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

In [136]:
def factorize(observed_features,
              labels,
              observed_features_validation,
              labels_validation,
              max_iter=100,
              verbose=False,
              lambda_v=0,
              kernel='linear',
              epsilon=0.0001,
              optimizer=None,
              seed=12345):
    """Fit a kernelized multiclass softmax model and score a validation set.

    One weight row per class is learned in ``v``. For every example the
    per-class score is a kernel between ``v`` and the example, the scores go
    through a softmax, and the cost is the cross-entropy against ``y`` plus
    ``lambda_v * tf.nn.l2_loss(v)``. Training takes one optimizer step per
    example (batch size 1), visiting the rows in the order given.

    Args:
        observed_features: 2-D training features, indexable by row and
            exposing ``.shape``; row ``i`` is reshaped to
            ``(1, num_features)`` before being fed.
        labels: 2-D training targets with one column per class
            (presumably one-hot rows; confirm against the caller).
        observed_features_validation: validation features, same column
            layout as ``observed_features``.
        labels_validation: validation targets, same layout as ``labels``.
        max_iter: maximum number of passes over the training rows.
        verbose: when true, print the average cost and the last observed
            ``l2_loss(v)`` after every pass, and a message when done.
        lambda_v: weight of the ``l2_loss(v)`` regularization term.
        kernel: ``'linear'``, ``'gaussian'`` or ``'quadratic'``.
        epsilon: training stops early once the relative change of the
            average cost between two passes falls below this value.
        optimizer: object exposing ``minimize(cost)``. ``None`` (default)
            creates a fresh ``tf.train.AdamOptimizer()`` for this call.
        seed: seed for the truncated-normal initialisation of ``v``.

    Returns:
        A 3-tuple ``(predictions, mean_validation_cost, norm)``:
        ``predictions`` is an array with one softmax row per validation
        example, ``mean_validation_cost`` is the mean of ``cost`` (the
        regularization term included) over the validation rows, and
        ``norm`` is the one-element list returned by ``sess.run([norm])``.

    Raises:
        ValueError: if ``kernel`` is not one of the supported names, or if
            the training or validation set has no rows.

    Note:
        The ops are added to the current default graph, so repeated calls
        in one process keep adding nodes to that graph.
    """
    # Extract info about shapes etc from the training data
    num_items = observed_features.shape[0]
    num_features = observed_features.shape[1]
    num_classes = labels.shape[1]
    num_validation = observed_features_validation.shape[0]

    # Fail fast, before any graph construction or training, instead of
    # hitting a NameError / ZeroDivisionError further down.
    if num_items == 0:
        raise ValueError("factorize needs at least one training example")
    if num_validation == 0:
        raise ValueError("factorize needs at least one validation example")
    if kernel not in ('linear', 'gaussian', 'quadratic'):
        raise ValueError("unknown kernel: %s" % kernel)

    # Build the optimizer per call: a default created in the signature would
    # be a single instance shared by every call of this function.
    if optimizer is None:
        optimizer = tf.train.AdamOptimizer()

    v = tf.Variable(tf.truncated_normal([num_classes, num_features], stddev=0.2, mean=0, seed=seed), name="hyperplane")

    x = tf.placeholder(tf.float32, [1, num_features])
    y = tf.placeholder(tf.float32, [1, num_classes])

    if kernel == 'linear':
        ip = tf.matmul(v, tf.transpose(x))
    elif kernel == 'gaussian':
        # x (1 x num_features) is subtracted from every row of v.
        ip = tf.reshape(tf.exp(-tf.reduce_sum(tf.square(tf.sub(v, x)), reduction_indices=1)/10), [num_classes, 1])
    else:  # 'quadratic' -- the only remaining validated choice
        affine = 1 + tf.matmul(v, tf.transpose(x))
        ip = affine * affine

    pred = tf.nn.softmax(tf.transpose(ip))
    cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred + 0.0000000001), reduction_indices=1) + # this was causing nans if pred == 0
                          lambda_v*tf.nn.l2_loss(v)) # regularization for v

    norm = tf.nn.l2_loss(v)
    optimize = optimizer.minimize(cost)

    init = tf.initialize_all_variables()

    with tf.Session() as sess:
        sess.run(init)
        last_cost = 1000000
        for epoch in range(max_iter):
            avg_cost = 0

            for i in range(num_items):
                _, c, n = sess.run([optimize, cost, norm],
                                   feed_dict={x: np.asarray(observed_features[i]).reshape(1, num_features),
                                              y: np.asarray(labels[i]).reshape(1, num_classes)})
                avg_cost += c / num_items
            if verbose:
                print("epoch: %s, cost: %s, norm: %s" % (epoch+1, avg_cost, n))

            # check for convergence: relative change below epsilon, written
            # without a division so a zero or negative average cost is safe
            if abs(avg_cost - last_cost) < epsilon * abs(avg_cost):
                break

            last_cost = avg_cost

        if verbose:
            print("optimization finished")
        # test prediction
        predictions = []
        total_costs = 0
        for i in range(num_validation):
            p, c = sess.run([pred, cost],
                            feed_dict={x: np.asarray(observed_features_validation[i]).reshape(1, num_features),
                                       y: np.asarray(labels_validation[i]).reshape(1, num_classes)})
            predictions.append(p)
            total_costs += c
        return np.array([z[0,:] for z in predictions]), total_costs/num_validation, sess.run([norm])

In [134]:
# Load a four-category subset of 20 newsgroups, one-hot encode the targets
# and build dense TF-IDF features for a 70/30 train/test split.

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

ng = datasets.fetch_20newsgroups(categories=categories, shuffle=True)
encoder = OneHotEncoder(sparse=False)
labels = encoder.fit_transform(ng.target.reshape(-1,1))

# decode_error has to be one of 'strict', 'ignore' or 'replace'; the boolean
# False is not a valid value for it.
tfidf = TfidfVectorizer(decode_error='ignore', min_df=5)

# Fixed random_state so the split (and every metric below) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(ng.data, labels, test_size=.3, random_state=12345)
# toarray() gives plain ndarrays; todense() would return np.matrix objects.
X_train = tfidf.fit_transform(X_train).toarray()
X_test = tfidf.transform(X_test).toarray()


In [None]:
# Train with the linear kernel and report validation cost and AUC.
predictions, test_costs, norm = factorize(X_train, y_train, X_test, y_test, verbose=True, lambda_v=0.001, max_iter=100, kernel='linear')
# Format inside the print call: this works whether print is a statement
# (Python 2) or a function (Python 3).
print("cost: %s, norm: %s" % (test_costs, norm))
for i in range(y_train.shape[1]):
    print("class %s AUC: %s" % (i, roc_auc_score(y_test[:,i], predictions[:,i])))
print("overall AUC: %s" % roc_auc_score(y_test, predictions, average="weighted"))

epoch: 1, cost: 1.31251387962, norm: 125.028
epoch: 2, cost: 0.971380446533, norm: 153.94
epoch: 3, cost: 0.842345043762, norm: 184.889
epoch: 4, cost: 0.786399896987, norm: 206.277
epoch: 5, cost: 0.759373590718, norm: 219.928
epoch: 6, cost: 0.745056213477, norm: 228.823
epoch: 7, cost: 0.736924188774, norm: 234.92
epoch: 8, cost: 0.732061335828, norm: 239.303
epoch: 9, cost: 0.729035899355, norm: 242.564
epoch: 10, cost: 0.727092756566, norm: 245.05
epoch: 11, cost: 0.72581127822, norm: 246.975
epoch: 12, cost: 0.724946805935, norm: 248.485
epoch: 13, cost: 0.724351918588, norm: 249.68
epoch: 14, cost: 0.723935142358, norm: 250.633
epoch: 15, cost: 0.723638324608, norm: 251.396
epoch: 16, cost: 0.723423713291, norm: 252.012
epoch: 17, cost: 0.723266330624, norm: 252.51
epoch: 18, cost: 0.723149394257, norm: 252.916
epoch: 19, cost: 0.723061424388, norm: 253.247
epoch: 20, cost: 0.722994494412, norm: 253.518
optimization finished
cost: 0.786358038189, norm: [253.51556]
class 0 AUC: 0

In [None]:
# Train with the quadratic kernel and report validation cost and AUC.
predictions, test_costs, norm = factorize(X_train, y_train, X_test, y_test, verbose=True, lambda_v=0.001, max_iter=100, kernel='quadratic')
# Format inside the print call: this works whether print is a statement
# (Python 2) or a function (Python 3).
print("cost: %s, norm: %s" % (test_costs, norm))
for i in range(y_train.shape[1]):
    print("class %s AUC: %s" % (i, roc_auc_score(y_test[:,i], predictions[:,i])))
print("overall AUC: %s" % roc_auc_score(y_test, predictions, average="weighted"))

epoch: 1, cost: 1.15101790787, norm: 155.938
epoch: 2, cost: 0.546666116485, norm: 160.701
epoch: 3, cost: 0.352679439979, norm: 154.875
epoch: 4, cost: 0.278717816093, norm: 146.835
epoch: 5, cost: 0.243684271399, norm: 141.664
epoch: 6, cost: 0.225549640942, norm: 139.085

In [None]:
# Train with the gaussian kernel and report validation cost and AUC.
predictions, test_costs, norm = factorize(X_train, y_train, X_test, y_test, verbose=True, lambda_v=0.001, max_iter=100, kernel='gaussian')
# Format inside the print call: this works whether print is a statement
# (Python 2) or a function (Python 3).
print("cost: %s, norm: %s" % (test_costs, norm))
for i in range(y_train.shape[1]):
    print("class %s AUC: %s" % (i, roc_auc_score(y_test[:,i], predictions[:,i])))
print("overall AUC: %s" % roc_auc_score(y_test, predictions, average="weighted"))