In [1]:
import numpy
import urllib
import scipy.optimize
import random
from math import exp
from math import log
from sklearn.decomposition import PCA

random.seed(0)

print "Reading data..."
dataFile = open("winequality-white.csv")
header = dataFile.readline()
fields = ["constant"] + header.strip().replace('"','').split(';')
featureNames = fields[:-1]
labelName = fields[-1]
lines = [[1.0] + [float(x) for x in l.split(';')] for l in dataFile]

X = [l[:-1] for l in lines]
y = [l[-1] > 5 for l in lines]

def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
  return 1.0 / (1 + exp(-x))

def f(theta, X, y, lam):
  loglikelihood = 0
  for i in range(len(X)):
    logit = inner(X[i], theta)
    loglikelihood -= log(1 + exp(-logit))
    if not y[i]:
      loglikelihood -= logit
  for k in range(len(theta)):
    loglikelihood -= lam * theta[k]*theta[k]
  # for debugging
  # print "ll =", loglikelihood
  return -loglikelihood

def fprime(theta, X, y, lam):
  dl = [0]*len(theta)
  for i in range(len(X)):
    logit = inner(X[i], theta)
    for k in range(len(theta)):
      dl[k] += X[i][k] * (1 - sigmoid(logit))
      if not y[i]:
        dl[k] -= X[i][k]
  for k in range(len(theta)):
    dl[k] -= lam*2*theta[k]
  return numpy.array([-x for x in dl])

X_train = X[:int(len(X)/3)]
y_train = y[:int(len(y)/3)]
X_validate = X[int(len(X)/3):int(2*len(X)/3)]
y_validate = y[int(len(y)/3):int(2*len(y)/3)]
X_test = X[int(2*len(X)/3):]
y_test = y[int(2*len(X)/3):]

def train(lam):
  theta,_,_ = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, pgtol = 10, args = (X_train, y_train, lam))
  return theta

# End of setup code

# 1
def performance(theta):
  scores_train = [inner(theta,x) for x in X_train]
  scores_validate = [inner(theta,x) for x in X_validate]
  scores_test = [inner(theta,x) for x in X_test]

  predictions_train = [s > 0 for s in scores_train]
  predictions_validate = [s > 0 for s in scores_validate]
  predictions_test = [s > 0 for s in scores_test]

  correct_train = [(a==b) for (a,b) in zip(predictions_train,y_train)]
  correct_validate = [(a==b) for (a,b) in zip(predictions_validate,y_validate)]
  correct_test = [(a==b) for (a,b) in zip(predictions_test,y_test)]
  
  acc_train = sum(correct_train) * 1.0 / len(correct_train)
  acc_validate = sum(correct_validate) * 1.0 / len(correct_validate)
  acc_test = sum(correct_test) * 1.0 / len(correct_test)
  return acc_train, acc_validate, acc_test

for lam in [0, 0.01, 1.0, 100.0]:
  theta = train(lam)
  acc_train, acc_validate, acc_test = performance(theta)
  print("lambda = " + str(lam) + ";\ttrain=" + str(acc_train) + "; validate=" + str(acc_validate) + "; test=" + str(acc_test))

# 2
shuffled = lines[:]
random.shuffle(shuffled)

X = [l[:-1] for l in shuffled]
y = [l[-1] > 5 for l in shuffled]

X_train = X[:int(len(X)/3)]
y_train = y[:int(len(y)/3)]
X_validate = X[int(len(X)/3):int(2*len(X)/3)]
y_validate = y[int(len(y)/3):int(2*len(y)/3)]
X_test = X[int(2*len(X)/3):]
y_test = y[int(2*len(X)/3):]

for lam in [0, 0.01, 1.0, 100.0]:
  theta = train(lam)
  acc_train, acc_validate, acc_test = performance(theta)
  print("lambda = " + str(lam) + ";\ttrain=" + str(acc_train) + "; validate=" + str(acc_validate) + "; test=" + str(acc_test))

X = [l[:-1] for l in lines]
y = [l[-1] > 5 for l in lines]

X_train = X[:int(len(X)/3)]
y_train = y[:int(len(y)/3)]
X_validate = X[int(len(X)/3):int(2*len(X)/3)]
y_validate = y[int(len(y)/3):int(2*len(y)/3)]
X_test = X[int(2*len(X)/3):]
y_test = y[int(2*len(X)/3):]

Reading data...
lambda = 0;	train=0.732843137255; validate=0.720759338641; test=0.77709736681
lambda = 0.01;	train=0.732230392157; validate=0.721984078383; test=0.780159216167
lambda = 1.0;	train=0.726715686275; validate=0.704225352113; test=0.766074709124
lambda = 100.0;	train=0.658700980392; validate=0.630128597673; test=0.696876913656
lambda = 0;	train=0.75; validate=0.757501530925; test=0.738518064911
lambda = 0.01;	train=0.748774509804; validate=0.758113900796; test=0.739742804654
lambda = 1.0;	train=0.731617647059; validate=0.756276791182; test=0.732394366197
lambda = 100.0;	train=0.66237745098; validate=0.681567666871; test=0.680342927128


In [2]:
# 3
lam = 0.01
theta = train(lam)

def ber(theta):
    scores_test = [inner(theta,x) for x in X_test]
    predictions_test = [s > 0 for s in scores_test]
    
    tp, tn, fp, fn = 0.0, 0.0, 0.0, 0.0
    for prediction, label in zip(predictions_test, y_test):
        if prediction and label:
            tp += 1
        elif not prediction and not label:
            tn += 1
        elif prediction and not label:
            fp += 1
        else:
            fn += 1
    fpr, fnr = fp / (tp + fn), fn / (tn + fp)
#     TP = float(sum([(a and b) for (a,b) in zip(predictions_test, y_test)]))
#     TN = float(sum([(not a and not b) for (a,b) in zip(predictions_test, y_test)]))
#     FP = float(sum([(a and not b) for (a,b) in zip(predictions_test, y_test)]))
#     FN = float(sum([(not a and b) for (a,b) in zip(predictions_test, y_test)]))

    return tp, tn, fp, fn, (fpr + fnr) / 2
#     fpr, fnr = FP / (TP + FN), FN / (TN + FP)
#     return TP, TN, FP, FN, (fpr + fnr) / 2

true_pos, true_neg, false_pos, false_neg, balanced_error_rate = ber(theta)
print "True positives", true_pos
print "True negatives", true_neg
print "False positives", false_pos
print "False negatives", false_neg
print "Balanced error rate", balanced_error_rate

# 4
def rank(theta, n):
    scores_test = [inner(theta,x) for x in X_test]
    ranking = sorted(zip(scores_test, y_test), reverse=True)
    relevant_retrieved = len([y for _, y in ranking[:n] if y])
    relevant = len([y for _, y in ranking if y])
    return float(relevant_retrieved) / n, float(relevant_retrieved) / relevant

for predictions in [10, 500, 1000]:
    precision, recall = rank(theta, predictions)
    print "Query size =", predictions, "\tPrecision =", precision, "\tRecall =", recall

# 5
unbiased = numpy.array([row[1:] for row in X_train])
mean = numpy.mean(unbiased, axis=0)
compression = numpy.tile(mean, (len(X_train), 1))
diff = compression - unbiased
print sum([numpy.dot(d, d) for d in diff])

# 6
pca = PCA()
pca.fit(unbiased)
print pca.components_

# 7
print numpy.dot(unbiased[0], pca.components_.T)

# 8
# Explicitly compress and uncompress the data
pca = PCA(n_components=4)
pca.fit(unbiased)
compressed = numpy.dot(unbiased, pca.components_.T)
uncompressed = numpy.dot(compressed, pca.components_)
diff = unbiased - uncompressed
print sum([numpy.dot(d, d) for d in diff])

# Using the definition of reconstruction error
mean = numpy.tile(numpy.mean(unbiased, axis=0), (len(X_train), 1))
diff = unbiased - mean
pca = PCA()
pca.fit(unbiased)
diff_transform = numpy.dot(unbiased, pca.components_[4:].T)
print sum([numpy.dot(d, d) for d in diff_transform])

True positives 1129.0
True negatives 145.0
False positives 321.0
False negatives 38.0
Balanced error rate 0.178304665865
Query size = 10 	Precision = 1.0 	Recall = 0.00856898029135
Query size = 500 	Precision = 0.956 	Recall = 0.409597257926
Query size = 1000 	Precision = 0.864 	Recall = 0.740359897172
3675818.61688
[[ -3.23636346e-04   1.42201752e-04   3.17030713e-04   5.36390435e-02
    9.30284526e-05   2.54030965e-01   9.65655009e-01   3.19990241e-05
   -2.95831396e-04   3.84043646e-04  -1.00526693e-02]
 [ -7.57985623e-03  -1.66366340e-03   1.04742899e-03   5.21677266e-02
    4.49425600e-05   9.65020304e-01  -2.56793964e-01   7.90089050e-06
    5.24900596e-04  -1.09699394e-03  -2.89827657e-03]
 [  1.82124420e-02   2.54680710e-03   3.31838657e-03   9.93221259e-01
   -1.51888372e-04  -6.42297821e-02  -3.91682592e-02   4.30929482e-04
   -6.93199060e-03  -2.85216045e-03  -8.62920933e-02]
 [  1.56811999e-01   3.28220652e-03   1.66866136e-02   8.28549640e-02
   -6.91822288e-03   1.1302968

In [3]:
mean

array([[  7.02365196,   0.28001532,   0.36526348, ...,   3.20603554,
          0.4860723 ,  10.29007353],
       [  7.02365196,   0.28001532,   0.36526348, ...,   3.20603554,
          0.4860723 ,  10.29007353],
       [  7.02365196,   0.28001532,   0.36526348, ...,   3.20603554,
          0.4860723 ,  10.29007353],
       ..., 
       [  7.02365196,   0.28001532,   0.36526348, ...,   3.20603554,
          0.4860723 ,  10.29007353],
       [  7.02365196,   0.28001532,   0.36526348, ...,   3.20603554,
          0.4860723 ,  10.29007353],
       [  7.02365196,   0.28001532,   0.36526348, ...,   3.20603554,
          0.4860723 ,  10.29007353]])