In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import os
from sklearn import datasets, cross_validation, metrics
from classifier import KernelNullSpaceClassifier

In [2]:
PATH = r"/home/yen/Projects/masters/Datasets/Sample/"

In [3]:
ds = datasets.load_files(PATH)
ds.data = np.vstack([np.fromstring(txt, sep='\t') for txt in ds.data])

idx = ds.target.argsort()
ds.data = ds.data[:, idx]
ds.target = ds.target[idx]


In [4]:
# Train novelty classifier
# Randomly select a few classes to represent the "novel" class.
data = ds.data
target = ds.target

classes = np.unique(target)
num_class = len(classes)
num_novel = max(np.floor(num_class*0.2), 1) # 20% of classes will be "novel"
novel = np.random.choice(target, num_novel)

X_train, X_test, y_train, y_test = cross_validation.train_test_split(\
    data, target, test_size=0.4, random_state=0)

# Remove all instances of the novel classes from the training set
mask = np.array([y not in novel for y in y_train])
X_train = X_train[mask]
y_train = y_train[mask]

# Test labels are 1 if novel, otherwise 0.
y_test = np.array([1 if cl in novel else 0 for cl in y_test])

X_train = data
y_train = target

In [5]:
classifier = KernelNullSpaceClassifier(threshold=0.5).fit(X_train, y_train)
y_score = classifier.predict(X_test)
y_score

K: [[ 1273.   398.   645. ...,   748.   577.   423.]
 [  398.  1513.   885. ...,   813.   737.   747.]
 [  645.   885.  5406. ...,  1936.  1279.  1449.]
 ..., 
 [  748.   813.  1936. ...,  4655.  1562.  1305.]
 [  577.   737.  1279. ...,  1562.  3520.   816.]
 [  423.   747.  1449. ...,  1305.   816.  3707.]]
[[ 1120.29615294   124.35746372  -134.36980901 ...,   -28.58968216
   -134.06536926    24.11644892]
 [  124.35746372  1118.4187745    -15.30849823 ...,   -84.52837138
    -95.00405848   227.1777597 ]
 [ -134.36980901   -15.30849823  3999.96422904 ...,   532.74435589
    -58.73133121   423.45048697]
 ..., 
 [  -28.58968216   -84.52837138   532.74435589 ...,  3254.52448274
    227.04879564   282.23061382]
 [ -134.06536926   -95.00405848   -58.73133121 ...,   227.04879564
   2250.57310854  -141.24507328]
 [   24.11644892   227.1777597    423.45048697 ...,   282.23061382
   -141.24507328  3061.9367449 ]]
L: [[ 0.01020408  0.01020408  0.01020408 ...,  0.          0.          0.        

array([ 53.74223838,  28.04524777,  30.4065179 ,  42.54026489,
        38.68657217,  28.97138146,  86.19410689,  27.63252927,
        35.81310795,  34.35922483,  26.24480755,  27.4787461 ,
        24.48102399,  26.88209565,  24.22880149,  55.527062  ,
        70.47366745,  37.78097628,  46.59755188,  32.18761361,
        34.36309063,  52.2178394 ,  28.15926288,  25.27330616,
        45.27948152,  39.34874212,  50.18157361,  43.37241439,
        46.0404224 ,  45.83766307,  36.67264927,  42.63100009,
        26.0833461 ,  48.77559065,  31.95540694,  47.04333787,
        34.37770288,  37.86584186,  32.48687121,  32.90408454,
        30.88193654,  30.81421518,  48.67243217,  33.20261762,
        31.01471219,  51.67209366,  52.13760415,  36.68775099,
        27.17601741,  47.07352213,  70.97674206,  51.82167421,
        35.09395734,  39.01246148,  28.92434191,  31.9026827 ,
        27.71796473,  28.95026504,  25.32735079,  63.73040007,
        26.11680084,  38.25626337,  68.87604037,  39.10

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)

In [None]:
fpr

In [None]:
thresholds

In [None]:
from itertools import product
x = [1,5,7,9]
for pt1, pt2 in product(x, x):
    if pt1 != pt2:
        print(pt1, pt2)

In [None]:
X_train.shape

In [None]:
a = np.array([[1,5,2],[3,2,6],[7,3,4]])
b = np.array([1,5,10])
a[b>5]

In [None]:
from itertools import product
labels = np.array([2,2,2,2,5,2,4])
classes = np.array([2,4,5])
L = np.zeros([7,7])
for cl in classes:
    for idx1, x in enumerate(labels == cl):
        for idx2, y in enumerate(labels == cl):
            if x and y:
                L[idx1, idx2] = 1.0/np.sum(labels==cl)
    

print(L)


In [None]:
labels == cl
L[labels==cl]