# Dealing with dimensions in classification

## Running example

<table>
    <tr>
        <th></th>
        <th>DocID</th>
        <th>Tokens</th>
        <th>E-devices</th>
    </tr>
    <tr>
        <td>Training</td>
        <td>0</td>
        <td>apple ios mac book fruit</td>
        <td>TRUE</td>
    </tr>
    <tr>
        <td>Training</td>
        <td>1</td>
        <td>apple mac book apple store fruit</td>
        <td>TRUE</td>
    </tr>
    <tr>
        <td>Training</td>
        <td>2</td>
        <td>microsoft ibm apple oracle</td>
        <td>TRUE</td>
    </tr>
    <tr>
        <td>Training</td>
        <td>3</td>
        <td>apple banana mango fruit</td>
        <td>FALSE</td>
    </tr>
    <tr>
        <td>Training</td>
        <td>4</td>
        <td>apple fruit</td>
        <td>FALSE</td>
    </tr>
    <tr style='border-top: double 2px #000000'>
        <td style='background-color: #eeee00'>Test</td>
        <td style='background-color: #eeee00'>5</td>
        <td style='background-color: #eeee00'>apple mac book fruit</td>
        <td style='background-color: #eeee00'>?</td>
    </tr>
    <tr>
        <td style='background-color: #eeee00'>Test</td>
        <td style='background-color: #eeee00'>6</td>
        <td style='background-color: #eeee00'>mac fruit color</td>
        <td style='background-color: #eeee00'>?</td>
    </tr>
</table>


In [None]:
from collections import Counter
import numpy as np
from IPython.core.display import display, HTML


def to_table(tokens, M, rnd=3):
    """Show a matrix as an HTML table in the notebook output.

    Args:
        tokens: Column headings. Each one is concatenated into a ``<th>``
            cell, so they have to be strings.
        M: Iterable of rows; every row is an iterable of numbers.
        rnd: Number of decimals handed to ``round`` for every cell.
    """
    header = "".join('<th>' + name + '</th>' for name in tokens)
    body = "".join(
        "<tr>" + "".join('<td>' + str(round(value, rnd)) + '</td>' for value in doc) + "</tr>"
        for doc in M
    )
    display(HTML("<table><tr>" + header + "</tr>" + body + '</table>'))

def model(tokens, docs):
    """Build a row-normalised, weighted document-term matrix.

    For document ``i`` and vocabulary token ``k`` the cell holds
    ``tf / (ln(N / df) + 1)`` where ``tf`` is the number of times ``k`` occurs
    in the document, ``N`` is ``len(docs)`` and ``df`` is the number of
    documents in ``docs`` that contain ``k``. Note that the count is divided
    by the logarithmic factor. Each row is then scaled to unit Euclidean
    length.

    Args:
        tokens: Ordered vocabulary; fixes the column order of the result.
        docs: List of documents, each a list of tokens. Document frequencies
            are computed from this list only.

    Returns:
        ``numpy`` array of shape ``(len(docs), len(tokens))``. A document that
        contains none of ``tokens`` yields an all-zero row (instead of NaN).
    """
    n_docs = len(docs)
    term_counts = [Counter(doc) for doc in docs]
    # Document frequency: every document contributes at most 1 per token.
    doc_freq = Counter()
    for counts in term_counts:
        doc_freq.update(counts.keys())

    M = np.zeros((n_docs, len(tokens)))
    for i, counts in enumerate(term_counts):
        for j, token in enumerate(tokens):
            if token in counts:
                M[i, j] = float(counts[token]) / (np.log(float(n_docs) / doc_freq[token]) + 1)
        norm = np.linalg.norm(M[i])
        # Dividing a zero vector by its zero norm would fill the row with NaN.
        if norm > 0:
            M[i] /= norm
    return M


In [None]:
# Tokenised training documents in DocID order: the first three are the
# E-devices examples, the last two are the non-E-devices examples.
training = [
    ['apple', 'ios', 'mac', 'book', 'fruit'],
    ['apple', 'mac', 'book', 'apple', 'store', 'fruit'],
    ['microsoft', 'ibm', 'apple', 'oracle'],
    ['apple', 'banana', 'mango', 'fruit'],
    ['apple', 'fruit'],
]
# Unlabelled documents to classify.
testing = [
    ['apple', 'mac', 'book', 'fruit'],
    ['mac', 'fruit', 'color'],
]
# Shared vocabulary: every distinct token of both corpora, alphabetically.
tokens = sorted({t for d in training + testing for t in d})

# Both matrices use the same columns; each call weights its own corpus.
T = model(tokens, training)
X = model(tokens, testing)

In [None]:
# Weighted, row-normalised matrix of the training documents (one row per document).
to_table(tokens, T)

In [None]:
# Same representation for the test documents.
to_table(tokens, X)

# Rocchio

In [None]:
from scipy.spatial import distance

def rocchio(c, nc, doc, df=None):
    """Nearest-centroid decision between a class and its complement.

    Args:
        c: 2-D array, one row per document that belongs to the class.
        nc: 2-D array, one row per document outside the class.
        doc: Vector to classify.
        df: Distance function ``df(u, v)``; Euclidean distance when ``None``.

    Returns:
        ``(in_class, margin)``. ``margin`` is the distance to the class
        centroid minus the distance to the other centroid, and ``in_class``
        is ``True`` only when that margin is strictly negative.
    """
    measure = distance.euclidean if df is None else df
    class_centroid = np.mean(c, axis=0)
    other_centroid = np.mean(nc, axis=0)
    s = measure(doc, class_centroid)
    b = measure(doc, other_centroid)
    return bool(s < b), s - b

In [None]:
# Classify every test document in the full token space. Rows 0-2 of T are the
# E-devices training documents, the remaining rows are the other class.
for i, doc in enumerate(testing):
    R = rocchio(T[:3, :], T[3:, :], X[i], df=distance.euclidean)
    # Single-argument print call: same output on Python 2 and Python 3.
    print("%s %s" % (" ".join(doc), R))

In [None]:
# For each side of the split show its training rows, then its centroid
# (the column-wise mean) as a one-row table.
for heading, members in (('<h2>class</h2>', T[:3, :]), ('<h2>no class</h2>', T[3:, :])):
    display(HTML(heading))
    to_table(tokens, members)
    display(HTML('<h3>centroid</h3>'))
    to_table(tokens, np.mean(members, axis=0).reshape(1, -1))

## Understand terminology

In [None]:
import matplotlib.pyplot as plt


def visualize(ax, D, labels, d1=0, d2=1, marker='o', d1l='d1', d2l='d2'):
    """Scatter two columns of a matrix on an axis, one labelled point per row.

    Args:
        ax: Object offering ``scatter``, ``set_xlabel`` and ``set_ylabel``
            (a matplotlib axis in this notebook).
        D: 2-D array; each row becomes one point.
        labels: Legend label for each row of ``D``, indexed by row number.
        d1: Column of ``D`` plotted on the x axis.
        d2: Column of ``D`` plotted on the y axis.
        marker: Marker style forwarded to ``ax.scatter``.
        d1l: Text for the x axis label.
        d2l: Text for the y axis label.
    """
    # The axis labels do not depend on the row, so set them once, and also
    # when D has no rows at all.
    ax.set_xlabel(d1l)
    ax.set_ylabel(d2l)
    for d in range(D.shape[0]):
        # One scatter call per row so each point gets its own legend entry.
        ax.scatter(D[d, d1], D[d, d2], label=labels[d], marker=marker)

# Centroids of the two training groups (rows 0-2 and rows 3+ of T).
cmean, ncmean = np.mean(T[:3, :], axis=0), np.mean(T[3:, :], axis=0)
means = np.vstack((cmean, ncmean))
cols = 6
# Every unordered pair of distinct tokens gets its own subplot.
c_tokens = [(kk, kj) for ki, kk in enumerate(tokens) for kj in tokens[ki + 1:]]
k = len(c_tokens)
# squeeze=False keeps `axes` two-dimensional whatever the grid size is.
fig, axes = plt.subplots(nrows=int(np.ceil(float(k) / cols)), ncols=cols,
                         figsize=(14, 20), squeeze=False)
for cl, (t1, t2) in enumerate(c_tokens):
    # Floor division: `/` yields a float on Python 3, which cannot index.
    ax = axes[cl // cols, cl % cols]
    d1, d2 = tokens.index(t1), tokens.index(t2)
    # Training documents (default marker), centroids ('x'), test documents ('^').
    visualize(ax, T, [str(x) for x in range(0, T.shape[0])],
              d1=d1, d2=d2, d1l=t1, d2l=t2)
    visualize(ax, means, ['c', 'nc'],
              d1=d1, d2=d2, marker='x', d1l=t1, d2l=t2)
    visualize(ax, X, [str(x) for x in range(0, X.shape[0])],
              d1=d1, d2=d2, marker='^', d1l=t1, d2l=t2)
plt.tight_layout()
# The legend is drawn on the last subplot only.
ax.legend()
plt.show()


### Measuring utility

In [None]:
# Keep only the tokens whose weight varies enough across the training
# documents; `th` is the minimum variance required to keep a token.
filtered, th = [], 0.01
for t, column in zip(tokens, T.T):
    v = np.var(column)
    # Single-argument print call: same output on Python 2 and Python 3.
    print("%s %s" % (t, v))
    if v >= th:
        filtered.append(t)
print(filtered)

In [None]:
# Rebuild both matrices using only the tokens that passed the variance filter.
Tf = model(filtered, training)
Xf = model(filtered, testing)

In [None]:
# Classify the test documents again, now in the variance-filtered space.
for i, doc in enumerate(testing):
    R = rocchio(Tf[:3, :], Tf[3:, :], Xf[i], df=distance.euclidean)
    # Single-argument print call: same output on Python 2 and Python 3.
    print("%s %s" % (" ".join(doc), R))

### Mutual utility

In [None]:
# Token-by-token covariance over the training documents. np.cov expects one
# variable per row, hence the transpose; ddof=0 divides by the number of
# documents rather than by that number minus one.
C = np.cov(T.T, ddof=0)

In [None]:
# Print the covariance of one chosen token with every token of the vocabulary.
test = 'mac'
test_row = C[tokens.index(test)]  # looked up once, not once per iteration
for t, cov in zip(tokens, test_row):
    # Single-argument print call: same output on Python 2 and Python 3.
    print("%s %s %s" % (test, t, cov))

In [None]:
# For every token collect the tokens at the same or a later vocabulary
# position whose covariance with it exceeds `th`. The token itself is
# included when its own variance, the diagonal of C, exceeds `th`.
clusters, th = {}, 0.01
for i, t in enumerate(tokens):
    # enumerate(..., i) yields the column index directly; the vocabulary has
    # no duplicates, so this equals tokens.index(k) without the linear search.
    clusters[t] = [k for j, k in enumerate(tokens[i:], i) if C[i, j] > th]
# Walk the vocabulary instead of clusters.values(): the candidate order, and
# with it the order of equally sized sets below, is then the same on every
# Python version.
sets = [set(clusters[t]) for t in tokens if clusters[t]]
# Largest sets first; the sort is stable, so ties keep vocabulary order.
sets = sorted(sets, key=len, reverse=True)
for s in sets:
    print(s)

In [None]:
# Merge overlapping candidate sets into disjoint groups of related tokens.
refined = []
for s in sets:
    merged = set(s)  # work on a copy so the entries of `sets` stay untouched
    kept, slot = [], None
    for group in refined:
        if s.intersection(group):
            # `s` may bridge several existing groups: fold all of them into
            # one. Stopping at the first hit would leave overlapping groups.
            merged |= group
            if slot is None:
                # The merged group takes the place of the first group hit.
                slot = len(kept)
                kept.append(merged)
        else:
            kept.append(group)
    if slot is None:
        kept.append(merged)
    refined = kept
for s in refined:
    print(s)

### Use new terminology

In [None]:
# Map every token to the index (as a string) of the first refined set that
# contains it. Tokens that occur in no set get no entry and are dropped from
# the rewritten documents.
set_label = {}
for i, s in enumerate(refined):
    for token in s:
        set_label.setdefault(token, str(i))

# Rewrite both corpora in the new terminology, keeping token order.
training_r = [[set_label[token] for token in doc if token in set_label] for doc in training]
testing_r = [[set_label[token] for token in doc if token in set_label] for doc in testing]


In [None]:
# Vocabulary of the rewritten corpora: the distinct set labels, sorted as strings.
tokens_r = sorted({t for d in training_r + testing_r for t in d})

# Document matrices in the reduced space.
Tr = model(tokens_r, training_r)
Xr = model(tokens_r, testing_r)

In [None]:
# Training documents expressed in the merged-token vocabulary.
to_table(tokens_r, Tr)

In [None]:
# Test documents expressed in the merged-token vocabulary.
to_table(tokens_r, Xr)

In [None]:
# Classify the rewritten test documents in the merged-token space.
for i, doc in enumerate(testing_r):
    R = rocchio(Tr[:3, :], Tr[3:, :], Xr[i], df=distance.euclidean)
    # Single-argument print call: same output on Python 2 and Python 3.
    print("%s %s" % (" ".join(doc), R))

In [None]:
# Centroids of both training groups. The "nc" centroid is the mean of all
# rows from index 3 on, the same split rocchio() is called with; plotting a
# single document there would not show the centroid used for classification.
means = np.vstack((np.mean(Tr[:3, :], axis=0), np.mean(Tr[3:, :], axis=0)))
cols = 3
# Every unordered pair of distinct merged tokens gets its own subplot.
cr_tokens = [(kk, kj) for ki, kk in enumerate(tokens_r) for kj in tokens_r[ki + 1:]]
k = len(cr_tokens)
# At least one row so the call succeeds when there is no pair to plot;
# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(nrows=max(1, int(np.ceil(float(k) / cols))), ncols=cols,
                         figsize=(14, 8), squeeze=False)
for cl, (t1, t2) in enumerate(cr_tokens):
    # Floor division: `/` yields a float on Python 3, which cannot index.
    ax = axes[cl // cols, cl % cols]
    d1, d2 = tokens_r.index(t1), tokens_r.index(t2)
    # Training documents (default marker), centroids ('x'), test documents ('^').
    visualize(ax, Tr, [str(x) for x in range(0, Tr.shape[0])],
              d1=d1, d2=d2, d1l=t1, d2l=t2)
    visualize(ax, means, ['c', 'nc'],
              d1=d1, d2=d2, marker='x', d1l=t1, d2l=t2)
    visualize(ax, Xr, [str(x) for x in range(0, Xr.shape[0])],
              d1=d1, d2=d2, marker='^', d1l=t1, d2l=t2)
plt.tight_layout()
if k:
    # The legend is drawn on the last subplot that received data.
    ax.legend()
plt.show()



## Better dimensionality reduction

In [None]:
# C is a covariance matrix and therefore symmetric. eigh is the solver for
# symmetric matrices: it returns real eigenvalues (in ascending order) and
# real unit eigenvectors, whereas the general solver may return complex
# values for a rank-deficient matrix such as this one. Column v[:, i] is the
# eigenvector that belongs to eigenvalue w[i].
w, v = np.linalg.eigh(C)

`v` has shape `(..., M, M)`. From the NumPy documentation:

> The normalized (unit “length”) eigenvectors, such that the column `v[:,i]` is the eigenvector corresponding to the eigenvalue `w[i]`.

In [None]:
# Show one eigenvalue together with its eigenvector (a column of v).
# Single-argument print calls: same output on Python 2 and Python 3.
print(w[4])
print(v[:, 4])

### Sort the eigenvalues

In [None]:
# Number of leading eigenvectors to keep.
dimensions = 4
# (eigenvalue, column index) pairs, largest eigenvalue first; the sort is
# stable, so equal eigenvalues keep their index order.
sw = sorted(((value, index) for index, value in enumerate(w)),
            key=lambda pair: pair[0], reverse=True)
# Allocated here but not read by any of the cells shown below.
Ct = np.zeros((C.shape[0], dimensions))
# Projection matrix: one kept eigenvector per row.
features = [v[:, pair[1]] for pair in sw[:dimensions]]
Fm = np.array(features)

In [None]:
# Project both document matrices onto the kept eigenvectors; each result has
# one row per document and one column per kept eigenvector.
Tt = Fm.dot(T.T).T
Xt = Fm.dot(X.T).T
# Centroids of the two training groups in the projected space.
cmean = Tt[:3, :].mean(axis=0)
ncmean = Tt[3:, :].mean(axis=0)
means = np.vstack((cmean, ncmean))

In [None]:
# Inspect the projected training matrix.
# Single-argument print calls: same output on Python 2 and Python 3.
print(Tt.shape)
print(Tt)

In [None]:
# Classify the test documents in the projected space.
for i, doc in enumerate(testing):
    R = rocchio(Tt[:3, :], Tt[3:, :], Xt[i], df=distance.euclidean)
    # Single-argument print call: same output on Python 2 and Python 3.
    print("%s %s" % (" ".join(doc), R))

In [None]:
# For every projected test document print its distance to the class centroid
# followed by its distance to the non-class centroid.
for test in Xt:
    # Single-argument print call: same output on Python 2 and Python 3.
    print("%s %s" % (distance.euclidean(test, cmean), distance.euclidean(test, ncmean)))

In [None]:
# One plot over the first two projected dimensions (the defaults of
# visualize): training documents as 'o', test documents as '^', centroids as 'x'.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
for points, marker in ((Tt, 'o'), (Xt, '^'), (means, 'x')):
    # Each point is labelled with its row number within its own matrix.
    visualize(axes, points, [str(n) for n in range(0, points.shape[0])], marker=marker)
plt.tight_layout()
axes.legend()
plt.show()