# Manifold Learning - How to improve Classification

A toy example of how to use manifold learning / dimensionality reduction for feature engineering: the learned embedding is appended to the original features before classification.

In [1]:
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
%matplotlib inline
from sklearn import manifold, datasets
from sklearn import random_projection
from sklearn.decomposition import PCA

from sklearn import neighbors, linear_model

Automatically created module for IPython interactive environment


In [41]:
# Load the two-class subset of the digits data set.
digits = datasets.load_digits(n_class=2)
X_digits, y_digits = digits.data, digits.target
n_samples, n_features = X_digits.shape

# Ordered (unshuffled) hold-out: the first 90 % of the rows are used for
# training, the remaining rows for testing.
n_train = int(.9 * n_samples)
X_train, X_test = X_digits[:n_train], X_digits[n_train:]
y_train, y_test = y_digits[:n_train], y_digits[n_train:]

# Baseline: both classifiers on the original 64 features.
knn = neighbors.KNeighborsClassifier()
logistic = linear_model.LogisticRegression()

knn_score = knn.fit(X_train, y_train).score(X_test, y_test)
print('KNN score: %f' % knn_score)
logistic_score = logistic.fit(X_train, y_train).score(X_test, y_test)
print('LogisticRegression score: %f' % logistic_score)

KNN score: 1.000000
LogisticRegression score: 1.000000


# Try out different manifold-learned features

In [39]:
# Width of the extra feature block for the alternative transformers listed
# below. The active t-SNE transformer does not read it; it sets its own
# n_components.
n_additional_features = 22

# Alternatives that were tried -- swap one in to compare:
#   random_projection.SparseRandomProjection(n_additional_features, random_state=42)
#   random_projection.GaussianRandomProjection(n_additional_features, random_state=0)
#   manifold.MDS(n_additional_features, max_iter=300, n_init=3)
#   PCA(n_additional_features)
transformer = manifold.TSNE(random_state=42, init='pca', n_components=3)

In [40]:
# Embed the training samples and append the embedding as extra columns.
New_features_train = transformer.fit_transform(X_train)
X_train2 = np.hstack((X_train, New_features_train))

# t-SNE offers fit_transform() but no transform() for unseen samples, so the
# test data is mapped with a hand-made linear approximation: solve
# X_train @ W ~= New_features_train for W in the least-squares sense.
# lstsq returns (solution, residuals, rank, singular_values).
Handmade_fit_transform = np.linalg.lstsq(X_train, New_features_train, rcond=None)
Transformation_matrix = Handmade_fit_transform[0]

# Apply the learned linear map to the test samples and append the result.
New_features_test = X_test @ Transformation_matrix
X_test2 = np.hstack((X_test, New_features_test))

knn_score2 = knn.fit(X_train2, y_train).score(X_test2, y_test)
print('KNN score: %f' % knn_score2)
logistic_score2 = logistic.fit(X_train2, y_train).score(X_test2, y_test)
print('LogisticRegression score: %f' % logistic_score2)

KNN score: 0.944444
LogisticRegression score: 0.916667


In [37]:
# Refit both classifiers on the original (un-augmented) features and report
# their test accuracy for comparison with the augmented run.
baseline_knn = knn.fit(X_train, y_train)
print('KNN score: %f' % baseline_knn.score(X_test, y_test))
baseline_logistic = logistic.fit(X_train, y_train)
print('LogisticRegression score: %f' % baseline_logistic.score(X_test, y_test))

KNN score: 0.963889
LogisticRegression score: 0.897222


## Now let's try not to cheat - getting rid of fit_transform

In [None]:
# Estimate a linear map from the raw training features to their embedding,
# using training data only.
a = X_train
# The original right-hand side, `Y_digits_train`, is never defined in this
# notebook and raised NameError on a fresh kernel. The training-only
# embedding computed earlier is `New_features_train`, so use that.
b = New_features_train
# Least-squares solution of a @ x ~= b via the Moore-Penrose pseudo-inverse
# (comparable to np.linalg.lstsq(a, b, rcond=None)[0]).
x = np.matmul(np.linalg.pinv(a), b)
x.shape

In [None]:
# Display the dimensions of `b`, the right-hand side assigned in the preceding cell.
b.shape

In [None]:
# Peek at the first element of `x` along axis 0 (for a 2-D `x` this is its first row).
x[0]

In [None]:
# Six-class digits subset; loaded here but not used by the code below
# (the variants that read digits2.data / digits2.target are disabled).
digits2 = datasets.load_digits(n_class=6)

# Embed ALL samples at once and append the embedding as extra columns.
# Note that the transformer is fitted on training and test rows together.
Y_digits = transformer.fit_transform(X_digits)
X_digits2 = np.hstack((X_digits, Y_digits))
# Disabled variants: use only the embedding (X_digits2 = Y_digits) or only
# two of the appended columns (X_digits2 = X_digits2[:, 64:66]).

# Same ordered 90 % / 10 % split as for the raw features.
split_at = int(.9 * n_samples)
X_train2, X_test2 = X_digits2[:split_at], X_digits2[split_at:]
y_train2, y_test2 = y_digits[:split_at], y_digits[split_at:]

knn_fitted = knn.fit(X_train2, y_train2)
print('KNN score: %f' % knn_fitted.score(X_test2, y_test2))
logistic_fitted = logistic.fit(X_train2, y_train2)
print('LogisticRegression score: %f' % logistic_fitted.score(X_test2, y_test2))

In [None]:
# Small worked example: least-squares solution of the underdetermined
# 2x3 system a @ x = b.
a = np.array([[3, 1, 2], [2, 1, 2]])
b = np.array([9, 8])
# rcond=None matches the other lstsq calls in this notebook and avoids the
# FutureWarning that older NumPy releases emit when rcond is left unspecified.
# lstsq returns a tuple: (solution, residuals, rank, singular_values).
x = np.linalg.lstsq(a, b, rcond=None)
x

In [4]:
# Fit the configured transformer on the training samples and keep the
# resulting embedding; the bare name on the last line displays it.
Add_features = transformer.fit_transform(X_train)
Add_features

array([[  7.535404 , -30.34967  ,  -8.010539 ],
       [  3.6932597,  21.434452 ,  15.182379 ],
       [ -0.8953111,  13.490789 ,   8.821477 ],
       ...,
       [-14.937363 ,   5.912628 ,  10.598797 ],
       [  4.0065293,  27.213686 ,  13.419299 ],
       [  1.5898446,   6.8484554, -23.656971 ]], dtype=float32)

In [6]:
# Inspect the dimensions of X_test2.
# NOTE(review): every visible assignment of X_test2 appends columns to the
# original features, so a 3-column result suggests stale kernel state --
# confirm after Restart & Run All.
X_test2.shape

(64, 3)

In [8]:
# Shape of the Moore-Penrose pseudo-inverse of X_train; for an
# (n_samples, n_features) input the result is (n_features, n_samples).
np.linalg.pinv(X_train).shape

(64, 974)

In [11]:
# Least-squares fit of a linear map from X_train to Add_features.
# lstsq returns a tuple: (solution, residuals, rank, singular_values).
Handmade_fit_transform = np.linalg.lstsq(X_train, Add_features,rcond=None)

In [15]:
# Element 0 of the lstsq result is the solution matrix.
# NOTE(review): this name differs only in case from `Transformation_matrix`
# used earlier in the notebook; they are two separate variables.
Transformation_Matrix = Handmade_fit_transform[0]

In [None]:
# Apply the fitted linear map to the test samples to approximate their embedding.
Add_features_test = np.matmul(X_test,Transformation_Matrix)

In [23]:
# Display the training-set embedding stored in New_features_train.
# NOTE(review): the saved output shows 2 columns although the active
# transformer is configured with n_components=3 -- presumably from an earlier
# run; confirm after Restart & Run All.
New_features_train

array([[ 11.244668 , -38.748028 ],
       [  8.261975 ,  32.655094 ],
       [ -4.286929 ,  29.210676 ],
       ...,
       [-22.301434 ,   1.9374135],
       [  5.9737096,  38.21313  ],
       [  2.414325 ,   1.5860955]], dtype=float32)