In [1]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

In [2]:
def _generate_vector(shift=0.5, noise=15, size=1000, rng=None):
    """
    Build a noisy ramp: ``0, 1, ..., size - 1`` plus uniform jitter.

    Each point gets ``(u - shift) * noise`` added to it, where ``u`` is a
    uniform draw from ``[0, 1)``. With the default ``shift=0.5`` the jitter
    is centred on zero.

    Parameters
    ----------
    shift : float
        Value subtracted from every uniform draw before scaling.
    noise : float
        Scale applied to the shifted draws; ``0`` yields the exact ramp.
    size : int
        Number of points to generate.
    rng : object exposing ``rand(n)``, optional
        Source of randomness (e.g. ``np.random.RandomState``). When omitted,
        the global ``np.random`` state is used, as before.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of length ``size``.
    """
    if rng is None:
        # The module itself exposes ``rand``; this keeps np.random.seed() working.
        rng = np.random
    return np.arange(size) + (rng.rand(size) - shift) * noise


def generate_dataset(n_samples=1000, offset=100, random_state=None):
    """
    This dataset is two lines with a slope ~ 1, where one has
    a y offset of ~``offset`` (100 by default).

    Parameters
    ----------
    n_samples : int
        Number of points generated for *each* line.
    offset : float
        Amount added to the y-coordinates of the first line.
    random_state : int, numpy.random.RandomState or None
        Seed (or ready-made generator) for reproducible data. ``None`` draws
        from the global ``np.random`` state, as before.

    Returns
    -------
    X : numpy.ndarray, shape ``(2 * n_samples, 2)``
        Points of the offset line followed by points of the plain line;
        column 0 is x, column 1 is y.
    y : numpy.ndarray, shape ``(2 * n_samples,)``
        Line labels: ``0.0`` for the offset line, ``1.0`` for the plain line.
    """
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    def noisy_ramp():
        return _generate_vector(size=n_samples, rng=rng)

    # Draw order (line 1 x, line 1 y, line 2 x, line 2 y) is kept fixed so a
    # given seed always produces the same dataset.
    offset_line = np.vstack((noisy_ramp(), noisy_ramp() + offset)).T
    plain_line = np.vstack((noisy_ramp(), noisy_ramp())).T

    points = np.vstack((offset_line, plain_line))
    labels = np.hstack((np.zeros(n_samples), np.ones(n_samples)))
    return points, labels

In [3]:
def all_but_first_column(X):
    """Return every column of the 2-D array ``X`` except the one at index 0."""
    trailing_columns = slice(1, None)
    return X[:, trailing_columns]


def drop_first_component(X, y, random_state=None):
    """
    Fit a PCA + column-selector pipeline on a training split and return
    the held-out split with its first principal component removed.

    The data is divided with ``train_test_split``; the pipeline is fitted
    on the training part only and then applied to the test part. Only the
    transformed *test* rows are returned, not the whole dataset.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Labels aligned with the rows of ``X``.
    random_state : int, RandomState instance or None
        Forwarded to ``train_test_split`` so the split can be reproduced.
        ``None`` (the default) keeps the previous unseeded behaviour.

    Returns
    -------
    X_test_transformed : numpy.ndarray
        PCA projection of the test rows with column 0 dropped.
    y_test : array-like
        Labels of the test rows.
    """
    pipeline = make_pipeline(
        PCA(), FunctionTransformer(all_but_first_column),
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state,
    )
    pipeline.fit(X_train, y_train)
    return pipeline.transform(X_test), y_test

In [4]:
# Build the synthetic two-line dataset: X holds the points, y the per-row line labels.
X, y = generate_dataset()

In [5]:
# Display the point matrix (column 0 is x, column 1 is y).
X

array([[ -2.31502209e+00,   1.05389812e+02],
       [  8.14856719e+00,   1.07815402e+02],
       [  5.48151822e-01,   1.00787574e+02],
       ..., 
       [  9.97885602e+02,   9.97701037e+02],
       [  9.93752711e+02,   9.94688936e+02],
       [  1.00410433e+03,   1.00393475e+03]])

In [18]:
# Second column of X only, i.e. the y-coordinates.
X[:, 1]

array([  105.38981243,   107.8154022 ,   100.78757364, ...,   997.701037  ,
         994.68893607,  1003.93474888])

In [6]:
# NOTE: generate_dataset() is called again here, so the pipeline runs on a
# freshly drawn dataset rather than on the X, y created in the earlier cell.
X_transformed, y_transformed = drop_first_component(*generate_dataset())

In [27]:
# Fit a two-component PCA directly on the full X (no train/test split and
# no column dropped, unlike drop_first_component).
pca = PCA(n_components=2)
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [28]:
# Project X onto both fitted principal components.
pca.transform(X)

array([[-668.46571629,  -45.62896753],
       [-659.39435099,  -39.87740098],
       [-669.73477269,  -40.35944549],
       ..., 
       [ 669.13579384,   40.64275944],
       [ 664.08957494,   39.8125673 ],
       [ 677.94083042,   40.69785909]])