In [1]:
"""
This notebook tests a basic data science pipeline intended for incorporation into
the SmartSearch framework.

The iris dataset from sklearn is used for this pipeline.

Pipeline steps:
    1. import data
    2. PCA
    3. Random Forest classification
    
Hyperparameters to be optimized:

    Parameter             Description                        Range
    pca_dim               dimension of PCA                   [0,4]
    number_estimators     number of trees in random forest   [5,30]

"""
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Make the IPython notebook take up the whole screen width.
# `IPython.display` is the public import location for these helpers; importing
# `display` from `IPython.core.display` is deprecated in newer IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Show up to 100 items at each edge of an array when numpy summarises it
np.set_printoptions(edgeitems=100)

In [3]:
# Variables for model building
size_test_set = 0.33     # passed to train_test_split as test_size
pca_dim = 3              # passed to PCA as n_components
number_estimators = 20   # passed to RandomForestClassifier as n_estimators

# Seed numpy's global random generator so that "Restart & Run All" gives the
# same split and the same forest every time: scikit-learn objects created with
# random_state=None draw from this global generator.
random_seed = 0
np.random.seed(random_seed)

In [4]:
# Import data
def load_data(test_size=None, random_state=None):
    """Load the iris dataset and split it into training and test sets.

    Parameters
    ----------
    test_size : float or int, optional
        Forwarded to ``train_test_split``. When omitted, the notebook-level
        ``size_test_set`` variable is used, so a zero-argument call behaves
        as it always did.
    random_state : int or None, optional
        Forwarded to ``train_test_split``; pass an int for a repeatable split.

    Returns
    -------
    The value returned by ``train_test_split``, which callers unpack as
    ``X_train, X_test, y_train, y_test``.
    """
    if test_size is None:
        # Fall back to the notebook-level configuration value.
        test_size = size_test_set
    iris = load_iris()
    return train_test_split(
        iris['data'],
        iris['target'],
        test_size=test_size,
        random_state=random_state,
    )

X_train, X_test, y_train, y_test = load_data()

# Every feature matrix must have exactly one label per row.
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

# A single pre-formatted string prints the same text under both the Python 2
# print statement and the Python 3 print function.
print('X_train %s' % (X_train.shape,))
print('y_train %s' % (y_train.shape,))
print('X_test %s' % (X_test.shape,))
print('y_test %s' % (y_test.shape,))

X_train (100, 4)
y_train (100,)
X_test (50, 4)
y_test (50,)


In [5]:
# PCA

def do_pca(X,num_components,fit,fitted_model=None):
    """Project X with PCA, either fitting a new model or reusing a fitted one.

    Parameters
    ----------
    X
        Data handed to the PCA model's ``fit``/``transform``.
    num_components
        Passed as ``n_components`` when a new PCA is created. Ignored when
        ``fit`` is false, because the supplied model is used as-is.
    fit : bool
        True to fit a new PCA on ``X``; false to only transform ``X`` with
        ``fitted_model``.
    fitted_model : optional
        An already fitted model exposing ``transform``. Required when ``fit``
        is false.

    Returns
    -------
    ``(pca, transformed_X)`` when ``fit`` is true; only ``transformed_X``
    when ``fit`` is false. Note the two different return shapes.

    Raises
    ------
    ValueError
        If ``fit`` is false and no ``fitted_model`` was given.
    """
    if fit:
        pca = PCA(n_components=num_components)
        pca.fit(X)
        return pca, pca.transform(X)

    if fitted_model is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("fitted_model is required when fit is False")
    return fitted_model.transform(X)
    
# Fit PCA on the training data only and keep the fitted model for later reuse.
pca_model, pca_X_train = do_pca(X_train, num_components=pca_dim, fit=True)
# A parenthesised single argument prints identically on Python 2 and Python 3.
print(pca_X_train.shape)

(100, 3)


In [6]:
def random_forest(X,y,num_estimators):
    """Fit a random forest classifier and report its training-set score.

    Parameters
    ----------
    X
        Training features passed to ``clf.fit`` and ``clf.score``.
    y
        Training labels passed to ``clf.fit`` and ``clf.score``.
    num_estimators
        Passed to ``RandomForestClassifier`` as ``n_estimators``.

    Returns
    -------
    The fitted ``RandomForestClassifier`` instance.
    """
    clf = RandomForestClassifier(n_estimators=num_estimators)
    clf.fit(X, y)
    # One pre-formatted string keeps the printed text identical under the
    # Python 2 print statement and the Python 3 print function.
    print('accuracy on training set %s' % (clf.score(X, y),))
    return clf

# Train the classifier on the PCA-projected training data.
model = random_forest(
    pca_X_train,
    y_train,
    num_estimators=number_estimators,
)

accuracy on training set 1.0


In [7]:
### Test model ###

# Project the held-out data with the PCA model fitted on the training set
# (fit=False means no refit), then print the classifier's score on it.
pca_X_test = do_pca(X_test, num_components=pca_dim, fit=False, fitted_model=pca_model)
# A parenthesised single argument prints identically on Python 2 and Python 3.
print(model.score(pca_X_test, y_test))

0.9
