## scikit-learn Overview
Outline:
* Datasets
* Splitting data into test/train/validation sets
* Learning and predicting
* Parameter tuning
* Model persistence

### Loading built-in datasets

In [1]:
# Provides toy datasets
from sklearn import datasets

# Load the iris dataset for classification
iris = datasets.load_iris()
# Load the digits dataset for classification
digits = datasets.load_digits()
# Load the Boston housing prices for regression. load_boston was deprecated in
# scikit-learn 1.0 and removed in 1.2, where accessing it raises ImportError;
# guard the call so the rest of this cell still runs on current releases.
try:
    boston = datasets.load_boston()
except (AttributeError, ImportError):
    boston = None
    print("load_boston is not available in this scikit-learn version; boston = None")
# Load the diabetes dataset for regression
diabetes = datasets.load_diabetes()

### Understanding the dataset
Type the name of a dataset object followed by a dot and press Tab (e.g. `iris.<Tab>`) to see its members.

In [2]:
# Show the feature names, the container type of the data, and the first ten
# rows of features and labels, one item per output line.
print(
    "iris feature names: {}".format(iris.feature_names),
    "data type: {}".format(type(iris.data)),
    iris.data[:10],
    iris.target[:10],
    sep="\n",
)

iris feature names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
data type: <class 'numpy.ndarray'>
[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]
 [ 5.4  3.9  1.7  0.4]
 [ 4.6  3.4  1.4  0.3]
 [ 5.   3.4  1.5  0.2]
 [ 4.4  2.9  1.4  0.2]
 [ 4.9  3.1  1.5  0.1]]
[0 0 0 0 0 0 0 0 0 0]


### train_test_split
Splitting data into validation, testing and training samples

In [3]:
from sklearn.model_selection import train_test_split

# Use only the first two iris features as inputs; the labels are the targets.
X, y = iris.data[:, :2], iris.target

# Hold out 20% of the samples as the test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4
)

# An integer test_size is an absolute sample count: 100 of the remaining
# TRAINING samples are moved into the validation set.
# NOTE(review): the outputs below suggest this leaves only ~20 samples for
# training (150 -> 120 -> 20) -- confirm that this split is intended.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=100, random_state=4
)

### Generating a synthetic classification dataset

In [4]:
from sklearn.datasets import make_classification

# Generate a synthetic classification problem with 100 samples and 20 features.
# random_state pins the random generator so re-running the notebook reproduces
# the same data (same seed value as the train/test splits in this notebook).
# Note: this rebinds X and y; the iris-derived splits (X_train, X_valid, X_test
# and their labels) are separate names and are not affected.
X, y = make_classification(n_samples=100, n_features=20, random_state=4)
print(X.shape)
# Last expression of the cell: display the first five generated rows.
X[:5]

(100, 20)


array([[ 2.23726495, -1.42694126, -1.13635742, -0.11537417,  0.473066  ,
        -1.48315945, -0.7857967 ,  0.22322429, -0.36297705,  0.32656592,
         0.50851831, -0.52467883,  0.79906067,  0.16989665, -0.12470124,
         0.94293164, -0.69146619,  0.22574145, -0.38134688, -0.15187249],
       [-1.17733603,  2.26521396,  2.16427811,  0.6082616 , -1.15744382,
        -0.7847868 ,  0.74406337, -0.22209533, -0.02985833,  0.50763087,
         0.2617405 ,  2.4095089 , -0.2831095 , -0.12451756,  0.17816058,
        -1.30873356,  1.35069157,  0.42397519,  0.49397065,  0.13951352],
       [ 1.86981387,  1.45532944,  0.83273762, -1.81056441,  1.22342838,
        -1.04625454,  1.360467  ,  0.56656878, -2.41948547, -0.02484754,
        -0.67810678, -0.86777774,  1.40228871,  0.09922184,  0.25624359,
        -1.13200721,  0.11634595,  1.2644228 ,  0.1573534 ,  0.24685201],
       [ 1.32072937,  2.52199323,  0.47945114, -0.23801398,  0.83387694,
        -0.43956229, -0.31523718,  1.09958603, -

### Train an SVM classifier

In [5]:
# The svm module provides the support-vector classifier used below.
from sklearn import svm

# Build the classifier with the hyper-parameter C fixed at 10.
clf = svm.SVC(C=10)
# Fit on the training split; as the last expression of the cell, the fitted
# estimator's repr is displayed as the cell output.
clf.fit(X=X_train, y=y_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [6]:
import numpy as np

# Accuracy on the validation split: the mean of the boolean
# "prediction equals label" mask is the fraction of correct predictions.
predictions = clf.predict(X_valid)
print('validation accuracy = {}'.format(np.mean(predictions == y_valid)))

validation accuracy = 0.72


In [7]:
# Same accuracy measure as for the validation split, now on the held-out test split.
predictions = clf.predict(X_test)
print('test accuracy = {}'.format(np.mean(predictions == y_test)))

test accuracy = 0.6666666666666666


### Model Parameter tuning

In [8]:
# Choosing C by brute force: fit one SVC per candidate value (1.0, 2.0, ...,
# 19.0) and remember the value that scores best on the validation split.
best_acc = best_C = 0.0
step_size = 1.0
C = 1.0
while C < 20.0:
    # fit() returns the estimator itself, so clf is the fitted classifier.
    clf = svm.SVC(C=C).fit(X_train, y_train)
    # Fraction of validation samples whose prediction matches the label.
    accuracy = np.mean(clf.predict(X_valid) == y_valid)
    print('Accuracy at C = ', C, ' is ', accuracy, sep='')
    # Strict comparison: on a tie the earliest (smallest) C is kept.
    if accuracy > best_acc:
        best_acc, best_C = accuracy, C
    C += step_size
print('Best C = ', best_C, '. It has an accuracy of ', best_acc, sep='')

# With C chosen, merge the training and validation splits so the final model
# is trained on all data outside the test set, then score it on the test split.
X_train_valid = np.concatenate([X_train, X_valid], axis=0)
y_train_valid = np.concatenate([y_train, y_valid], axis=0)
clf = svm.SVC(C=best_C).fit(X_train_valid, y_train_valid)
predictions = clf.predict(X_test)
print('final test accuracy = {}'.format(np.mean(predictions == y_test)))

Accuracy at C = 1.0 is 0.79
Accuracy at C = 2.0 is 0.73
Accuracy at C = 3.0 is 0.72
Accuracy at C = 4.0 is 0.7
Accuracy at C = 5.0 is 0.69
Accuracy at C = 6.0 is 0.72
Accuracy at C = 7.0 is 0.72
Accuracy at C = 8.0 is 0.72
Accuracy at C = 9.0 is 0.73
Accuracy at C = 10.0 is 0.72
Accuracy at C = 11.0 is 0.7
Accuracy at C = 12.0 is 0.7
Accuracy at C = 13.0 is 0.69
Accuracy at C = 14.0 is 0.69
Accuracy at C = 15.0 is 0.65
Accuracy at C = 16.0 is 0.64
Accuracy at C = 17.0 is 0.64
Accuracy at C = 18.0 is 0.64
Accuracy at C = 19.0 is 0.64
Best C = 1.0. It has an accuracy of 0.79
final test accuracy = 0.8333333333333334


### GridSearchCV: automatically search for the best parameters

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Candidate values for C: the integers 1 through 19.
parameters = {'C': list(range(1, 20))}
estimator = svm.SVC()
# Cross-validated search over the grid, run in parallel (n_jobs=-1).
clf = GridSearchCV(estimator, parameters, verbose=True, n_jobs=-1)
clf.fit(X_train_valid, y_train_valid)
y_pred = clf.predict(X_test)
# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, but
# keeping the documented order avoids carrying a swapped-argument habit over
# to metrics where the order matters.
accuracy = accuracy_score(y_test, y_pred)
print('Best C:{}, final test accuracy = {}'.format(clf.best_params_, accuracy))

Fitting 3 folds for each of 19 candidates, totalling 57 fits
Best C:{'C': 1}, final test accuracy = 0.8333333333333334


[Parallel(n_jobs=-1)]: Done  57 out of  57 | elapsed:    1.5s finished


### Model persistence

In [10]:
# A fitted scikit-learn model can be saved with Python's built-in persistence
# module, pickle. First, train a model worth saving.
from sklearn import datasets, svm

# Fit a default SVC on the full iris dataset (all features, all samples).
iris = datasets.load_iris()
X = iris.data
y = iris.target
clf = svm.SVC()
# Last expression of the cell: the fitted estimator's repr is displayed.
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [11]:
import pickle

# Round-trip the fitted classifier through an in-memory pickle byte string.
s = pickle.dumps(clf)
clf2 = pickle.loads(s)
# Use the restored copy to classify the first sample (sliced to keep it 2-D).
pred = clf2.predict(X[:1])
print(pred)

[0]


In the specific case of scikit-learn, it may be preferable to use joblib's replacement for pickle (`joblib.dump` and `joblib.load`), which is more efficient on big data.

In [12]:
# joblib is a standalone package; the sklearn.externals.joblib alias was
# deprecated in scikit-learn 0.21 and removed in 0.23. Prefer the standalone
# package and fall back to the bundled copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save the fitted classifier to disk (.pkl means a pickle file). As the last
# expression of the cell, the value returned by dump is displayed.
joblib.dump(clf, 'filename.pkl')

['filename.pkl']

In [13]:
# Restore the classifier that was saved to 'filename.pkl' and rebind clf to it.
# NOTE(review): joblib files are pickle-based, so only load files from a
# trusted source.
clf = joblib.load('filename.pkl') 

Other types of models, such as regressors and clustering mechanisms, will be discussed later. This module was only meant to give a brief overview of the capabilities of scikit-learn.