# **Scikit Learn Tutorial**

In [1]:
#Import necessary modules
import numpy as np
import pandas as pd
import os
import tables
import sklearn as sk
from sklearn import tree, svm, metrics

In [3]:
#import necessary data
datafolder = '/home/relethford/git/practice/python/sklearn/ve/ml/data/nu_anis/2013/'
sig_test = tables.open_file(datafolder+'test_sig.ds.hdf5')
sig_train = tables.open_file(datafolder+'train_sig.ds.hdf5')
bg_test = tables.open_file(datafolder+'test_data.ds.hdf5')
bg_train = tables.open_file(datafolder+'train_data.ds.hdf5')
#print shape of each set
print('sig test data:     {} cols, {} rows'.format(len(sig_test.root.table.colnames),len(sig_test.root.table.cols.zen)))
print('sig training data: {} cols, {} rows'.format(len(sig_train.root.table.colnames),len(sig_train.root.table.cols.zen)))
print('bg test data:      {} cols, {} rows'.format(len(bg_test.root.table.colnames),len(bg_test.root.table.cols.zen)))
print('bg training data:  {} cols, {} rows'.format(len(bg_train.root.table.colnames),len(bg_train.root.table.cols.zen)))

sig test data:     134 cols, 710948 rows
sig training data: 134 cols, 710948 rows
bg test data:      123 cols, 3884472 rows
bg training data:  123 cols, 3884472 rows


In [4]:
#Let's set up a fcn that grabs a given variable from a given dataset, but only out to N events.
N = 10000
def getValue(colName, datafile):
  value = datafile.get_node('/','table').col(colName)[0:N]
  return np.array(list(value))

In [5]:
#Make an array for each of the for data sets, each having the variables we want to investigate.
def varStack(tuple_vars):
    return np.vstack(tuple_vars).T

In [6]:
#For now, we'll take two variables. Lizz says these two (bayes_rat, cog_z) have greatest separation power.
#Add a target for each point - for bg, 0, for sig, 1.
s_test  = varStack((getValue('bayes_rat', sig_test),getValue('cog_z', sig_test),np.ones(N)))
s_train = varStack((getValue('bayes_rat', sig_train),getValue('cog_z', sig_train),np.ones(N)))
b_test   = varStack((getValue('bayes_rat', bg_test),getValue('cog_z', bg_test),np.zeros(N)))
b_train  = varStack((getValue('bayes_rat', bg_train),getValue('cog_z', bg_train),np.zeros(N)))
#Note - THIS is the part where the code stalls like crazy. Either need to figure out how to only access SOME of the data...
#...or else chop it up ahead of time.

In [7]:
print(s_test)
print(np.shape(s_test))

[[  20.08934919  105.95520231    1.        ]
 [  25.3983473    65.84812408    1.        ]
 [  64.54650211 -327.96772998    1.        ]
 ..., 
 [  72.7680737   172.40375962    1.        ]
 [  18.32085279  213.24193715    1.        ]
 [  74.08932732  -86.56194218    1.        ]]
(10000, 3)


In [8]:
#Combine the test and train arrays.
test = np.concatenate((s_test,b_test))
train = np.concatenate((s_train,b_train))

In [9]:
# Create a classifier: a support vector classifier. This is what is suggested in the tutorial for a simple vector classifier.
classifier = svm.SVC(gamma=0.001, probability=True)
#Based on what I know about machine learning, gamma is a learning speed parameter for minimization. The higher it is, the faster the fit converges.

In [10]:
#sklearn takes over from here to fit these features in order to predict which digit they are. We divide into a teaching and target sample.
classifier.fit(train[:,0:2],train[:,-1])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [11]:
#Now we can use that fit to classify the expected target for the remaining handwriting samples.
expected = test[:,-1]
predicted = classifier.predict(test[:,0:2])

In [12]:
#It doesn't line up perfectly... but let's see how well it predicts over 10,000.
print(expected[0:10])
print(predicted[0:10])

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
[ 0.  1.  1.  1.  1.  1.  1.  0.  1.  1.]


In [13]:
#Finally, let's measure how well this predicter works.
print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

        0.0       0.83      0.91      0.87     10000
        1.0       0.90      0.82      0.85     10000

avg / total       0.86      0.86      0.86     20000


Confusion matrix:
[[9078  922]
 [1847 8153]]


In [14]:
probabilities = classifier.predict_proba(test[:,0:2])

In [15]:
probabilities[0:10]

array([[ 0.72379894,  0.27620106],
       [ 0.07182212,  0.92817788],
       [ 0.01738015,  0.98261985],
       [ 0.32871037,  0.67128963],
       [ 0.21518477,  0.78481523],
       [ 0.1149441 ,  0.8850559 ],
       [ 0.06188867,  0.93811133],
       [ 0.69346043,  0.30653957],
       [ 0.0080759 ,  0.9919241 ],
       [ 0.00671487,  0.99328513]])