In [1]:
%pylab inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

Populating the interactive namespace from numpy and matplotlib


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.calibration import CalibratedClassifierCV

# === Parameter tuning - Using Hyperopt ===
from hyperopt import hp, fmin, tpe
from time import time

from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss

In [3]:
from sklearn import svm

In [4]:
# Read the labelled training table and the unlabelled test table from the
# notebook's working directory.
train = pd.read_csv(filepath_or_buffer="train.csv")
test = pd.read_csv(filepath_or_buffer="test.csv")

In [5]:
# Labels come from the 'target' column; the feature frame is everything that
# is left once 'id' (presumably a row identifier) and 'target' are dropped.
y_train = train['target']
X_train = train.drop(labels=['id', 'target'], axis=1)

In [6]:
# Stratified 5-fold splitter over the training labels. Folds are cut in data
# order (shuffle=False), so the split is already deterministic; no
# random_state is passed because a seed is only consulted when shuffling.
kf = StratifiedKFold(y_train.values, n_folds=5, shuffle=False)

**Multiclass Classification** <br/>
From the scikit-learn documentation: http://scikit-learn.org/stable/modules/svm.html

    SVC and NuSVC implement the “one-against-one” approach (Knerr et al., 1990) for multi-class classification. If n_class is the number of classes, then n_class * (n_class - 1) / 2 classifiers are constructed and each one trains data from two classes.
    
    On the other hand, LinearSVC implements “one-vs-the-rest” multi-class strategy, thus training n_class models. If there are only two classes, only one model is trained.
    
SVC accepts a `class_weight` parameter; NuSVC does not.

# SVC

Tips:

    Support Vector Machine algorithms are not scale invariant, so it is highly recommended to scale your data. For example, scale each attribute on the input vector X to [0,1] or [-1,+1], or standardize it to have mean 0 and variance 1. Note that the same scaling must be applied to the test vector to obtain meaningful results. See section Preprocessing data for more details on scaling and normalization.
    
    Kernel cache size: For SVC, SVR, nuSVC and NuSVR, the size of the kernel cache has a strong impact on run times for larger problems. If you have enough RAM available, it is recommended to set cache_size to a higher value than the default of 200(MB), such as 500(MB) or 1000(MB).
    

In [9]:
def run_svc( X_train, y_train, folds=5 ):
    """Cross-validate an RBF-kernel SVC and return its mean log loss.

    The rows are split with a non-shuffled ``StratifiedKFold``. On every fold
    the classifier is fit on the training part, and ``log_loss`` is computed
    from ``predict_proba`` on both the training part and the held-out part.
    Per-fold errors and the elapsed wall-clock time are printed as the run
    progresses.

    Parameters
    ----------
    X_train : array-like
        Feature matrix, one sample per row. It is passed through
        ``np.asarray`` so a DataFrame works as well as an ndarray.
    y_train : array-like
        Class labels, one per row of ``X_train``.
    folds : int, optional
        Number of stratified folds (default 5).

    Returns
    -------
    tuple of (float, float)
        ``(train_error, test_error)``: the log loss averaged over the folds
        on the training parts and on the held-out parts respectively.

    Raises
    ------
    ValueError
        If ``X_train`` and ``y_train`` do not have the same number of rows.
    """
    # Fold indices are positional, so work on plain arrays; pandas objects
    # would interpret X[train_index] as a column/label lookup instead.
    X = np.asarray(X_train)
    y = np.asarray(y_train)
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            "X_train and y_train must have the same number of rows "
            "(got {} and {})".format(X.shape[0], y.shape[0]))

    kf = StratifiedKFold(y, n_folds=folds, shuffle=False)

    train_cv_error = np.empty( folds )
    test_cv_error  = np.empty( folds )

    params_svc = {'C': 1.0, 'kernel': 'rbf',
                  'gamma': 0.0,             # NOTE(review): presumably selects the library default gamma in this scikit-learn release - confirm
                  'probability': True,      # whether to enable proba estimate - increase runtime
                  'shrinking': True,
                  'class_weight': 'auto',   # inversely proportional to class frequencies
                  'random_state': 26,
                  'cache_size': 500,        # kernel cache in MB, raised from the default of 200
                  'verbose': True
                 }

    # A single estimator object is created once and re-fit on every fold.
    svc_model = svm.SVC(**params_svc)

    for index, (train_index, test_index) in enumerate(kf):
        print("fold  " + str(index))
        X_train_cv, X_test_cv = X[train_index], X[test_index]
        y_train_cv, y_test_cv = y[train_index], y[test_index]

        s = time()
        svc_fit = svc_model.fit(X_train_cv, y_train_cv)
        train_cv_error[index] = log_loss( y_train_cv, svc_fit.predict_proba(X_train_cv) )
        test_cv_error[index]  = log_loss( y_test_cv,  svc_fit.predict_proba(X_test_cv) )
        print("train error :  " + str(train_cv_error[index]))
        print("test error  :  " + str(test_cv_error[index]))
        print("elapsed: {}s \n".format( int( round( time() - s ))))

    train_error = np.mean(train_cv_error)
    test_error  = np.mean(test_cv_error)
    return train_error, test_error

In [None]:
# Run the cross-validation on the raw ndarrays behind the pandas objects;
# the returned (train_error, test_error) pair is shown as the cell output.
run_svc(X_train=X_train.values, y_train=y_train.values)

fold  0
[LibSVM]

In [8]:
# Development aid: IPython's `?` suffix shows the help for svm.SVC.
svm.SVC?