In [1]:
from __future__ import print_function

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

import pickle

import numpy as np

import pandas as pd

% matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set()

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler

import joblib

from libs.container import Container
from libs.display import d
from libs.experiment import KFoldExperiment, WithAnotherExperiment, roc

In [2]:
# Number of CPUs reported by joblib; reused below as n_jobs for the grid searches.
cpu = joblib.cpu_count()

In [3]:
# Display the detected CPU count.
cpu

16

In [4]:
# Load the sample table and tag every row with its tile name:
# "b" followed by characters 1-3 of the stringified id.
sample = pd.read_pickle("data/scaled/sample.pkl.bz2")
sample["tile"] = sample["id"].map(lambda source_id: "b" + str(source_id)[1:4])

# Columns that are excluded from the feature matrix.
no_features = ["id", "vs_catalog", "vs_type", "ra_k", "dec_k", "tile", "cls"]
X_columns = [
    column for column in sample.columns
    if column not in no_features
]

# One independent DataFrame copy per tile, wrapped in a Container so each
# tile can be reached by name.
grouped = sample.groupby("tile")
data = Container({tile: frame.copy() for tile, frame in grouped})

# The full table and the groupby handle are no longer needed.
del grouped, sample

In [5]:
# Stack the two tiles being compared into a single frame.
df = pd.concat([data.b278, data.b261])

# Integer label per tile, numbered in order of first appearance in df.
cls = dict((tile, code) for code, tile in enumerate(df["tile"].unique()))
df["cls"] = df["tile"].map(cls.get)

print(cls)

{'b261': 1, 'b278': 0}


In [6]:
# Feature matrix (feature columns only) and label vector as NumPy arrays.
X = df.loc[:, X_columns].values
y = df["cls"].values

In [7]:
%%time
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}
]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, cv=5, n_jobs=cpu,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}

Grid scores on development set:

0.629 (+/-0.058) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
0.525 (+/-0.188) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}
0.686 (+/-0.022) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
0.652 (+/-0.053) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}
0.711 (+/-0.021) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
0.676 (+/-0.022) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.0001}
0.734 (+/-0.022) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}
0.694 (+/-0.034) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.0001}
0.680 (+/-0.036) for {'kernel': 'linear', 'C': 1}
0.692 (+/-0.045) for {'kernel': 'linear', 'C': 10}
0.696 (+/-0.045) for {'kernel': 'linear', 'C': 100}
0.693 (+/-0.054) for {'kernel': 'linear', 'C': 1000}

Detailed classification report:

The model is trained on the full development set.
The scores are computed

In [8]:
# Scratch check: the integers 5..14 followed by sqrt(number of feature columns).
# NOTE(review): presumably candidate feature counts for a later search — the
# result is only displayed here, not stored; confirm whether it is still needed.
list(range(5, 15)) + [np.sqrt(len(X_columns))]

[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 7.5498344352707498]

In [9]:
%%time
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [
    {'max_features': ['auto', 'sqrt', "log2", None, 0.2, 0.5], 
     "min_samples_split": [2, 5, 10],
     "n_estimators": [500], 
     "criterion": ["entropy"], 
     "n_jobs": [10]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5, n_jobs=cpu,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'max_features': 0.5, 'min_samples_split': 5, 'n_jobs': 10, 'criterion': 'entropy', 'n_estimators': 500}

Grid scores on development set:

0.805 (+/-0.030) for {'max_features': 'auto', 'min_samples_split': 2, 'n_jobs': 10, 'criterion': 'entropy', 'n_estimators': 500}
0.810 (+/-0.044) for {'max_features': 'auto', 'min_samples_split': 5, 'n_jobs': 10, 'criterion': 'entropy', 'n_estimators': 500}
0.806 (+/-0.025) for {'max_features': 'auto', 'min_samples_split': 10, 'n_jobs': 10, 'criterion': 'entropy', 'n_estimators': 500}
0.815 (+/-0.032) for {'max_features': 'sqrt', 'min_samples_split': 2, 'n_jobs': 10, 'criterion': 'entropy', 'n_estimators': 500}
0.804 (+/-0.038) for {'max_features': 'sqrt', 'min_samples_split': 5, 'n_jobs': 10, 'criterion': 'entropy', 'n_estimators': 500}
0.810 (+/-0.040) for {'max_features': 'sqrt', 'min_samples_split': 10, 'n_jobs': 10, 'criterion': 'entropy', 'n_estimators': 50

In [10]:
# Random Forest: hard-coded parameter sets, one per tuning metric.
# NOTE(review): presumably copied by hand from the grid-search output above;
# they are not read from clf.best_params_ — verify they are still current.
RF_prec = dict(
    max_features=None,
    min_samples_split=10,
    n_jobs=10,
    criterion='entropy',
    n_estimators=500,
)
RF_recall = dict(
    max_features=None,
    min_samples_split=10,
    n_jobs=10,
    criterion='entropy',
    n_estimators=500,
)

# Do both metrics point at the same configuration?
RF_prec == RF_recall

True

In [11]:
# SVM: hard-coded parameter sets, one per tuning metric.
# NOTE(review): presumably copied by hand from the grid-search output above —
# verify they are still current.
SVM_prec = dict(kernel='rbf', C=1000, gamma=0.001)
SVM_recall = dict(kernel='rbf', C=1000, gamma=0.001)

# Do both metrics point at the same configuration?
SVM_recall == SVM_prec

True