In [7]:
from __future__ import print_function

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

import pickle

import numpy as np

import pandas as pd

% matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set()

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

import joblib

from libs.container import Container
from libs.display import d
from libs.experiment import KFoldExperiment, WithAnotherExperiment, roc

In [2]:
cpu = joblib.cpu_count()

In [3]:
sample = pd.read_pickle("data/scaled/sample.pkl.bz2")
sample["tile"] = sample["id"].apply(lambda i: "b" + str(i)[1:4])

no_features = ["id", "vs_catalog", "vs_type", "ra_k", "dec_k", "tile", "cls"] 
X_columns = [c for c in sample.columns if c not in no_features]

grouped = sample.groupby("tile")
data = Container({k: grouped.get_group(k).copy() for k in grouped.groups.keys()})
data = Container(data=pd.concat([data.b278, data.b261]))

cls = {name: idx for idx, name in enumerate(data.data.tile.unique())}
data.data["cls"] = data.data.tile.apply(cls.get)

del grouped, sample

cls

{'b261': 1, 'b278': 0}

In [4]:
cls = {v: v for v in cls.values()}

In [5]:
data.data.columns

Index([u'Amplitude', u'AmplitudeH', u'AmplitudeJ', u'AmplitudeJH',
       u'AmplitudeJK', u'Autocor_length', u'Beyond1Std', u'CAR_mean',
       u'CAR_sigma', u'CAR_tau', u'Con', u'Eta_e', u'FluxPercentileRatioMid20',
       u'FluxPercentileRatioMid35', u'FluxPercentileRatioMid50',
       u'FluxPercentileRatioMid65', u'FluxPercentileRatioMid80',
       u'Freq1_harmonics_amplitude_0', u'Freq1_harmonics_amplitude_1',
       u'Freq1_harmonics_amplitude_2', u'Freq1_harmonics_amplitude_3',
       u'Freq1_harmonics_rel_phase_0', u'Freq1_harmonics_rel_phase_1',
       u'Freq1_harmonics_rel_phase_2', u'Freq1_harmonics_rel_phase_3',
       u'LinearTrend', u'MaxSlope', u'Mean', u'Meanvariance', u'MedianAbsDev',
       u'MedianBRP', u'PairSlopeTrend', u'PercentAmplitude',
       u'PercentDifferenceFluxPercentile', u'PeriodLS', u'Period_fit',
       u'Psi_CS', u'Psi_eta', u'Q31', u'Rcs', u'Skew', u'SmallKurtosis',
       u'Std', u'c89_c3', u'c89_hk_color', u'c89_jh_color', u'c89_jk_color',
       u

In [10]:
RF_PARAMS = {
    'max_features': None, 'min_samples_split': 10, 'n_jobs': cpu, 
    'criterion': 'entropy', 'n_estimators': 500}

In [15]:
X = data.data[X_columns].values
y = data.data.cls.values

In [16]:
%%time
clf = RandomForestClassifier(**RF_PARAMS)
sel = RFECV(clf, n_jobs=cpu, cv=10)
sel.fit(X, y)

CPU times: user 57min 19s, sys: 25.3 s, total: 57min 44s
Wall time: 15min


In [17]:
sel.grid_scores_

array([ 0.79771423,  0.82134513,  0.86900888,  0.87252406,  0.86950129,
        0.87203403,  0.87151888,  0.87051893,  0.86853157,  0.87202903,
        0.8735265 ,  0.87151896,  0.8730366 ,  0.8770618 ,  0.87555419,
        0.87253911,  0.87054162,  0.86852903,  0.86801642,  0.87054167,
        0.8700366 ,  0.87053408,  0.86903155,  0.86902647,  0.87003157,
        0.86601891,  0.87351132,  0.86801893,  0.86752147,  0.86599119,
        0.8695265 ,  0.87001891,  0.87003911,  0.86953157,  0.86450381,
        0.86600378,  0.86651142,  0.86501645,  0.86700632,  0.86651393,
        0.86199122,  0.86500132,  0.8645114 ,  0.8675114 ,  0.86651898,
        0.86699373,  0.86651645,  0.86852398,  0.86299629,  0.86401142,
        0.86551142,  0.86750129,  0.86449119,  0.86701391,  0.86801896,
        0.86501645,  0.86500634])

In [20]:
np.asarray(X_columns)[sel.support_]

array(['Beyond1Std', 'Eta_e', 'Freq1_harmonics_amplitude_0', 'LinearTrend',
       'MaxSlope', 'Mean', 'Meanvariance', 'Psi_eta', 'Rcs', 'c89_m2',
       'cnt', 'n09_c3', 'n09_hk_color', 'n09_m2'], 
      dtype='|S31')

In [26]:
df = pd.DataFrame({"Feature": X_columns, "Score": sel.grid_scores_, "Selected": sel.support_, "Ranking": sel.ranking_})

In [33]:
sel.n_features_

14

In [32]:
df.to_pickle("data/ranking.pkl.bz2", compression="bz2")
df.sort_values("Ranking")

Unnamed: 0,Feature,Ranking,Score,Selected
28,Meanvariance,1,0.867521,True
26,MaxSlope,1,0.873511,True
27,Mean,1,0.868019,True
37,Psi_eta,1,0.865016,True
11,Eta_e,1,0.871519,True
17,Freq1_harmonics_amplitude_0,1,0.868529,True
39,Rcs,1,0.866514,True
47,c89_m2,1,0.868524,True
49,cnt,1,0.864011,True
50,n09_c3,1,0.865511,True
