# Hyper selection For b278 VVV Tile

- **author:** JB Cabral (<jbc.develop@gmail.com>)

In [7]:
% matplotlib inline

from __future__ import print_function

import numpy as np

import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

from sklearn import feature_selection as fs
from sklearn import preprocessing as prp

from skcriteria import Data, MAX, MIN
from skcriteria.madm import topsis, electre
from skcriteria.weights import critic

from libs import fourier_help

# Turn off pandas' chained-assignment warning for the whole notebook.
pd.set_option("mode.chained_assignment", None)

# Lookup table that collapses the `cls` labels into two groups:
# -1 stays -1, while 1, 2 and 3 are all folded into 1.
# NOTE(review): presumably -1 marks the "no class" sources and 1-3 are
# sub-types of the class of interest -- confirm against the data source.
TWO_LABELS = dict([(-1, -1), (1, 1), (2, 1), (3, 1)])

## Load Data

In [8]:
# Read the "b278" table from the features store.
df = pd.read_hdf("data/features.h5", "b278")

# Absolute disagreement between the two period columns.
df["period_diff"] = np.abs(df.PeriodLS - df.gatspy_period)

# Collapse `cls` through TWO_LABELS with a vectorised lookup instead of a
# row-wise `DataFrame.apply`, which builds one Series per row just to read
# a single column.
scls = df["cls"].map(TWO_LABELS)
unmapped = scls.isnull()
if unmapped.any():
    # Keep failing loudly on labels that are missing from TWO_LABELS,
    # exactly as a direct dictionary lookup would.
    raise KeyError(
        "cls values missing from TWO_LABELS: {}".format(
            list(df.loc[unmapped, "cls"].unique())))

df["cls"] = df["cls"].astype("category")
df["scls"] = scls.astype("category")

# Number of columns, not counting one of them.
# NOTE(review): the "- 1" presumably excludes the `vvv_id` column -- confirm.
print(len(df.columns) - 1)

73


## 1. Removing Duplicated and Dirty Data (NaN and null)

In [9]:
# removing tff and gatspy columns: every key returned by
# `fourier_help.fourier_data(<first row>, "kovacs")` plus `gatspy_period`.
# `list(...)` is needed because on Python 3 `dict.keys()` is a view that
# cannot be concatenated with a list (on Python 2 it is a harmless copy).
to_remove = (
    list(fourier_help.fourier_data(df.iloc[0], "kovacs").keys()) +
    ["gatspy_period"])
df = df[df.columns[~df.columns.isin(to_remove)]]

# drop every column that contains at least one nan/null value
df = df.loc[:, ~df.isnull().any()]

# # sampling
# df = pd.concat([
#     df[df.scls == -1].sample(frac=.010),
#     df[df.scls != -1].sample(frac=.010)])

# Feature columns are everything except the identifier and the two labels.
X_columns = df.columns[~df.columns.isin(["vvv_id", "cls", "scls"])]

# Standardised feature matrix and the two-class target.
X = prp.StandardScaler().fit_transform(df[X_columns].values)
y = df["scls"].values

## 2. Removing all low-variance features

In [10]:
# Fit a default VarianceThreshold selector on the scaled features and keep
# only the column names it marks as supported.
# NOTE(review): only `X_columns` is narrowed here; `X` itself is left as is.
vt = fs.VarianceThreshold()
vt.fit(X, y)
X_columns = X_columns[vt.get_support(indices=False)]

In [16]:
# Keep the identifier first, then the surviving features, then both labels.
df = df.loc[:, ["vvv_id"] + X_columns.tolist() + ["cls", "scls"]]

In [56]:
# Split the rows by the two-class label: `nv` holds the rows labelled -1,
# `rr` holds every other row.
nv = df.loc[df["scls"] == -1]
rr = df.loc[df["scls"] != -1]

In [57]:
# Per-feature lower and upper bounds observed in the `rr` rows.
mins = rr[X_columns].min()
maxs = rr[X_columns].max()

In [58]:
# Boolean mask over `nv`: True for the rows whose every feature lies inside
# the closed [min, max] interval that the same feature spans in `rr`.
# Starting from an all-True mask (instead of None) keeps `nv[filters]` valid
# even when there are no feature columns to check.
filters = pd.Series(True, index=nv.index)

# `lower`/`upper` are used as loop names so the builtins `min` and `max`
# are not overwritten for the rest of the kernel session.
for cname, lower, upper in zip(X_columns, mins, maxs):
    filters &= nv[cname].between(lower, upper)

In [62]:
# Show the `nv` rows that passed every per-feature range check.
nv.loc[filters]

Unnamed: 0,vvv_id,Amplitude,Rcs,StetsonK,Meanvariance,Autocor_length,Con,Beyond1Std,SmallKurtosis,Std,...,Freq2_harmonics_rel_phase_1,Freq2_harmonics_rel_phase_2,Freq2_harmonics_rel_phase_3,Freq3_harmonics_rel_phase_1,Freq3_harmonics_rel_phase_2,Freq3_harmonics_rel_phase_3,count,period_diff,cls,scls
10,3656402,0.26550,0.348303,0.734192,0.008171,6.0,0.022727,0.239130,2.135342,0.125551,...,0.959184,0.801451,0.248136,-0.241216,-1.067485,-0.463896,46,2.114413e+00,-1,-1
11,3656404,0.24800,0.397548,0.783290,0.008614,12.0,0.000000,0.317460,0.020876,0.133627,...,-2.609178,-1.062283,-1.479529,0.720565,-0.077514,-0.878927,63,5.288353e-01,-1,-1
13,3656427,0.17775,0.403116,0.825855,0.006214,13.0,0.000000,0.396825,-0.163111,0.097418,...,-1.655912,-0.359582,-2.212617,1.862656,-0.256094,1.563426,63,1.024931e+00,-1,-1
18,3656475,0.20150,0.420996,0.848008,0.007541,10.0,0.000000,0.250000,-0.159921,0.120008,...,1.392530,1.218585,-0.000202,1.907587,1.645620,1.075092,44,4.106422e+00,-1,-1
19,3656477,0.33750,0.338173,0.698529,0.010516,7.0,0.018868,0.181818,5.223791,0.156868,...,0.264251,1.126857,0.480651,0.370305,1.678409,1.709728,55,7.043526e+01,-1,-1
32,3656528,0.21350,0.396219,0.811949,0.007151,9.0,0.023256,0.177778,0.487759,0.113110,...,-0.134968,0.024546,0.414922,-0.633967,0.943834,0.355338,45,4.300468e-01,-1,-1
33,3656531,0.17000,0.391825,0.806756,0.005820,11.0,0.000000,0.322034,1.525749,0.088584,...,0.268843,-0.766528,-0.993725,-0.021919,-0.134413,-0.860389,59,3.844184e-06,-1,-1
41,3656559,0.31100,0.390943,0.802603,0.010198,14.0,0.025974,0.189873,2.167208,0.153018,...,1.975235,-0.043698,1.094518,-0.578818,-0.481647,-0.620524,79,1.931590e-01,-1,-1
45,3656571,0.20400,0.360725,0.788655,0.007639,16.0,0.000000,0.254902,8.290803,0.118885,...,-0.726316,-0.159660,-1.430986,0.182688,-0.240992,0.125104,102,1.954473e+01,-1,-1
48,3656578,0.31525,0.335364,0.710258,0.009987,11.0,0.049020,0.105769,4.751998,0.146931,...,0.858995,-0.381545,1.977166,0.099822,-0.716938,-2.431533,104,2.682607e-01,-1,-1
