In [2]:
% matplotlib inline

import time
import pandas as pd

from IPython import display as d
# from IPython import 

import matplotlib.pyplot as plt
import seaborn as sns
# sns.set()

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn import feature_selection as fs
from sklearn import metrics
from sklearn.utils.multiclass import unique_labels
from sklearn.model_selection import (
    KFold, StratifiedKFold, train_test_split)
from sklearn import preprocessing as prp
import sklearn

from joblib import Parallel, cpu_count, delayed

from libs import container
from libs.experiment import (
    WithAnotherExperiment, KFoldExperiment, roc, 
    discretize_classes, clean_features)

In [9]:
# Timestamp taken when this cell starts running.
start = time.time()

# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
path = "/home/data/carpyncho/stored/samples/smp_2500_rrVSunk/"
data = container.read(path)

# Tile b220 is loaded by container.read() but excluded from everything below.
print("Removing b220...")
del data["b220"]

# Add two derived amplitude-difference columns to every remaining frame.
# NOTE(review): AmplitudeJK subtracts the plain `Amplitude` column — presumably
# the K-band amplitude; confirm against the feature definitions.
for df in data.values():
    df["AmplitudeJH"] = df["AmplitudeJ"] - df["AmplitudeH"]
    df["AmplitudeJK"] = df["AmplitudeJ"] - df["Amplitude"]

Loading '/home/data/carpyncho/stored/samples/smp_2500_rrVSunk/b261.npy'...
Loading '/home/data/carpyncho/stored/samples/smp_2500_rrVSunk/b263.npy'...
Loading '/home/data/carpyncho/stored/samples/smp_2500_rrVSunk/b220.npy'...
Loading '/home/data/carpyncho/stored/samples/smp_2500_rrVSunk/b264.npy'...
Loading '/home/data/carpyncho/stored/samples/smp_2500_rrVSunk/b262.npy'...
Loading '/home/data/carpyncho/stored/samples/smp_2500_rrVSunk/b278.npy'...
Removing b220...


In [10]:
# `classes` and `sclasses` are the label -> integer-code mappings displayed
# below (full and simplified class names respectively).
# NOTE(review): `data` is rebound to the helper's own output, so re-running
# this cell feeds already-processed data back in — confirm that is safe.
data, classes, sclasses = discretize_classes(data)
d.display(d.Markdown("**Classes**"))
d.display(classes)

d.display(d.Markdown("----"))
d.display(d.Markdown("**Simplified Classes**"))
d.display(sclasses)

# Candidate feature columns as returned by the project helper, called with
# tile "b278" (presumably the reference tile — confirm in libs.experiment).
X_columns = clean_features(data, "b278")
X_columns = X_columns.drop("AndersonDarling")

# Discard every feature derived from the second and third frequencies.
is_secondary_freq = (
    X_columns.str.startswith("Freq2_") | X_columns.str.startswith("Freq3_"))
X_columns = X_columns[~is_secondary_freq]

# Format the message explicitly: under Python 2 without print_function,
# `print("a", b)` prints the tuple ('a', b) instead of the intended text.
print("Total features: {}".format(X_columns.size))

**Classes**

{'': 0,
 'Cep-1': 1,
 'Cep-F': 2,
 'LPV-Mira': 3,
 'LPV-OSARG': 4,
 'LPV-SRV': 5,
 'RRLyr-RRab': 6,
 'RRLyr-RRc': 7,
 'RRLyr-RRd': 8,
 'T2Cep-BLHer': 9,
 'T2Cep-RVTau': 10,
 'T2Cep-WVir': 11}

----

**Simplified Classes**

{'': 0, 'Cep': 1, 'LPV': 2, 'RRLyr': 3, 'T2Cep': 4}

Removing ['Gskew', 'Period_fit', 'StetsonK'] because null
Removing ['Freq1_harmonics_rel_phase_0', 'Freq2_harmonics_rel_phase_0', 'Freq3_harmonics_rel_phase_0'] because lowvariance
('Total features:', 55)


In [11]:
# Tag a copy of every per-tile frame with its tile name, then stack all of
# them into a single frame. `tclasses` maps each tile name to its position
# in the iteration order of `data`.
tile = container.Container()
tclasses = {}
for idx, (k, v) in enumerate(data.items()):
    tclasses[k] = idx
    tagged = v.copy()  # leave the frames held in `data` untouched
    tagged.loc[:, "tcls"] = k
    tile[k] = tagged
tile = pd.concat(tile.values())
tile.columns

Index([u'id', u'ogle3_type', u'cnt', u'Amplitude', u'AndersonDarling',
       u'Autocor_length', u'Beyond1Std', u'CAR_mean', u'CAR_sigma', u'CAR_tau',
       u'Con', u'Eta_e', u'FluxPercentileRatioMid20',
       u'FluxPercentileRatioMid35', u'FluxPercentileRatioMid50',
       u'FluxPercentileRatioMid65', u'FluxPercentileRatioMid80',
       u'Freq1_harmonics_amplitude_0', u'Freq1_harmonics_amplitude_1',
       u'Freq1_harmonics_amplitude_2', u'Freq1_harmonics_amplitude_3',
       u'Freq1_harmonics_rel_phase_0', u'Freq1_harmonics_rel_phase_1',
       u'Freq1_harmonics_rel_phase_2', u'Freq1_harmonics_rel_phase_3',
       u'Freq2_harmonics_amplitude_0', u'Freq2_harmonics_amplitude_1',
       u'Freq2_harmonics_amplitude_2', u'Freq2_harmonics_amplitude_3',
       u'Freq2_harmonics_rel_phase_0', u'Freq2_harmonics_rel_phase_1',
       u'Freq2_harmonics_rel_phase_2', u'Freq2_harmonics_rel_phase_3',
       u'Freq3_harmonics_amplitude_0', u'Freq3_harmonics_amplitude_1',
       u'Freq3_harmonics_a

In [19]:
# Work on a copy so the unscaled values in `tile` stay available for comparison.
ntile = tile.copy()

In [20]:
# Standardize the selected feature columns; the scaler is fitted on the
# unscaled `tile` values, so re-running this cell gives the same result.
# NOTE(review): in the stored `ntile.head(8)` output, AmplitudeJH and
# AmplitudeJK show exactly the same scaled values as Amplitude, while the
# unscaled `tile.head(8)` output has three distinct values per row — verify
# that the column-wise assignment below lands each column where expected.
scaler = prp.StandardScaler()
ntile[X_columns] = scaler.fit_transform(tile[X_columns])

In [22]:
# Preview the first rows of the scaled frame.
ntile.head(8)

Unnamed: 0,id,ogle3_type,cnt,Amplitude,AndersonDarling,Autocor_length,Beyond1Std,CAR_mean,CAR_sigma,CAR_tau,...,scls_h,scls_j,scls_k,AmplitudeJH,AmplitudeJK,cls,real_cls,scls,real_scls,tcls
0,32620000299758,T2Cep-WVir,-1.152896,-0.206082,1.0,-0.185372,1.061573,-0.504527,-0.501152,-0.007604,...,-1,-1,-1,-0.206082,-0.206082,11,11,4,4,b262
1,32620000243907,T2Cep-BLHer,-0.052913,-0.444699,1.0,2.768713,0.739498,-0.504304,-0.513117,-0.007604,...,-1,-1,-1,-0.444699,-0.444699,9,9,4,4,b262
2,32620000410013,T2Cep-BLHer,-0.052913,-0.419507,1.0,2.768713,1.383649,-0.504289,-0.514409,-0.007604,...,-1,-1,-1,-0.419507,-0.419507,9,9,4,4,b262
3,32620000644245,RRLyr-RRd,-0.052913,-0.798776,0.939799,2.768713,0.868328,-0.502645,-0.5118,-0.007604,...,-1,-1,-1,-0.798776,-0.798776,8,8,3,3,b262
4,32620000150949,RRLyr-RRd,-0.052913,-0.846359,0.994361,2.768713,1.254819,-0.503532,-0.501692,-0.007604,...,-1,-1,-1,-0.846359,-0.846359,8,8,3,3,b262
5,32620000002448,RRLyr-RRab,-0.088396,-0.595846,0.998126,2.768713,0.972871,-0.503374,-0.513592,-0.007604,...,-1,-1,-1,-0.595846,-0.595846,6,6,3,3,b262
6,32620000002625,RRLyr-RRab,-0.159363,0.702902,1.0,-0.185372,-0.296602,4.474785,1.6324,-0.007604,...,-1,-1,-1,0.702902,0.702902,6,6,3,3,b262
7,32620000003737,LPV-OSARG,-0.052913,1.850503,1.0,-0.185372,-1.515029,1.23855,2.597137,-0.007604,...,-1,-1,-1,1.850503,1.850503,4,4,2,2,b262


In [23]:
# Preview the same rows of the unscaled frame for comparison.
tile.head(8)

Unnamed: 0,id,ogle3_type,cnt,Amplitude,AndersonDarling,Autocor_length,Beyond1Std,CAR_mean,CAR_sigma,CAR_tau,...,scls_h,scls_j,scls_k,AmplitudeJH,AmplitudeJK,cls,real_cls,scls,real_scls,tcls
0,32620000299758,T2Cep-WVir,62,0.31025,1.0,1.0,0.322581,4.007449,0.175898,2.779502,...,-1,-1,-1,0.123987,0.13865,11,11,4,4,b262
1,32620000243907,T2Cep-BLHer,124,0.225,1.0,2.0,0.282258,29.474036,-0.308196,0.452656,...,-1,-1,-1,-0.04225,-0.083,9,9,4,4,b262
2,32620000410013,T2Cep-BLHer,124,0.234,1.0,2.0,0.362903,31.224994,-0.360484,0.436031,...,-1,-1,-1,-0.0247,-0.0596,9,9,4,4,b262
3,32620000644245,RRLyr-RRd,124,0.0985,0.939799,2.0,0.298387,219.37929,-0.254907,0.062734,...,-1,-1,-1,-0.288925,-0.4119,8,8,3,3,b262
4,32620000150949,RRLyr-RRd,124,0.0815,0.994361,2.0,0.346774,117.847367,0.15403,0.118285,...,-1,-1,-1,-0.322075,-0.4561,8,8,3,3,b262
5,32620000002448,RRLyr-RRab,122,0.171,0.998126,2.0,0.311475,136.009559,-0.327431,0.108351,...,-1,-1,-1,-0.14755,-0.2234,6,6,3,3,b262
6,32620000002625,RRLyr-RRab,118,0.635,1.0,1.0,0.152542,569891.13844,86.495985,2.5e-05,...,-1,-1,-1,0.75725,0.983,6,6,3,3,b262
7,32620000003737,LPV-OSARG,124,1.045,1.0,1.0,0.0,199500.849759,125.527743,4.9e-05,...,-1,-1,-1,1.55675,2.049,4,4,2,2,b262


In [26]:
# Start the list of columns to persist from the selected feature names.
to_store = list(X_columns)

In [27]:
# Display the current list of columns to persist.
to_store

['cnt',
 'Amplitude',
 'Autocor_length',
 'Beyond1Std',
 'CAR_mean',
 'CAR_sigma',
 'CAR_tau',
 'Con',
 'Eta_e',
 'FluxPercentileRatioMid20',
 'FluxPercentileRatioMid35',
 'FluxPercentileRatioMid50',
 'FluxPercentileRatioMid65',
 'FluxPercentileRatioMid80',
 'Freq1_harmonics_amplitude_0',
 'Freq1_harmonics_amplitude_1',
 'Freq1_harmonics_amplitude_2',
 'Freq1_harmonics_amplitude_3',
 'Freq1_harmonics_rel_phase_1',
 'Freq1_harmonics_rel_phase_2',
 'Freq1_harmonics_rel_phase_3',
 'LinearTrend',
 'MaxSlope',
 'Mean',
 'Meanvariance',
 'MedianAbsDev',
 'MedianBRP',
 'PairSlopeTrend',
 'PercentAmplitude',
 'PercentDifferenceFluxPercentile',
 'PeriodLS',
 'Psi_CS',
 'Psi_eta',
 'Q31',
 'Rcs',
 'Skew',
 'SmallKurtosis',
 'Std',
 'c89_jk_color',
 'c89_hk_color',
 'c89_jh_color',
 'n09_jk_color',
 'n09_hk_color',
 'n09_jh_color',
 'c89_m2',
 'c89_m4',
 'c89_c3',
 'n09_m2',
 'n09_m4',
 'n09_c3',
 'AmplitudeH',
 'AmplitudeJ',
 'ppmb',
 'AmplitudeJH',
 'AmplitudeJK']

In [28]:
# List every column of the scaled frame to pick the non-feature ones to keep.
ntile.columns

Index([u'id', u'ogle3_type', u'cnt', u'Amplitude', u'AndersonDarling',
       u'Autocor_length', u'Beyond1Std', u'CAR_mean', u'CAR_sigma', u'CAR_tau',
       u'Con', u'Eta_e', u'FluxPercentileRatioMid20',
       u'FluxPercentileRatioMid35', u'FluxPercentileRatioMid50',
       u'FluxPercentileRatioMid65', u'FluxPercentileRatioMid80',
       u'Freq1_harmonics_amplitude_0', u'Freq1_harmonics_amplitude_1',
       u'Freq1_harmonics_amplitude_2', u'Freq1_harmonics_amplitude_3',
       u'Freq1_harmonics_rel_phase_0', u'Freq1_harmonics_rel_phase_1',
       u'Freq1_harmonics_rel_phase_2', u'Freq1_harmonics_rel_phase_3',
       u'Freq2_harmonics_amplitude_0', u'Freq2_harmonics_amplitude_1',
       u'Freq2_harmonics_amplitude_2', u'Freq2_harmonics_amplitude_3',
       u'Freq2_harmonics_rel_phase_0', u'Freq2_harmonics_rel_phase_1',
       u'Freq2_harmonics_rel_phase_2', u'Freq2_harmonics_rel_phase_3',
       u'Freq3_harmonics_amplitude_0', u'Freq3_harmonics_amplitude_1',
       u'Freq3_harmonics_a

In [33]:
# Columns to persist: the id, the selected features, then the label and
# bookkeeping columns. Built from X_columns instead of the previous value of
# `to_store`, so re-running this cell cannot add "id" and the label columns
# a second time (which would yield duplicated columns in the selection).
to_store = ["id"] + list(X_columns) + ["ogle3_type", 'scls_h', u'scls_j', u'scls_k', u'cls', u'real_cls', u'scls', u'real_scls', u'tcls']

In [35]:
# Keep only the columns listed in `to_store`, in that order.
# Note: this rebinds `df`, the name also used as the loop variable in the loading cell.
df = ntile[to_store]

In [36]:
# Persist the selected columns as a pickle at the relative path "data/normalized".
# NOTE(review): the output recorded for this cell is a NameError for `dt`,
# a name this code does not use — the output looks stale; re-run the cell to
# confirm the file is actually written.
df.to_pickle("data/normalized")

NameError: name 'dt' is not defined