# Clasificaciones usando muestras de tamaño 2500, 5000 y 20000

In [2]:
import pickle

import numpy as np

import pandas as pd

% matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set()

import joblib

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler

from libs.container import Container
from libs.display import d
from libs.experiment import KFoldExperiment, WithAnotherExperiment, roc, metrics

In [3]:
# Load the `s2_5k` sample and derive the two helper columns:
#   tile -> "b" followed by characters 1..3 of the stringified id
#   cls  -> 0 when vs_type is the empty string, 1 otherwise
sample = pd.read_pickle("data/o3o4vZ/scaled/s2_5k.pkl.bz2")
sample["tile"] = sample["id"].apply(lambda star_id: "b" + str(star_id)[1:4])
sample["cls"] = sample.vs_type.apply(lambda vs_type: 1 if vs_type != "" else 0)

# Everything that is not an identifier / label column is used as a feature.
no_features = ["id", "vs_catalog", "vs_type", "ra_k", "dec_k", "tile", "cls"]
X_columns = [column for column in sample.columns if column not in no_features]

# One independent DataFrame copy per tile, keyed by the tile name.
data_small = Container(
    {tile: frame.copy() for tile, frame in sample.groupby("tile")})

# Release the full frame; only the per-tile copies are needed from here on.
del sample

In [4]:
# Load the `s5k` sample and derive the two helper columns:
#   tile -> "b" followed by characters 1..3 of the stringified id
#   cls  -> 0 when vs_type is the empty string, 1 otherwise
sample = pd.read_pickle("data/o3o4vZ/scaled/s5k.pkl.bz2")
sample["tile"] = sample["id"].apply(lambda star_id: "b" + str(star_id)[1:4])
sample["cls"] = sample.vs_type.apply(lambda vs_type: 1 if vs_type != "" else 0)

# One independent DataFrame copy per tile, keyed by the tile name.
data_mid = Container(
    {tile: frame.copy() for tile, frame in sample.groupby("tile")})

# Release the full frame; only the per-tile copies are needed from here on.
del sample

In [5]:
# Load the `s20k` sample and derive the two helper columns:
#   tile -> "b" followed by characters 1..3 of the stringified id
#   cls  -> 0 when vs_type is the empty string, 1 otherwise
sample = pd.read_pickle("data/o3o4vZ/scaled/s20k.pkl.bz2")
sample["tile"] = sample["id"].apply(lambda star_id: "b" + str(star_id)[1:4])
sample["cls"] = sample.vs_type.apply(lambda vs_type: 1 if vs_type != "" else 0)

# One independent DataFrame copy per tile, keyed by the tile name.
data_big = Container(
    {tile: frame.copy() for tile, frame in sample.groupby("tile")})

# Release the full frame; only the per-tile copies are needed from here on.
del sample

## Features

In [6]:
# Accumulator for the metrics of each sample; filled in by the cells that run the experiments.
results = dict()

In [7]:
# Identity mapping over the two class labels; handed to the experiments as `clsnum`.
cls = {label: label for label in (0, 1)}

In [8]:
# Show the feature column names with the project's display helper
# (the captured output below is a numbered list of the 59 features).
d(X_columns)

1. Amplitude
2. AmplitudeH
3. AmplitudeJ
4. AmplitudeJH
5. AmplitudeJK
6. AndersonDarling
7. Autocor_length
8. Beyond1Std
9. CAR_mean
10. CAR_sigma
11. CAR_tau
12. Con
13. Eta_e
14. FluxPercentileRatioMid20
15. FluxPercentileRatioMid35
16. FluxPercentileRatioMid50
17. FluxPercentileRatioMid65
18. FluxPercentileRatioMid80
19. Freq1_harmonics_amplitude_0
20. Freq1_harmonics_amplitude_1
21. Freq1_harmonics_amplitude_2
22. Freq1_harmonics_amplitude_3
23. Freq1_harmonics_rel_phase_0
24. Freq1_harmonics_rel_phase_1
25. Freq1_harmonics_rel_phase_2
26. Freq1_harmonics_rel_phase_3
27. LinearTrend
28. MaxSlope
29. Mean
30. Meanvariance
31. MedianAbsDev
32. MedianBRP
33. PairSlopeTrend
34. PercentAmplitude
35. PercentDifferenceFluxPercentile
36. PeriodLS
37. Period_fit
38. Psi_CS
39. Psi_eta
40. Q31
41. Rcs
42. Skew
43. SmallKurtosis
44. Std
45. StetsonK
46. c89_c3
47. c89_hk_color
48. c89_jh_color
49. c89_jk_color
50. c89_m2
51. c89_m4
52. cnt
53. n09_c3
54. n09_hk_color
55. n09_jh_color
56. n09_jk_color
57. n09_m2
58. n09_m4
59. ppmb

In [9]:
def get_metrics(kf, vss):
    """Collect precision and recall for a k-fold run and a set of test runs.

    Parameters
    ----------
    kf : object
        Experiment result exposing ``y_test`` and ``predictions``.
    vss : iterable
        Experiment results, each exposing ``y_test``, ``predictions`` and
        ``test_name``.

    Returns
    -------
    Container
        ``kfold`` holds the ``(precision, recall)`` pair of ``kf``; ``vss``
        is a Container mapping every ``test_name`` to its own
        ``(precision, recall)`` pair.
    """
    kfold_prec = metrics.precision_score(kf.y_test, kf.predictions)
    kfold_recall = metrics.recall_score(kf.y_test, kf.predictions)

    m = Container(kfold=(kfold_prec, kfold_recall), vss=Container())

    for vs in vss:
        prec = metrics.precision_score(vs.y_test, vs.predictions)
        # The recall slot must be filled by recall_score, exactly as in the
        # k-fold branch above; using precision_score here would store the
        # precision twice.
        recall = metrics.recall_score(vs.y_test, vs.predictions)
        m.vss[vs.test_name] = (prec, recall)

    return m


def run(train, data):
    """Run the k-fold and the "with another" experiments for one training key.

    Both experiments use a fresh 500-tree, entropy-criterion random forest
    and the module-level ``cls`` and ``X_columns``.

    Parameters
    ----------
    train : str
        Key of ``data`` used for training (a tile name in this notebook).
    data : Container
        Per-tile data handed to both experiments.

    Returns
    -------
    tuple
        ``(train, metrics)`` where ``metrics`` is the result of
        ``get_metrics`` for the two experiment outputs.
    """
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(">>>> Kfolding {} <<<<".format(train))
    kf = KFoldExperiment(
        clf=RandomForestClassifier(n_estimators=500, criterion="entropy"), clsnum=cls,
        data=data, pcls=1, ncls=0, X_columns=X_columns, y_column="cls", verbose=False)
    kf = kf(train, nfolds=10)

    print(">>>> Vs {}<<<<".format(train))
    vs = WithAnotherExperiment(
        clf=RandomForestClassifier(n_estimators=500, criterion="entropy"), verbose=False,
        clsnum=cls, data=data, pcls=1, ncls=0, X_columns=X_columns, y_column="cls")
    # NOTE(review): presumably yields one result per other tile (get_metrics
    # iterates it and reads `test_name`) -- confirm against libs.experiment.
    vs = vs(train)

    return train, get_metrics(kf, vs)

In [11]:
# Leave two cores free, but never drop below one worker: a value of 0 is
# rejected by joblib.Parallel and negative values are interpreted relative to
# the total CPU count instead of as a worker count.
cpu = max(1, joblib.cpu_count() - 2)

In [12]:
# One `run` task per tile of the small sample, dispatched in sorted tile order.
small_tasks = [
    joblib.delayed(run)(tile, data_small)
    for tile in sorted(data_small.keys())]
with joblib.Parallel(n_jobs=cpu) as jobs:
    result = jobs(small_tasks)
# `run` returns (tile, metrics) pairs, so dict() keys the metrics by tile.
results["small"] = dict(result)

>>>> Kfolding b234 <<<<
>>>> Kfolding b247 <<<<
>>>> Kfolding b248 <<<<
>>>> Kfolding b261 <<<<
>>>> Kfolding b262 <<<<
>>>> Kfolding b263 <<<<
>>>> Kfolding b264 <<<<
>>>> Kfolding b277 <<<<
>>>> Kfolding b278 <<<<
>>>> Vs b247<<<<
>>>> Vs b234<<<<
>>>> Vs b248<<<<
>>>> Vs b261<<<<
>>>> Vs b264<<<<
>>>> Vs b262<<<<
>>>> Vs b263<<<<
>>>> Vs b278<<<<
>>>> Vs b277<<<<


In [13]:
# One `run` task per tile of the mid sample, dispatched in sorted tile order.
mid_tasks = [
    joblib.delayed(run)(tile, data_mid)
    for tile in sorted(data_mid.keys())]
with joblib.Parallel(n_jobs=cpu) as jobs:
    result = jobs(mid_tasks)
# `run` returns (tile, metrics) pairs, so dict() keys the metrics by tile.
results["mid"] = dict(result)

>>>> Kfolding b234 <<<<
>>>> Kfolding b247 <<<<
>>>> Kfolding b248 <<<<
>>>> Kfolding b261 <<<<
>>>> Kfolding b262 <<<<
>>>> Kfolding b263 <<<<
>>>> Kfolding b264 <<<<
>>>> Kfolding b277 <<<<
>>>> Kfolding b278 <<<<
>>>> Vs b234<<<<
>>>> Vs b247<<<<
>>>> Vs b264<<<<
>>>> Vs b248<<<<
>>>> Vs b261<<<<
>>>> Vs b262<<<<
>>>> Vs b263<<<<
>>>> Vs b278<<<<
>>>> Vs b277<<<<


In [14]:
# One `run` task per tile of the big sample, dispatched in sorted tile order.
big_tasks = [
    joblib.delayed(run)(tile, data_big)
    for tile in sorted(data_big.keys())]
with joblib.Parallel(n_jobs=cpu) as jobs:
    result = jobs(big_tasks)
# `run` returns (tile, metrics) pairs, so dict() keys the metrics by tile.
results["big"] = dict(result)

>>>> Kfolding b234 <<<<
>>>> Kfolding b247 <<<<
>>>> Kfolding b248 <<<<
>>>> Kfolding b261 <<<<
>>>> Kfolding b262 <<<<
>>>> Kfolding b263 <<<<
>>>> Kfolding b264 <<<<
>>>> Kfolding b277 <<<<
>>>> Kfolding b278 <<<<
>>>> Vs b234<<<<
>>>> Vs b247<<<<
>>>> Vs b264<<<<
>>>> Vs b261<<<<
>>>> Vs b263<<<<
>>>> Vs b248<<<<
>>>> Vs b262<<<<
>>>> Vs b278<<<<
>>>> Vs b277<<<<


In [20]:
# Persist the collected metrics; the dict is wrapped in a one-element list
# before being handed to np.save.
# NOTE(review): this is presumably stored as a pickled object array, so
# loading it back likely needs np.load(..., allow_pickle=True) -- confirm.
np.save("data/o3o4vZ/row_or_column/results.npy", [results])