# Train on OGLE-3 and see what it detects in OGLE-4

In [2]:
import pickle

import numpy as np

import pandas as pd

% matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set()

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler

from libs.container import Container
from libs.display import d
from libs.experiment import KFoldExperiment, WithAnotherExperiment, roc

In [4]:
# Catalogue table used by o4_types(), which looks rows up by `id` and reads
# their `vs_type`.
all_cat_types = pd.read_pickle("data/ogle3_only/all_cat_types.pkl")

def o4_types(i):
    """Return a numeric class code for the source whose id is ``i``.

    The ``vs_type`` of the first row of ``all_cat_types`` with a matching
    ``id`` decides the code:

    * ``0`` -- ``vs_type`` is the empty string,
    * ``1`` -- ``vs_type`` starts with ``"RRLyr-"``,
    * ``2`` -- anything else.

    Raises ``IndexError`` when no row has that id.
    """
    matching = all_cat_types.loc[all_cat_types.id == i, "vs_type"]
    vs_type = matching.values[0]
    if vs_type == "":
        return 0
    return 1 if vs_type.startswith("RRLyr-") else 2

class ResultCollector(object):
    """Cache, per experiment, the share of test rows that are false positives
    in the experiment labels but variable stars according to ``o4_types``.

    Results live in ``self.r``, keyed by ``(size, train, test)``; every value
    is the tuple ``(prop_vstars, prop_rrlyrae)``.
    """

    def __init__(self):
        # (size, train, test) -> (prop_vstars, prop_rrlyrae)
        self.r = {}

    def prop_fp(self, result, size, train, test, force=False):
        """Return ``(prop_vstars, prop_rrlyrae)`` for one experiment result.

        ``result`` must expose ``y_test_real``, ``predictions`` and ``ids``.
        Rows predicted as class 1 whose real class is 0 (false positives) are
        re-labelled with ``o4_types``; the two returned numbers are the count
        of those rows with a non-zero code and the count with code 1, each
        divided by the total number of rows in the result (not by the number
        of false positives).

        The value is computed once per ``(size, train, test)`` key and then
        served from ``self.r``; pass ``force=True`` to recompute. A result
        with no rows yields ``(0.0, 0.0)`` instead of dividing by zero.
        """
        key = size, train, test

        if force or key not in self.r:
            df = pd.DataFrame({
                "o3_cls": result.y_test_real,
                "o3_pred": result.predictions,
                "id": result.ids})

            total = len(df)
            if total == 0:
                # Nothing was tested, so there is nothing to take a
                # proportion of.
                self.r[key] = 0.0, 0.0
                return self.r[key]

            # Copy the selection so that adding a column below writes to an
            # independent frame rather than to a slice of `df`.
            false_pos = df[(df.o3_pred == 1) & (df.o3_cls == 0)].copy()
            false_pos["o4_cls"] = false_pos.id.apply(o4_types)

            vstars = false_pos[false_pos.o4_cls != 0]
            rr = false_pos[false_pos.o4_cls == 1]

            prop_vstars = len(vstars) / float(total)
            prop_rrlyrae = len(rr) / float(total)
            self.r[key] = prop_vstars, prop_rrlyrae

        return self.r[key]

# Single collector shared by every experiment cell below; results accumulate
# in collector.r.
collector = ResultCollector()

In [3]:
# Load the s20k sample, add the derived columns and split it by tile.
sample = pd.read_pickle("data/ogle3_only/scaled/s20k.pkl")
sample["o4_type"] = sample.id.apply(o4_types)
# Tile name: "b" followed by characters 1..3 of the id's string form.
sample["tile"] = sample["id"].map(lambda src_id: "b" + str(src_id)[1:4])
# Binary target: 0 when vs_type is empty, 1 otherwise.
sample["cls"] = sample.vs_type.map(lambda vs_type: 0 if vs_type == "" else 1)

# Columns that are identifiers, labels or derived here, not model features.
no_features = [
    "id", "vs_catalog", "vs_type",
    "ra_k", "dec_k", "tile", "cls", "o4_type"]
X_columns = [col for col in sample.columns if col not in no_features]

grouped = sample.groupby("tile")
# One independent copy of each tile's rows, keyed by tile name.
data_big = Container({tile: frame.copy() for tile, frame in grouped})

del grouped, sample

In [4]:
# Load the s5k sample, add the derived columns and split it by tile.
sample = pd.read_pickle("data/ogle3_only/scaled/s5k.pkl")
sample["o4_type"] = sample.id.apply(o4_types)
# Tile name: "b" followed by characters 1..3 of the id's string form.
sample["tile"] = sample["id"].map(lambda src_id: "b" + str(src_id)[1:4])
# Binary target: 0 when vs_type is empty, 1 otherwise.
sample["cls"] = sample.vs_type.map(lambda vs_type: 0 if vs_type == "" else 1)

grouped = sample.groupby("tile")
# One independent copy of each tile's rows, keyed by tile name.
data_mid = Container({tile: frame.copy() for tile, frame in grouped})

del grouped, sample

In [5]:
# Load the s2_5k sample, add the derived columns and split it by tile.
sample = pd.read_pickle("data/ogle3_only/scaled/s2_5k.pkl")
sample["o4_type"] = sample.id.apply(o4_types)
# Tile name: "b" followed by characters 1..3 of the id's string form.
sample["tile"] = sample["id"].map(lambda src_id: "b" + str(src_id)[1:4])
# Binary target: 0 when vs_type is empty, 1 otherwise.
sample["cls"] = sample.vs_type.map(lambda vs_type: 0 if vs_type == "" else 1)

grouped = sample.groupby("tile")
# One independent copy of each tile's rows, keyed by tile name.
data_small = Container({tile: frame.copy() for tile, frame in grouped})

del grouped, sample

In [6]:
# Identity class mapping handed to the experiments as `clsnum`.
cls = {0: 0, 1: 1}

## Small

In [7]:
%%time
# Run every train/test combination on the "small" sample and store the
# false-positive proportions in `collector`.
data, data_name = data_small, "small"

# Keyword arguments shared by all experiments of this cell.
# NOTE(review): the forests are built without a random_state, so a rerun is
# not guaranteed to reproduce the recorded numbers exactly.
common = dict(
    clsnum=cls, data=data, pcls=1, ncls=0,
    X_columns=X_columns, y_column="cls")

# Cross-validation with nfolds=10 on tile b278.
rf = KFoldExperiment(
    clf=RandomForestClassifier(n_estimators=500, criterion="entropy"),
    **common)
rf = rf("b278", nfolds=10)
collector.prop_fp(rf, data_name, "b278", "k-fold")

# (tiles to train on, label used as the `train` part of the collector key).
# Each run yields one result per test set, named by result.test_name.
trainings = [
    ("b278", "b278"),
    ("b261", "b261"),
    (["b278", "b261"], "b261 + b278"),
    (["b278", "b261", "b264"], "b261 + b264 + b278"),
]
for train_tiles, train_label in trainings:
    rf = WithAnotherExperiment(
        clf=RandomForestClassifier(n_estimators=500, criterion="entropy"),
        **common)
    rf = rf(train_tiles)
    for result in rf:
        collector.prop_fp(result, data_name, train_label, result.test_name)

             precision    recall  f1-score   support

        0.0       0.98      0.99      0.98      2489
        1.0       0.94      0.85      0.89       423

avg / total       0.97      0.97      0.97      2912

--------------------------------------------------------------------------------
b278 (TRAIN) Vs. b262 (TEST)
             precision    recall  f1-score   support

          0       0.98      1.00      0.99      2495
          1       0.97      0.85      0.90       296

avg / total       0.98      0.98      0.98      2791

--------------------------------------------------------------------------------
b278 (TRAIN) Vs. b263 (TEST)
             precision    recall  f1-score   support

          0       0.98      1.00      0.99      2500
          1       0.96      0.86      0.91       305

avg / total       0.98      0.98      0.98      2805

--------------------------------------------------------------------------------
b278 (TRAIN) Vs. b261 (TEST)
             precision   

## Medium

In [8]:
%%time
# Run every train/test combination on the "mid" sample and store the
# false-positive proportions in `collector`.
data, data_name = data_mid, "mid"

# Keyword arguments shared by all experiments of this cell.
# NOTE(review): the forests are built without a random_state, so a rerun is
# not guaranteed to reproduce the recorded numbers exactly.
common = dict(
    clsnum=cls, data=data, pcls=1, ncls=0,
    X_columns=X_columns, y_column="cls")

# Cross-validation with nfolds=10 on tile b278.
rf = KFoldExperiment(
    clf=RandomForestClassifier(n_estimators=500, criterion="entropy"),
    **common)
rf = rf("b278", nfolds=10)
collector.prop_fp(rf, data_name, "b278", "k-fold")

# (tiles to train on, label used as the `train` part of the collector key).
# Each run yields one result per test set, named by result.test_name.
trainings = [
    ("b278", "b278"),
    ("b261", "b261"),
    (["b278", "b261"], "b261 + b278"),
    (["b278", "b261", "b264"], "b261 + b264 + b278"),
]
for train_tiles, train_label in trainings:
    rf = WithAnotherExperiment(
        clf=RandomForestClassifier(n_estimators=500, criterion="entropy"),
        **common)
    rf = rf(train_tiles)
    for result in rf:
        collector.prop_fp(result, data_name, train_label, result.test_name)

             precision    recall  f1-score   support

        0.0       0.98      0.99      0.99      4983
        1.0       0.92      0.78      0.85       423

avg / total       0.98      0.98      0.98      5406

--------------------------------------------------------------------------------
b278 (TRAIN) Vs. b262 (TEST)
             precision    recall  f1-score   support

          0       0.99      1.00      0.99      4992
          1       0.97      0.79      0.87       296

avg / total       0.99      0.99      0.99      5288

--------------------------------------------------------------------------------
b278 (TRAIN) Vs. b263 (TEST)
             precision    recall  f1-score   support

          0       0.99      1.00      0.99      4997
          1       0.97      0.78      0.86       305

avg / total       0.99      0.99      0.98      5302

--------------------------------------------------------------------------------
b278 (TRAIN) Vs. b261 (TEST)
             precision   

## Big

In [9]:
%%time
# Run every train/test combination on the "big" sample and store the
# false-positive proportions in `collector`.
data, data_name = data_big, "big"

# Keyword arguments shared by all experiments of this cell.
# NOTE(review): the forests are built without a random_state, so a rerun is
# not guaranteed to reproduce the recorded numbers exactly.
common = dict(
    clsnum=cls, data=data, pcls=1, ncls=0,
    X_columns=X_columns, y_column="cls")

# Cross-validation with nfolds=10 on tile b278.
rf = KFoldExperiment(
    clf=RandomForestClassifier(n_estimators=500, criterion="entropy"),
    **common)
rf = rf("b278", nfolds=10)
collector.prop_fp(rf, data_name, "b278", "k-fold")

# (tiles to train on, label used as the `train` part of the collector key).
# Each run yields one result per test set, named by result.test_name.
trainings = [
    ("b278", "b278"),
    ("b261", "b261"),
    (["b278", "b261"], "b261 + b278"),
    (["b278", "b261", "b264"], "b261 + b264 + b278"),
]
for train_tiles, train_label in trainings:
    rf = WithAnotherExperiment(
        clf=RandomForestClassifier(n_estimators=500, criterion="entropy"),
        **common)
    rf = rf(train_tiles)
    for result in rf:
        collector.prop_fp(result, data_name, train_label, result.test_name)

             precision    recall  f1-score   support

        0.0       0.99      1.00      1.00     19931
        1.0       0.95      0.65      0.77       423

avg / total       0.99      0.99      0.99     20354

--------------------------------------------------------------------------------
b278 (TRAIN) Vs. b262 (TEST)
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     19951
          1       0.97      0.67      0.79       296

avg / total       0.99      0.99      0.99     20247

--------------------------------------------------------------------------------
b278 (TRAIN) Vs. b263 (TEST)
             precision    recall  f1-score   support

          0       0.99      1.00      1.00     19988
          1       0.93      0.67      0.77       305

avg / total       0.99      0.99      0.99     20293

--------------------------------------------------------------------------------
b278 (TRAIN) Vs. b261 (TEST)
             precision   

In [19]:
# Persist the collector so the experiments need not be rerun. Wrapping it in
# a list makes NumPy store it as a one-element array.
# NOTE(review): `pickle` is imported at the top of the notebook and would be
# the more direct tool for saving a single Python object.
np.save("coso.npy", [collector])

In [8]:
# Restore the collector saved with np.save. The file holds a pickled Python
# object, which NumPy only loads when allow_pickle=True is given explicitly.
# Unpickling can execute arbitrary code, so only load files you created.
collector = np.load("coso.npy", allow_pickle=True)[0]

In [17]:
# Layout of the results table: one [train label, [test labels]] entry per
# training configuration, in display order. The labels must match the keys
# stored in collector.r.
rows = [
    ["b278", ["k-fold", "b261", "b262", "b263", "b264"]],
    ["b261", ["b262", "b263", "b264", "b278"]],
    ["b261 + b278", ["b262", "b263", "b264"]],
    ["b261 + b264 + b278", ["b262", "b263"]],
]


In [18]:
# Show the cached results: (size, train, test) -> (prop_vstars, prop_rrlyrae).
collector.r

{('big', 'b261', 'b262'): (4.939003309132217e-05, 0.0),
 ('big', 'b261', 'b263'): (0.00039422460947124624, 0.0),
 ('big', 'b261', 'b264'): (0.00024643895707033366, 0.0),
 ('big', 'b261', 'b278'): (0.00029478235236317184, 0.0),
 ('big', 'b261 + b264 + b278', 'b262'): (4.939003309132217e-05, 0.0),
 ('big', 'b261 + b264 + b278', 'b263'): (0.0005420588380229636, 0.0),
 ('big', 'b261 + b278', 'b262'): (4.939003309132217e-05, 0.0),
 ('big', 'b261 + b278', 'b263'): (0.0004927807618390578, 0.0),
 ('big', 'b261 + b278', 'b264'): (0.0002957267484844004, 0.0),
 ('big', 'b278', 'b261'): (0.0001980884464913584, 4.95221116228396e-05),
 ('big', 'b278', 'b262'): (4.939003309132217e-05, 0.0),
 ('big', 'b278', 'b263'): (0.0005420588380229636, 0.0),
 ('big', 'b278', 'b264'): (0.0002957267484844004, 0.0),
 ('big', 'b278', 'k-fold'): (0.00029478235236317184, 0.0),
 ('mid', 'b261', 'b262'): (0.0, 0.0),
 ('mid', 'b261', 'b263'): (0.0003772161448509996, 0.0),
 ('mid', 'b261', 'b264'): (0.0001889644746787604, 

In [46]:
# Flatten `rows` into (train, test) pairs, in table order.
pairs = [(train, test) for train, tests in rows for test in tests]

# One column per sample size holding the first stored proportion
# (prop_vstars) of every (train, test) pair.
columns = {
    size: [collector.r[(size, train, test)][0] for train, test in pairs]
    for size in "small mid big".split()}

# Label columns: the train label appears only on the first line of its group
# and is left blank on the following ones.
trainc, testc = [], []
for train, tests in rows:
    trainc.extend("" if pos else train for pos, _ in enumerate(tests))
    testc.extend(tests)
columns.update(train=trainc, test=testc)
    

In [47]:
# Emit the results table as LaTeX, with columns in a fixed order. The call
# form of print runs on both Python 2 and Python 3.
print(pd.DataFrame(columns)["train test small mid big".split()].to_latex(index=False))

\begin{tabular}{llrrr}
\toprule
              train &    test &     small &       mid &       big \\
\midrule
               b278 &  k-fold &  0.001030 &  0.000555 &  0.000295 \\
                    &    b261 &  0.001472 &  0.000767 &  0.000198 \\
                    &    b262 &  0.000358 &  0.000189 &  0.000049 \\
                    &    b263 &  0.000357 &  0.000377 &  0.000542 \\
                    &    b264 &  0.000716 &  0.000189 &  0.000296 \\
               b261 &    b262 &  0.000358 &  0.000000 &  0.000049 \\
                    &    b263 &  0.000357 &  0.000377 &  0.000394 \\
                    &    b264 &  0.000716 &  0.000189 &  0.000246 \\
                    &    b278 &  0.001030 &  0.000370 &  0.000295 \\
        b261 + b278 &    b262 &  0.000358 &  0.000189 &  0.000049 \\
                    &    b263 &  0.000357 &  0.000377 &  0.000493 \\
                    &    b264 &  0.000716 &  0.000189 &  0.000296 \\
 b261 + b264 + b278 &    b262 &  0.000358 &  0.000189 &  0.000