In [68]:
from __future__ import print_function

import itertools as it

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

import pickle

import numpy as np

import pandas as pd

% matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set()

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

import joblib

from libs.container import Container
from libs.display import d
from libs.experiment import KFoldExperiment, WithAnotherExperiment, roc, metrics

In [3]:
# CPU count as reported by joblib; passed to the random forest as `n_jobs`.
cpu = joblib.cpu_count()

In [25]:
# Load the scaled sample and derive each row's tile name: "b" followed by
# characters 1-3 of the id's string form.
sample = pd.read_pickle("data/scaled/sample.pkl.bz2")
sample["tile"] = sample["id"].apply(lambda src_id: "b" + str(src_id)[1:4])

# Every column that is not listed here is treated as a feature column.
no_features = ["id", "vs_catalog", "vs_type", "ra_k", "dec_k", "tile", "cls"]
X_columns = [col for col in sample.columns if col not in no_features]

# Split into one independent frame per tile, keyed by tile name.
grouped = sample.groupby("tile")
data = Container(
    {tile: grouped.get_group(tile).copy() for tile in grouped.groups})

# Only the per-tile copies are kept; drop the big intermediates.
del grouped, sample

In [21]:
# Keyword arguments forwarded to RandomForestClassifier in every experiment.
# `random_state` is pinned so that re-running the notebook grows the same
# forests; left unset, each run uses fresh randomness and the reported scores
# cannot be reproduced.
# NOTE(review): fold assignment inside KFoldExperiment may have its own source
# of randomness -- not visible from this notebook, confirm separately.
RF_PARAMS = {
    'max_features': None, 'min_samples_split': 10, 'n_jobs': cpu,
    'criterion': 'entropy', 'n_estimators': 500, 'random_state': 42}

# Feature subset handed to the experiments as `X_columns`.
sX_columns = [
    'Beyond1Std',
    'Eta_e',
    'Freq1_harmonics_amplitude_0',
    'LinearTrend',
    'MaxSlope',
    'Mean',
    'Meanvariance',
    'Psi_eta',
    'Rcs',
    'c89_m2',
    'cnt',
    'n09_c3',
    'n09_hk_color',
    'n09_m2']

In [26]:
# This cell rebinds `data` from per-tile frames to per-pair frames, so running
# it a second time would pair up the pairs (keys with several "_") and break
# every later `key.split("_")`. Fail loudly instead of silently building that.
if any("_" in key for key in data.keys()):
    raise RuntimeError(
        "`data` already holds tile pairs; re-run the loading cell first.")

# One labelled frame per unordered pair of tiles, keyed "<tileA>_<tileB>".
combs = {}
for ta, tb in it.combinations(data.keys(), 2):
    k = "{}_{}".format(ta, tb)
    df = pd.concat([data[ta], data[tb]])

    # Label rows by tile in order of first appearance: the rows of `ta` come
    # first in the concatenation and get 0, the rows of `tb` get 1.
    cls = {name: idx for idx, name in enumerate(df.tile.unique())}
    df["cls"] = df.tile.map(cls)

    combs[k] = df

data = Container(combs)
del combs

# Identity mapping of the two class labels; passed to the experiments as
# `clsnum`. Deliberately replaces the per-pair mapping left over from the loop.
cls = {0: 0, 1: 1}

In [28]:
%%time
results = {}
for c in data.keys():
    print("{} vs {}".format(*c.split("_")))
    rf = KFoldExperiment(
        clf=RandomForestClassifier(**RF_PARAMS), clsnum=cls, 
        data=data, pcls=1, ncls=0, X_columns=sX_columns, y_column="cls")
    rf = rf(c, nfolds=10)
    results[c] = rf

b396 vs b264
              precision    recall  f1-score   support

         0.0       0.88      0.90      0.89       999
         1.0       0.89      0.87      0.88      1000

   micro avg       0.89      0.89      0.89      1999
   macro avg       0.89      0.89      0.89      1999
weighted avg       0.89      0.89      0.89      1999

--------------------------------------------------------------------------------
b261 vs b248
              precision    recall  f1-score   support

         0.0       0.85      0.88      0.87       998
         1.0       0.88      0.84      0.86       998

   micro avg       0.86      0.86      0.86      1996
   macro avg       0.87      0.86      0.86      1996
weighted avg       0.87      0.86      0.86      1996

--------------------------------------------------------------------------------
b248 vs b220
              precision    recall  f1-score   support

         0.0       0.89      0.90      0.90       998
         1.0       0.90      0.89   

In [43]:
# Recover the distinct tile names from the "<tileA>_<tileB>" pair keys.
all_tiles = {tile for pair_key in data.keys() for tile in pair_key.split("_")}
all_tiles

{'b220',
 'b234',
 'b247',
 'b248',
 'b261',
 'b262',
 'b263',
 'b264',
 'b277',
 'b278',
 'b396'}

In [57]:
# Show the tile names in sorted order (the set above has no defined order).
sorted(all_tiles)

['b220',
 'b234',
 'b247',
 'b248',
 'b261',
 'b262',
 'b263',
 'b264',
 'b277',
 'b278',
 'b396']

In [71]:
# One summary record per tile pair, visited in sorted pair-key order.
rows = []
for k in sorted(results):
    r = results[k]
    ta, tb = k.split("_")
    rows.append({
        "Tile A": ta,
        "Tile B": tb,
        "Prec.": metrics.precision_score(r.y_test, r.predictions),
        "Recall": metrics.recall_score(r.y_test, r.predictions),
        "AUC": r.roc_auc})

In [73]:
# Tabulate the per-pair scores and fix the column order for display.
df = pd.DataFrame(rows)
df = df.loc[:, ["Tile A", "Tile B", "Prec.", "Recall", "AUC"]]
df

Unnamed: 0,Tile A,Tile B,Prec.,Recall,AUC
0,b234,b220,0.909828,0.898,0.970149
1,b234,b277,0.90628,0.941767,0.976709
2,b234,b278,0.909268,0.937626,0.978964
3,b247,b220,0.918511,0.913,0.974521
4,b247,b234,0.71752,0.729,0.822018
5,b247,b264,0.898785,0.888,0.964052
6,b247,b277,0.880383,0.923695,0.968849
7,b247,b278,0.892822,0.913481,0.97179
8,b248,b220,0.898288,0.892,0.963729
9,b248,b234,0.722763,0.743,0.813862


In [74]:
# Summary statistics of the numeric score columns across all tile pairs.
df.describe()

Unnamed: 0,Prec.,Recall,AUC
count,55.0,55.0,55.0
mean,0.868842,0.870562,0.939211
std,0.074306,0.077063,0.073105
min,0.568089,0.559,0.591133
25%,0.855524,0.871,0.945619
50%,0.892822,0.893,0.963729
75%,0.910424,0.919598,0.971635
max,0.947531,0.946787,0.983777
