# Escalado

In [1]:
import pickle

In [2]:
import numpy as np

import pandas as pd

% matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set()

from sklearn.preprocessing import StandardScaler

In [26]:
s20k = pd.read_pickle("data/o3o4vZ/s20K.pkl.bz2")
s2_5k = pd.read_pickle("data/o3o4vZ/s5K.pkl.bz2")
s5k = pd.read_pickle("data/o3o4vZ/s2_5K.pkl.bz2")

## Removemos los features que no queremos


In [6]:
to_drop = [
    'Gskew', # has nan in the rlyrae stars
    "scls_h", "scls_j", "scls_k"  # no nos sirve
] + [s for s in s20k.columns if s.startswith("Freq2_") or s.startswith("Freq3_") ] # only the first period is important
to_keep = [c for c in s20k.columns if c not in to_drop]

s20k = s20k[to_keep]
s5k = s5k[to_keep]
s2_5k = s2_5k = s2_5k[to_keep]

s20k["id"].apply(lambda i: "b" + str(i)[1:4]).unique()

array(['b262', 'b263', 'b261', 'b264', 'b278', 'b247', 'b248', 'b277',
       'b234'], dtype=object)

Removemos filas que tengan un nan en `period_fit` pero antes nos fijamos que ninguna sea una RR-Lyrae

In [7]:
print s20k[s20k.Period_fit.isnull()].vs_type.unique()
print s5k[s5k.Period_fit.isnull()].vs_type.unique()
print s2_5k[s2_5k.Period_fit.isnull()].vs_type.unique()

['ECL-C' '' 'ECL-NC']
['ECL-C' '' 'ECL-NC']
['ECL-C' '' 'ECL-NC']


In [8]:
s20k = s20k.dropna()
s5k = s5k.dropna()
s2_5k = s2_5k.dropna()

In [9]:
s20k["id"].apply(lambda i: "b" + str(i)[1:4]).unique()

array(['b262', 'b263', 'b261', 'b264', 'b278', 'b247', 'b248', 'b277',
       'b234'], dtype=object)

Evaluamos que columnas tienen un valor infinito

In [10]:
for x in s20k.columns:
    if s20k[x].dtype == object:
        continue
    if np.isinf(s20k[x].values).sum() + np.isinf(s5k[x].values).sum() + np.isinf(s2_5k[x].values).sum():
        print x

Period_fit


Como period_fit es un feature que me interesa, verificamos que cantidad de filas son las afectadas
y si hay alguna con RRLyraes

In [11]:
print s2_5k[np.isinf(s2_5k.Period_fit.values)].shape
print s5k[np.isinf(s5k.Period_fit.values)].shape
print s20k[np.isinf(s20k.Period_fit.values)].shape

print s2_5k[np.isinf(s2_5k.Period_fit.values)].vs_type.unique()
print s5k[np.isinf(s5k.Period_fit.values)].vs_type.unique()
print s20k[np.isinf(s20k.Period_fit.values)].vs_type.unique()

(12, 64)
(12, 64)
(43, 64)
['ECL-NC' '']
['ECL-NC' '']
['ECL-NC' '']


Son pocas filas y no hay rrlyraes... las borramos

In [12]:
s20k = s20k[~np.isinf(s20k.Period_fit.values)]
s5k = s5k[~np.isinf(s5k.Period_fit.values)]
s2_5k = s2_5k[~np.isinf(s2_5k.Period_fit.values)]

ahora almacenamos tod esto limpio para futuros usos

In [13]:
s20k.to_pickle("data/o3o4vZ/nonull/s20k.pkl.bz2", compression="bz2")
s5k.to_pickle("data/o3o4vZ/nonull/s5k.pkl.bz2", compression="bz2")
s2_5k.to_pickle("data/o3o4vZ/nonull/s2_5k.pkl.bz2", compression="bz2")

## Ahora normalizamos

In [14]:
no_features = ["id", "vs_catalog", "vs_type", "ra_k", "dec_k", ] 
X_columns = [c for c in s20k.columns if c not in no_features]

In [15]:
# X_columns

In [16]:
splited = {idx+1: list(s) for idx, s in enumerate(np.array_split(X_columns, 19))}
maxs = max(map(len, splited.values()))
for v in splited.values():
    while len(v) < maxs:
        v.append("")

In [17]:
# feats = pd.DataFrame(splited).T
# feats
# print feats.to_latex(index=False)

In [18]:
scaler_20k = StandardScaler()
norm_s20k = s20k.copy()
norm_s20k[X_columns] = scaler_20k.fit_transform(s20k[X_columns])
pickle.dump(scaler_20k, open("data/o3o4vZ/scalers/scaler_20k.pkl", "wb"))
norm_s20k.to_pickle("data/o3o4vZ/scaled/s20k.pkl.bz2", compression="bz2")

  x = um.multiply(x, x, out=x)


In [19]:
scaler_5k = StandardScaler()
norm_s5k = s5k.copy()
norm_s5k[X_columns] = scaler_5k.fit_transform(s5k[X_columns])
pickle.dump(scaler_5k, open("data/o3o4vZ/scalers/scaler_5k.pkl", "wb"))
norm_s5k.to_pickle("data/o3o4vZ/scaled/s5k.pkl.bz2", compression="bz2")

In [20]:
scaler_2_5k = StandardScaler()
norm_s2_5k = s2_5k.copy()
norm_s2_5k[X_columns] = scaler_2_5k.fit_transform(s2_5k[X_columns])
pickle.dump(scaler_2_5k, open("data/o3o4vZ/scalers/scaler_2_5k.pkl", "wb"))
norm_s2_5k.to_pickle("data/o3o4vZ/scaled/s2_5k.pkl.bz2", compression="bz2")

In [21]:
rows = {k: {"Tile": k} for k in "b261 b262 b263 b264 b278 b277 b247 b248 b234".split()}
    
for nombre, s in zip(["Chica", "Mediana", "Grande"] , [s2_5k,s5k,s20k]):
    s = s.copy()
    s["tile"] = s["id"].apply(lambda i: "b" + str(i)[1:4])
    grouped = s.groupby("tile")
    data = {k: grouped.get_group(k).copy() for k in grouped.groups.keys()}
    for tile, df in data.items():
        row = rows[tile]
        row.update({
            nombre : len(df[df.vs_type.str.contains("")]),
            "RR-Lyrae": len(df[df.vs_type.str.startswith("RRLyr")]),
        })
        



In [22]:
s = s20k.copy()
s["tile"] = s["id"].apply(lambda i: "b" + str(i)[1:4])

In [23]:
s.tile.unique()

array(['b262', 'b263', 'b261', 'b264', 'b278', 'b247', 'b248', 'b277',
       'b234'], dtype=object)

In [24]:
for r in rows.values():
    print r

{'Mediana': 4070, 'Chica': 6568, 'Grande': 21557, 'Tile': 'b247', 'RR-Lyrae': 192}
{'Mediana': 4521, 'Chica': 7017, 'Grande': 21997, 'Tile': 'b248', 'RR-Lyrae': 222}
{'Mediana': 5465, 'Chica': 7957, 'Grande': 22929, 'Tile': 'b262', 'RR-Lyrae': 318}
{'Mediana': 5572, 'Chica': 8071, 'Grande': 23067, 'Tile': 'b263', 'RR-Lyrae': 319}
{'Mediana': 5407, 'Chica': 7902, 'Grande': 22883, 'Tile': 'b261', 'RR-Lyrae': 253}
{'Mediana': 5699, 'Chica': 8198, 'Grande': 23196, 'Tile': 'b264', 'RR-Lyrae': 312}
{'Mediana': 3281, 'Chica': 5779, 'Grande': 20777, 'Tile': 'b234', 'RR-Lyrae': 126}
{'Mediana': 6463, 'Chica': 8964, 'Grande': 23896, 'Tile': 'b277', 'RR-Lyrae': 434}
{'Mediana': 7058, 'Chica': 9552, 'Grande': 24509, 'Tile': 'b278', 'RR-Lyrae': 441}


In [25]:
print pd.DataFrame(
    [v for v in rows.values()]
)[
    "Tile Chica Mediana Grande RR-Lyrae".split()
].sort_values("Tile").to_latex(index=False)

\begin{tabular}{lrrrr}
\toprule
 Tile &  Chica &  Mediana &  Grande &  RR-Lyrae \\
\midrule
 b234 &   5779 &     3281 &   20777 &       126 \\
 b247 &   6568 &     4070 &   21557 &       192 \\
 b248 &   7017 &     4521 &   21997 &       222 \\
 b261 &   7902 &     5407 &   22883 &       253 \\
 b262 &   7957 &     5465 &   22929 &       318 \\
 b263 &   8071 &     5572 &   23067 &       319 \\
 b264 &   8198 &     5699 &   23196 &       312 \\
 b277 &   8964 &     6463 &   23896 &       434 \\
 b278 &   9552 &     7058 &   24509 &       441 \\
\bottomrule
\end{tabular}

