# Escalado

In [1]:
import pickle

In [2]:
import numpy as np

import pandas as pd

% matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set()

from sklearn.preprocessing import StandardScaler

In [3]:
# Load the three bz2-compressed pickles, one per sample size.
s2_5k, s5k, s20k = map(pd.read_pickle, (
    "data/o3o4vZ/s2_5K.pkl.bz2",
    "data/o3o4vZ/s5K.pkl.bz2",
    "data/o3o4vZ/s20K.pkl.bz2",
))

## Removemos todo lo que no sea RRLyrae o desconocido

In [4]:
def filter_only_rr_unk(df):
    """Return the rows of ``df`` whose ``vs_type`` is empty or starts with ``RRLyr-``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``vs_type``.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` selected by a boolean mask; the index is kept.
    """
    is_unknown = df.vs_type == ""
    is_rrlyrae = df.vs_type.str.startswith('RRLyr-')
    return df[is_unknown | is_rrlyrae]

# Restrict every sample to RR Lyrae and unknown-type sources.
s2_5k, s5k, s20k = map(filter_only_rr_unk, (s2_5k, s5k, s20k))

## Removemos los features que no queremos


In [5]:
# Columns excluded from all three samples.
to_drop = [
    'Gskew',  # has NaN in the RR Lyrae stars
    "scls_h", "scls_j", "scls_k",  # not useful for us
] + [
    # only the first period is important
    c for c in s20k.columns if c.startswith(("Freq2_", "Freq3_"))
]
# The kept columns follow the column order of s20k.
to_keep = [c for c in s20k.columns if c not in to_drop]

s20k = s20k[to_keep]
s5k = s5k[to_keep]
s2_5k = s2_5k[to_keep]

# Distinct labels built as "b" + characters 1-3 of each id (shown as the cell output).
s20k["id"].apply(lambda i: "b" + str(i)[1:4]).unique()

array(['b262', 'b263', 'b261', 'b264', 'b278', 'b247', 'b248', 'b277',
       'b234'], dtype=object)

Removemos las filas que tengan un NaN en `Period_fit`, pero antes verificamos que ninguna sea una RR-Lyrae.

In [6]:
# Show which vs_type values occur among rows with a null Period_fit, per sample.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(s20k[s20k.Period_fit.isnull()].vs_type.unique())
print(s5k[s5k.Period_fit.isnull()].vs_type.unique())
print(s2_5k[s2_5k.Period_fit.isnull()].vs_type.unique())

['']
['']
['']


In [7]:
# dropna() with default arguments discards every row that holds a NaN in
# *any* column, not only in Period_fit.
# NOTE(review): only Period_fit nulls were checked against vs_type above --
# confirm no RR Lyrae row is lost through a NaN in another column.
s20k, s5k, s2_5k = (sample.dropna() for sample in (s20k, s5k, s2_5k))

In [8]:
# Distinct labels ("b" + characters 1-3 of each id) still present after dropping NaN rows.
tile_labels = s20k["id"].apply(lambda ident: "b" + str(ident)[1:4])
tile_labels.unique()

array(['b262', 'b263', 'b261', 'b264', 'b278', 'b247', 'b248', 'b277',
       'b234'], dtype=object)

Evaluamos qué columnas tienen algún valor infinito

In [9]:
# Print the name of every non-object column that contains at least one
# infinite value in any of the three samples.
for x in s20k.columns:
    if s20k[x].dtype == object:
        continue  # skip object (string) columns; np.isinf does not accept them
    if any(np.isinf(df[x].values).any() for df in (s20k, s5k, s2_5k)):
        print(x)

Period_fit


Como `Period_fit` es un feature que me interesa, verificamos qué cantidad de filas son las afectadas
y si hay alguna RR-Lyrae entre ellas

In [10]:
# Shape of the subset of rows whose Period_fit is infinite, per sample.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(s2_5k[np.isinf(s2_5k.Period_fit.values)].shape)
print(s5k[np.isinf(s5k.Period_fit.values)].shape)
print(s20k[np.isinf(s20k.Period_fit.values)].shape)

# vs_type values found among those rows, per sample.
print(s2_5k[np.isinf(s2_5k.Period_fit.values)].vs_type.unique())
print(s5k[np.isinf(s5k.Period_fit.values)].vs_type.unique())
print(s20k[np.isinf(s20k.Period_fit.values)].vs_type.unique())

(5, 64)
(5, 64)
(36, 64)
['']
['']
['']


Son pocas filas y no hay RR-Lyraes entre ellas, así que las borramos.

In [11]:
# Keep only the rows whose Period_fit is not infinite.
s20k = s20k[np.logical_not(np.isinf(s20k["Period_fit"].values))]
s5k = s5k[np.logical_not(np.isinf(s5k["Period_fit"].values))]
s2_5k = s2_5k[np.logical_not(np.isinf(s2_5k["Period_fit"].values))]

Ahora almacenamos todo esto limpio para futuros usos.

In [12]:
# Write each cleaned sample to its bz2-compressed pickle under nonull/.
for frame, target in (
    (s20k, "data/o3o4vZ/nonull/s20k.pkl.bz2"),
    (s5k, "data/o3o4vZ/nonull/s5k.pkl.bz2"),
    (s2_5k, "data/o3o4vZ/nonull/s2_5k.pkl.bz2"),
):
    frame.to_pickle(target, compression="bz2")

## Ahora normalizamos

In [13]:
# Columns that are not scaled; every other column of s20k goes into X_columns.
no_features = ["id", "vs_catalog", "vs_type", "ra_k", "dec_k"]
X_columns = [col for col in s20k.columns if col not in no_features]

In [14]:
# X_columns

In [15]:
# Split the feature names into 19 groups keyed 1..19, then pad the shorter
# groups with empty strings so that all of them have the same length.
chunks = np.array_split(X_columns, 19)
splited = {pos: list(chunk) for pos, chunk in enumerate(chunks, 1)}
maxs = max(len(group) for group in splited.values())
for group in splited.values():
    group.extend([""] * (maxs - len(group)))

In [16]:
# feats = pd.DataFrame(splited).T
# feats
# print feats.to_latex(index=False)

In [17]:
# Fit a scaler on the feature columns of s20k and store the scaled values in
# a copy, leaving the non-feature columns and s20k itself untouched.
scaler_20k = StandardScaler()
norm_s20k = s20k.copy()
norm_s20k[X_columns] = scaler_20k.fit_transform(s20k[X_columns])
# Persist the fitted scaler; the with-block guarantees the file is closed.
with open("data/o3o4vZ/scalers/scaler_20k.pkl", "wb") as fp:
    pickle.dump(scaler_20k, fp)
norm_s20k.to_pickle("data/o3o4vZ/scaled/s20k.pkl.bz2", compression="bz2")

In [18]:
# Fit a scaler on the feature columns of s5k and store the scaled values in
# a copy, leaving the non-feature columns and s5k itself untouched.
scaler_5k = StandardScaler()
norm_s5k = s5k.copy()
norm_s5k[X_columns] = scaler_5k.fit_transform(s5k[X_columns])
# Persist the fitted scaler; the with-block guarantees the file is closed.
with open("data/o3o4vZ/scalers/scaler_5k.pkl", "wb") as fp:
    pickle.dump(scaler_5k, fp)
norm_s5k.to_pickle("data/o3o4vZ/scaled/s5k.pkl.bz2", compression="bz2")

In [19]:
# Fit a scaler on the feature columns of s2_5k and store the scaled values in
# a copy, leaving the non-feature columns and s2_5k itself untouched.
scaler_2_5k = StandardScaler()
norm_s2_5k = s2_5k.copy()
norm_s2_5k[X_columns] = scaler_2_5k.fit_transform(s2_5k[X_columns])
# Persist the fitted scaler; the with-block guarantees the file is closed.
with open("data/o3o4vZ/scalers/scaler_2_5k.pkl", "wb") as fp:
    pickle.dump(scaler_2_5k, fp)
norm_s2_5k.to_pickle("data/o3o4vZ/scaled/s2_5k.pkl.bz2", compression="bz2")

In [20]:
# One summary dict per tile, keyed by tile label and pre-seeded with the label.
rows = {k: {"Tile": k} for k in "b261 b262 b263 b264 b278 b277 b247 b248 b234".split()}

for nombre, sample in zip(["Chica", "Mediana", "Grande"], [s2_5k, s5k, s20k]):
    # Work on a copy so that the tile column is not added to the sample itself.
    tagged = sample.copy()
    tagged["tile"] = tagged["id"].apply(lambda i: "b" + str(i)[1:4])
    for tile, df in tagged.groupby("tile"):
        rows[tile].update({
            # Total number of rows of this sample that fall in the tile.
            nombre: len(df),
            # Overwritten on every pass, so the stored value comes from the
            # last sample processed (s20k).
            "RR-Lyrae": len(df[df.vs_type.str.startswith("RRLyr")]),
        })
        



In [21]:
# New frame: s20k plus a "tile" column ("b" + characters 1-3 of each id).
s = s20k.assign(tile=s20k["id"].apply(lambda ident: "b" + str(ident)[1:4]))

In [22]:
# Distinct tile labels found in the tagged frame.
s["tile"].unique()

array(['b262', 'b263', 'b261', 'b264', 'b278', 'b247', 'b248', 'b277',
       'b234'], dtype=object)

In [23]:
# Dump each per-tile summary dict.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
for r in rows.values():
    print(r)

{'Mediana': 5188, 'Chica': 2690, 'Grande': 20177, 'Tile': 'b247', 'RR-Lyrae': 192}
{'Mediana': 5217, 'Chica': 2721, 'Grande': 20197, 'Tile': 'b248', 'RR-Lyrae': 222}
{'Mediana': 5307, 'Chica': 2815, 'Grande': 20279, 'Tile': 'b262', 'RR-Lyrae': 318}
{'Mediana': 5317, 'Chica': 2818, 'Grande': 20313, 'Tile': 'b263', 'RR-Lyrae': 319}
{'Mediana': 5246, 'Chica': 2751, 'Grande': 20227, 'Tile': 'b261', 'RR-Lyrae': 253}
{'Mediana': 5310, 'Chica': 2811, 'Grande': 20308, 'Tile': 'b264', 'RR-Lyrae': 312}
{'Mediana': 5123, 'Chica': 2625, 'Grande': 20121, 'Tile': 'b234', 'RR-Lyrae': 126}
{'Mediana': 5425, 'Chica': 2924, 'Grande': 20357, 'Tile': 'b277', 'RR-Lyrae': 434}
{'Mediana': 5424, 'Chica': 2930, 'Grande': 20381, 'Tile': 'b278', 'RR-Lyrae': 441}


In [24]:
# Build the per-tile table with a fixed column order, sort it by tile label
# and print it as LaTeX without the index column.
summary = pd.DataFrame(list(rows.values()))[
    "Tile Chica Mediana Grande RR-Lyrae".split()
].sort_values("Tile")
print(summary.to_latex(index=False))

\begin{tabular}{lrrrr}
\toprule
 Tile &  Chica &  Mediana &  Grande &  RR-Lyrae \\
\midrule
 b234 &   2625 &     5123 &   20121 &       126 \\
 b247 &   2690 &     5188 &   20177 &       192 \\
 b248 &   2721 &     5217 &   20197 &       222 \\
 b261 &   2751 &     5246 &   20227 &       253 \\
 b262 &   2815 &     5307 &   20279 &       318 \\
 b263 &   2818 &     5317 &   20313 &       319 \\
 b264 &   2811 &     5310 &   20308 &       312 \\
 b277 &   2924 &     5425 &   20357 &       434 \\
 b278 &   2930 &     5424 &   20381 &       441 \\
\bottomrule
\end{tabular}

