# Escalado

In [2]:
import pickle

In [3]:
import numpy as np

import pandas as pd

% matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set()

from sklearn.preprocessing import StandardScaler

In [4]:
# Load the three OGLE-3 samples (20000, 2500 and 5000) from their
# bz2-compressed pickles.
# NOTE: unpickling runs arbitrary code; only load files from a trusted source.
s20k, s2_5k, s5k = (
    pd.read_pickle(sample_path)
    for sample_path in (
        "data/ogle3_only/sample_ogle3_20000.pkl.bz2",
        "data/ogle3_only/sample_ogle3_2500.pkl.bz2",
        "data/ogle3_only/sample_ogle3_5000.pkl.bz2",
    )
)

## Removemos los features que no queremos


In [5]:
# Add two derived features to every sample, in place:
#   AmplitudeJH = AmplitudeJ - AmplitudeH
#   AmplitudeJK = AmplitudeJ - Amplitude
# NOTE(review): `Amplitude` (no band suffix) is presumably the K-band
# amplitude -- TODO confirm against the feature extraction.
for sample in (s20k, s5k, s2_5k):
    sample["AmplitudeJH"] = sample["AmplitudeJ"] - sample["AmplitudeH"]
    sample["AmplitudeJK"] = sample["AmplitudeJ"] - sample["Amplitude"]

In [6]:
# Columns excluded from the analysis.
to_drop = [
    "Gskew",  # has NaN in the RR-Lyrae stars
    "scls_h", "scls_j", "scls_k",  # not useful for us
    "AndersonDarling", "StetsonJ", "StetsonK",
]
# Only the first period is important: also discard every Freq2_* / Freq3_* column.
to_drop += [
    name for name in s20k.columns if name.startswith(("Freq2_", "Freq3_"))
]

# Everything else is kept, in the column order of the 20k sample.
to_keep = [name for name in s20k.columns if name not in to_drop]

# Apply the same column selection to the three samples.
s20k = s20k[to_keep]
s5k = s5k[to_keep]
s2_5k = s2_5k[to_keep]

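Removemos las filas que tengan un NaN en `Period_fit`, pero antes verificamos que ninguna sea una RR-Lyrae (nota: el `dropna()` de abajo elimina las filas con NaN en cualquier columna, no solo en `Period_fit`).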

In [7]:
# For each sample (20k, 5k, 2.5k -- one output line each), show the distinct
# vs_type values among the rows whose Period_fit is null.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
for sample in (s20k, s5k, s2_5k):
    print(sample[sample.Period_fit.isnull()].vs_type.unique())

['']
['']
['']


In [8]:
# Drop every row that contains at least one NaN (axis=0 / how="any" are the
# pandas defaults, spelled out here for clarity).
# NOTE(review): only Period_fit nulls were inspected before this step, but this
# removes rows with a NaN in *any* column -- confirm no RR-Lyrae are lost.
s20k = s20k.dropna(axis=0, how="any")
s5k = s5k.dropna(axis=0, how="any")
s2_5k = s2_5k.dropna(axis=0, how="any")

Evaluamos qué columnas tienen algún valor infinito

In [9]:
# Print the name of every non-object column that holds at least one infinite
# value in any of the three samples.
for col in s20k.columns:
    if s20k[col].dtype == object:
        # Object-dtype columns are skipped; only the rest are tested.
        continue
    n_inf = sum(
        np.isinf(sample[col].values).sum() for sample in (s20k, s5k, s2_5k)
    )
    if n_inf:
        # print(...) with a single argument works on Python 2 and Python 3.
        print(col)

Period_fit


Como `Period_fit` es un feature que me interesa, verificamos qué cantidad de filas están afectadas
y si alguna de ellas corresponde a una RR-Lyrae

In [10]:
# Rows whose Period_fit is infinite, for the 2.5k, 5k and 20k samples
# (in that order). Each selection is computed once and reused below.
inf_rows = [
    frame[np.isinf(frame.Period_fit.values)] for frame in (s2_5k, s5k, s20k)
]

# How many rows are affected in each sample ...
for affected in inf_rows:
    print(affected.shape)

# ... and which vs_type values those rows carry.
# print(...) with a single argument works on Python 2 and Python 3.
for affected in inf_rows:
    print(affected.vs_type.unique())

(2, 62)
(6, 62)
(26, 62)
['']
['']
['']


Son pocas filas y ninguna es una RR-Lyrae, así que las borramos

In [11]:
# Keep only the rows whose Period_fit is not infinite, in each sample.
s20k = s20k[np.logical_not(np.isinf(s20k["Period_fit"].values))]
s5k = s5k[np.logical_not(np.isinf(s5k["Period_fit"].values))]
s2_5k = s2_5k[np.logical_not(np.isinf(s2_5k["Period_fit"].values))]

Ahora almacenamos todo esto limpio para usos futuros

In [11]:
# Persist the cleaned samples as bz2-compressed pickles.
for cleaned, target in (
    (s20k, "data/ogle3_only/nonull/s20k.pkl.bz2"),
    (s5k, "data/ogle3_only/nonull/s5k.pkl.bz2"),
    (s2_5k, "data/ogle3_only/nonull/s2_5k.pkl.bz2"),
):
    cleaned.to_pickle(target, compression="bz2")

## Ahora normalizamos

In [12]:
# Columns that are not treated as features.
no_features = ["id", "vs_catalog", "vs_type", "ra_k", "dec_k"]

# Every remaining column of the 20k sample, in its original order.
X_columns = [name for name in s20k.columns if name not in no_features]

In [24]:
# Split the alphabetically sorted feature names into 15 chunks, keyed 1..15,
# and right-pad the shorter chunks with "" so all have the same length.
chunks = np.array_split(sorted(X_columns), 15)
maxs = max(len(chunk) for chunk in chunks)
splited = {
    idx: list(chunk) + [""] * (maxs - len(chunk))
    for idx, chunk in enumerate(chunks, 1)
}

In [25]:
# One chunk of feature names per row (the dict keys become rows after .T),
# rendered as a LaTeX table without the index column.
feats = pd.DataFrame(splited).T
# print(...) with a single argument works on Python 2 and Python 3.
print(feats.to_latex(index=False))

\begin{tabular}{llll}
\toprule
                           0 &                                1 &                            2 &                            3 \\
\midrule
                   Amplitude &                       AmplitudeH &                   AmplitudeJ &                  AmplitudeJH \\
                 AmplitudeJK &                   Autocor\_length &                   Beyond1Std &                     CAR\_mean \\
                   CAR\_sigma &                          CAR\_tau &                          Con &                        Eta\_e \\
    FluxPercentileRatioMid20 &         FluxPercentileRatioMid35 &     FluxPercentileRatioMid50 &     FluxPercentileRatioMid65 \\
    FluxPercentileRatioMid80 &      Freq1\_harmonics\_amplitude\_0 &  Freq1\_harmonics\_amplitude\_1 &  Freq1\_harmonics\_amplitude\_2 \\
 Freq1\_harmonics\_amplitude\_3 &      Freq1\_harmonics\_rel\_phase\_0 &  Freq1\_harmonics\_rel\_phase\_1 &  Freq1\_harmonics\_rel\_phase\_2 \\
 Freq1\_harmonics\_rel\_phas

In [15]:
# Standardise the feature columns of the 20k sample with a scaler fitted on
# that same sample; the non-feature columns are carried over unchanged.
scaler_20k = StandardScaler()
norm_s20k = s20k.copy()
norm_s20k[X_columns] = scaler_20k.fit_transform(s20k[X_columns])

# Store the fitted scaler; the `with` block guarantees the file is flushed and
# closed even if pickling fails.
with open("data/ogle3_only/scalers/scaler_20k.pkl", "wb") as fp:
    pickle.dump(scaler_20k, fp)

# Store the scaled sample.
norm_s20k.to_pickle("data/ogle3_only/scaled/s20k.pkl.bz2", compression="bz2")

  x = um.multiply(x, x, out=x)


In [16]:
# Standardise the feature columns of the 5k sample with a scaler fitted on
# that same sample; the non-feature columns are carried over unchanged.
scaler_5k = StandardScaler()
norm_s5k = s5k.copy()
norm_s5k[X_columns] = scaler_5k.fit_transform(s5k[X_columns])

# Store the fitted scaler; the `with` block guarantees the file is flushed and
# closed even if pickling fails.
with open("data/ogle3_only/scalers/scaler_5k.pkl", "wb") as fp:
    pickle.dump(scaler_5k, fp)

# Store the scaled sample.
norm_s5k.to_pickle("data/ogle3_only/scaled/s5k.pkl.bz2", compression="bz2")

In [17]:
# Standardise the feature columns of the 2.5k sample with a scaler fitted on
# that same sample; the non-feature columns are carried over unchanged.
scaler_2_5k = StandardScaler()
norm_s2_5k = s2_5k.copy()
norm_s2_5k[X_columns] = scaler_2_5k.fit_transform(s2_5k[X_columns])

# Store the fitted scaler; the `with` block guarantees the file is flushed and
# closed even if pickling fails.
with open("data/ogle3_only/scalers/scaler_2_5k.pkl", "wb") as fp:
    pickle.dump(scaler_2_5k, fp)

# Store the scaled sample.
norm_s2_5k.to_pickle("data/ogle3_only/scaled/s2_5k.pkl.bz2", compression="bz2")

In [18]:
# One summary record per tile, filled in below with the per-sample counts.
rows = {}
for tile_name in "b261 b262 b263 b264 b278".split():
    rows[tile_name] = {"Tile": tile_name}

# (label, sample) pairs, from the smallest sample to the largest.
labelled_samples = [("Chica", s2_5k), ("Mediana", s5k), ("Grande", s20k)]

for nombre, sample in labelled_samples:
    # Work on a copy so the helper "tile" column never reaches the originals.
    tagged = sample.copy()
    # Tile name = "b" + characters 1..3 of the id's string representation.
    tagged["tile"] = tagged["id"].apply(lambda i: "b" + str(i)[1:4])

    for tile, df in tagged.groupby("tile"):
        # The empty pattern matches every string, so this selects all rows
        # with a string vs_type (presumably the whole tile -- TODO confirm).
        matches_any = df.vs_type.str.contains("")
        is_rrlyr = df.vs_type.str.startswith("RRLyr")
        # "RR-Lyrae" is overwritten on every pass, so the value that survives
        # is the one computed from the last sample (s20k).
        rows[tile].update({
            nombre: len(df[matches_any]),
            "RR-Lyrae": len(df[is_rrlyr]),
        })

In [19]:
# Render the per-tile summary as a LaTeX table: fixed column order, sorted by
# tile name, without the index column.
tile_table = pd.DataFrame(list(rows.values()))
table_columns = "Tile Chica Mediana Grande RR-Lyrae".split()
# print(...) with a single argument works on Python 2 and Python 3.
print(tile_table[table_columns].sort_values("Tile").to_latex(index=False))

\begin{tabular}{lrrrr}
\toprule
 Tile &  Chica &  Mediana &  Grande &  RR-Lyrae \\
\midrule
 b261 &   2718 &     5212 &   20193 &       221 \\
 b262 &   2791 &     5288 &   20247 &       296 \\
 b263 &   2805 &     5302 &   20293 &       305 \\
 b264 &   2792 &     5292 &   20289 &       294 \\
 b278 &   2912 &     5406 &   20354 &       423 \\
\bottomrule
\end{tabular}

