# Escalado

In [1]:
import pickle

In [2]:
import numpy as np

import pandas as pd

% matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set()

from sklearn.preprocessing import StandardScaler

In [3]:
# Load the three pickled OGLE-III samples (the file names suggest sample sizes
# of 20000, 5000 and 2500 rows).
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
s20k, s5k, s2_5k = (
    pd.read_pickle(sample_path)
    for sample_path in (
        "data/ogle3_only/sample_ogle3_20000.pkl",
        "data/ogle3_only/sample_ogle3_5000.pkl",
        "data/ogle3_only/sample_ogle3_2500.pkl",
    )
)

## Removemos los features que no queremos


In [4]:
# Columns removed from every sample before scaling.
to_drop = [
    "Gskew",  # has NaN in the RR Lyrae stars
    "scls_h", "scls_j", "scls_k",  # not useful for this analysis
]
# Only the first period is important: drop every Freq2_* / Freq3_* column.
to_drop += [s for s in s20k.columns if s.startswith(("Freq2_", "Freq3_"))]
to_keep = [c for c in s20k.columns if c not in to_drop]

# The kept-column list is derived from s20k and applied to all three samples
# (assumes the three samples share the same columns -- TODO confirm).
s20k = s20k[to_keep]
s5k = s5k[to_keep]
s2_5k = s2_5k[to_keep]  # previously an accidental `s2_5k = s2_5k = ...` chain

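Removemos las filas que tengan algún NaN (ojo: `dropna()` elimina las filas con NaN en cualquier columna, no solo en `Period_fit`), pero antes nos fijamos que ninguna de las filas con NaN en `Period_fit` sea una RR-Lyrae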

In [5]:
# For each sample (20k, 5k, 2.5k, in that order) print the distinct vs_type
# values found among the rows whose Period_fit is NaN.
for sample in (s20k, s5k, s2_5k):
    null_period_rows = sample[sample["Period_fit"].isnull()]
    print(null_period_rows["vs_type"].unique())

['']
['']
['']


In [6]:
# Discard incomplete rows from every sample. The arguments spell out dropna()'s
# defaults: a row is removed when ANY of its columns is NaN, not only Period_fit.
s20k = s20k.dropna(axis=0, how="any")
s5k = s5k.dropna(axis=0, how="any")
s2_5k = s2_5k.dropna(axis=0, how="any")

Evaluamos qué columnas contienen algún valor infinito

In [7]:
# Print the name of every non-object column that holds an infinite value in at
# least one of the three samples.
for column in s20k.columns:
    # Object-dtype columns are skipped; the dtype is checked on s20k only.
    if s20k[column].dtype != object:
        inf_count = sum(
            np.isinf(sample[column].values).sum()
            for sample in (s20k, s5k, s2_5k)
        )
        if inf_count:
            print(column)

Period_fit


Como `Period_fit` es un feature que me interesa, verificamos qué cantidad de filas están afectadas
y si alguna de ellas corresponde a una RR-Lyrae

In [8]:
# Rows with an infinite Period_fit, per sample, in the order 2.5k, 5k, 20k.
inf_period_rows = [
    sample[np.isinf(sample["Period_fit"].values)]
    for sample in (s2_5k, s5k, s20k)
]

# First report how many rows are affected in each sample ...
for rows in inf_period_rows:
    print(rows.shape)

# ... then which vs_type values those rows carry.
for rows in inf_period_rows:
    print(rows["vs_type"].unique())

(2, 62)
(6, 62)
(26, 62)
['']
['']
['']


Son pocas filas y ninguna es una RR-Lyrae, así que las borramos

In [9]:
# Keep only the rows whose Period_fit is not +/-inf in each sample.
s20k = s20k[np.logical_not(np.isinf(s20k["Period_fit"].values))]
s5k = s5k[np.logical_not(np.isinf(s5k["Period_fit"].values))]
s2_5k = s2_5k[np.logical_not(np.isinf(s2_5k["Period_fit"].values))]

## Ahora normalizamos

In [10]:
# Columns that are not treated as features.
no_features = ["id", "vs_catalog", "vs_type", "ra_k", "dec_k"]
# Every other column of s20k, in its original order, is a feature.
X_columns = s20k.columns[~s20k.columns.isin(no_features)].tolist()
# Single concatenated string: the feature list follows the tab with no extra space.
print("Features:\n\t" + ", ".join(X_columns))

Features:
	cnt, Amplitude, AndersonDarling, Autocor_length, Beyond1Std, CAR_mean, CAR_sigma, CAR_tau, Con, Eta_e, FluxPercentileRatioMid20, FluxPercentileRatioMid35, FluxPercentileRatioMid50, FluxPercentileRatioMid65, FluxPercentileRatioMid80, Freq1_harmonics_amplitude_0, Freq1_harmonics_amplitude_1, Freq1_harmonics_amplitude_2, Freq1_harmonics_amplitude_3, Freq1_harmonics_rel_phase_0, Freq1_harmonics_rel_phase_1, Freq1_harmonics_rel_phase_2, Freq1_harmonics_rel_phase_3, LinearTrend, MaxSlope, Mean, Meanvariance, MedianAbsDev, MedianBRP, PairSlopeTrend, PercentAmplitude, PercentDifferenceFluxPercentile, PeriodLS, Period_fit, Psi_CS, Psi_eta, Q31, Rcs, Skew, SmallKurtosis, Std, StetsonK, c89_jk_color, c89_hk_color, c89_jh_color, n09_jk_color, n09_hk_color, n09_jh_color, c89_m2, c89_m4, c89_c3, n09_m2, n09_m4, n09_c3, AmplitudeH, AmplitudeJ, ppmb


In [12]:
# Standardize the 20k sample: fit a StandardScaler on its feature columns and
# write the scaled values into a copy, leaving s20k itself unchanged.
scaler_20k = StandardScaler()
norm_s20k = s20k.copy()
norm_s20k[X_columns] = scaler_20k.fit_transform(s20k[X_columns])
# Persist the fitted scaler; the `with` block closes the file deterministically
# (the previous bare open() call never closed its handle explicitly).
with open("data/ogle3_only/scalers/scaler_20k.pkl", "wb") as scaler_file:
    pickle.dump(scaler_20k, scaler_file)
# Persist the scaled sample.
norm_s20k.to_pickle("data/ogle3_only/scaled/s20k.pkl")

In [13]:
# Standardize the 5k sample: fit a StandardScaler on its feature columns and
# write the scaled values into a copy, leaving s5k itself unchanged.
scaler_5k = StandardScaler()
norm_s5k = s5k.copy()
norm_s5k[X_columns] = scaler_5k.fit_transform(s5k[X_columns])
# Persist the fitted scaler; the `with` block closes the file deterministically
# (the previous bare open() call never closed its handle explicitly).
with open("data/ogle3_only/scalers/scaler_5k.pkl", "wb") as scaler_file:
    pickle.dump(scaler_5k, scaler_file)
# Persist the scaled sample.
norm_s5k.to_pickle("data/ogle3_only/scaled/s5k.pkl")

In [14]:
# Standardize the 2.5k sample: fit a StandardScaler on its feature columns and
# write the scaled values into a copy, leaving s2_5k itself unchanged.
scaler_2_5k = StandardScaler()
norm_s2_5k = s2_5k.copy()
norm_s2_5k[X_columns] = scaler_2_5k.fit_transform(s2_5k[X_columns])
# Persist the fitted scaler; the `with` block closes the file deterministically
# (the previous bare open() call never closed its handle explicitly).
with open("data/ogle3_only/scalers/scaler_2_5k.pkl", "wb") as scaler_file:
    pickle.dump(scaler_2_5k, scaler_file)
# Persist the scaled sample.
norm_s2_5k.to_pickle("data/ogle3_only/scaled/s2_5k.pkl")