In [1]:
#Run with python 2.7
import numpy as np
import shutil
import os
import glob
import pandas as pd
import matplotlib.pylab as plt
import pickle
import sys
% matplotlib inline

from decimal import Decimal, ROUND_HALF_UP
from george import kernels
import george

# Root directory of the project
ROOT_DIR = os.path.abspath("../code")
# Import code
sys.path.append(ROOT_DIR)

from hierarchicalmodel import *
from rasle import *
from preprocessing import *
from GPFit import *

# Global matplotlib styling: render text through LaTeX and use a Palatino serif font.
plt.rc('text', usetex=True)
plt.rc('font', family='serif', serif=['Palatino'])

# Shared figure size and font size for the plots in this notebook.
figSize, fontSize = (12, 8), 20



In [2]:
# Notebook configuration -- this section needs to be reviewed before each run.
# NOTE(review): the `features` list in the data-preparation cell has six
# entries while nFeatures is 7 -- confirm what the seventh feature is.
nFeatures = 7  # Number of features considered

# Catalogue labels of the individual variable-star types.  The names in the
# trailing comments are the variable prefixes used for each label in the
# data-preparation cell.
true_class_1 = 1    # RRab
true_class_2 = 2    # RRc
true_class_3 = 3    # RRd
true_class_4 = 4    # blazhko
true_class_5 = 5    # contact_Bi
true_class_6 = 6    # semi_det_Bi
true_class_7 = 7    # rot
true_class_8 = 8    # LPV
true_class_9 = 9    # delta_scuti
true_class_10 = 10  # ACEP
true_class_11 = 11  # removed from the catalogue before processing
true_class_12 = 12  # cep_ii
true_class_13 = 13  # removed from the catalogue before processing

# Labels assigned to the aggregated classes of the hierarchical layers.
eclipsing_label = 20
rotational_label = 21
pulsating_label = 22
RR_Lyrae_label = 23
LPV_label = 24
delta_scuti_label = 25
cepheids_label = 26




### DATA AUGMENTATION AND FEATURE EXTRACTION

In [3]:
# Switch for the (expensive) augmentation / feature-extraction stage below.
data_preparation = True

# Folders holding the phase-folded light curves of the two splits.
data_dir_train = '../data/training_phase_lc/'
data_dir_test  = '../data/test_phase_lc/'

# Load the photometry catalogue (whitespace-delimited, no header row) to get
# the file names and their respective classes.
ascii_data = pd.read_csv(
    '../data/catalogue/Ascii_SSS_Per_Table.txt',
    delim_whitespace=True,
    names=["SSS_ID", "File_Name", "RA", "Dec", "Period", "V_CSS",
           "Npts", "V_amp", "Type", "Prior_ID", "No_Name1", 'No_Name2'])

# Keep only the file name and the star type.
ascii_files = ascii_data[['File_Name', 'Type']]

# Remove classes 11 and 13 from the file list.
# NOTE(review): the original note here also described downsampling class 5
# from 18000 to 4509 objects; no downsampling is performed in this cell --
# presumably it happens elsewhere, confirm.
type_11 = ascii_files[ascii_files.Type == true_class_11]
type_13 = ascii_files[ascii_files.Type == true_class_13]
ascii_files = ascii_files.drop(type_11.index).drop(type_13.index)

# Period look-up tables built from the full catalogue (classes 11 and 13 are
# still present here).
#   period_data         -> passed to both the training-set augmentation and
#                          the test-set feature extraction
#   periods             -> used for the test-set periods
#   update_ascii_period -> used for the training-set periods; file names carry
#                          a '_2' suffix (presumably matching the names of the
#                          augmented light curves -- confirm)
period_data = ascii_data[['Period', 'File_Name', 'Type']]
periods = ascii_data[['File_Name', 'Period']]

update_ascii_period = pd.DataFrame(
    periods.File_Name.astype(str) + '_2')
update_ascii_period['Period'] = periods.Period

def _augment_and_extract_layer(layer_dir, training_data, testing_data,
                               training_counts, n_augmentation):
    """Run augmentation and feature extraction for one layer of the hierarchy.

    The same three steps are applied to every layer:

    1. ``num_augmentation`` works out how many times each class needs to be
       augmented for the requested ``n_augmentation`` target.
    2. ``GP_augmentation_and_featureExtraction`` augments each class of the
       training set with its respective number of samples and extracts the
       features.
    3. ``GP_feature_extraction_test_set`` extracts the features of the test
       set.

    Shared inputs (``data_dir_train``, ``data_dir_test``, ``period_data``,
    ``periods``, ``update_ascii_period``, ``data_columns`` and ``features``)
    are read from the notebook namespace, so they must be defined before this
    helper is called.

    Parameters
    ----------
    layer_dir : str
        Sub-folder of ``../data/GP/HC/`` for this layer; results go to its
        ``training_set/`` and ``test_set/`` folders.
    training_data, testing_data
        Training / test data of the layer, as returned by the layer-building
        function (``first_layer``, ``second_layer_EB``, ...).
    training_counts
        Per-class training counts returned by the same layer-building function.
    n_augmentation : int
        Value forwarded as ``nAugmentation`` to ``num_augmentation``.

    Returns
    -------
    tuple
        ``(number_of_samples, augmentation_data, feature_file_train,
        feature_file_test)``.
    """
    save_root = '../data/GP/HC/' + layer_dir

    # Number of times each class needs to be augmented.
    number_of_samples = num_augmentation(nAugmentation=n_augmentation,
                                         y_training_counts=training_counts)
    print(number_of_samples)

    # Augment each class with its respective number of samples and extract
    # the training-set features.
    augmentation_data, feature_file_train = GP_augmentation_and_featureExtraction(
        data_dir=data_dir_train,
        data_=training_data,
        data_columns=data_columns,
        period_data=period_data,
        features=features,
        update_ascii_period=update_ascii_period,
        number_of_samples=number_of_samples,
        save_folder_training=save_root + '/training_set/')

    # Extract the test-set features of every class in this layer.
    feature_file_test = GP_feature_extraction_test_set(
        data_dir=data_dir_test,
        period_data=period_data,
        periods=periods,
        X_testing=testing_data,
        data_columns=data_columns,
        features=features,
        save_folder_test=save_root + '/test_set/')

    return number_of_samples, augmentation_data, feature_file_train, feature_file_test


if data_preparation:
    X_training = phase_data_sets(data_dir_train, ascii_files)
    X_testing  = phase_data_sets(data_dir_test, ascii_files)

    data_columns = ['magnitude', 'time']
    features     = ['Skew', 'Mean', 'Std', 'SmallKurtosis', 'Amplitude', 'Meanvariance']

    # Per-class subsets of the training set, selected on the 'Type' column.
    RRab_train        = stars_label(X_training, true_class_1, column_name='Type')
    RRc_train         = stars_label(X_training, true_class_2, column_name='Type')
    RRd_train         = stars_label(X_training, true_class_3, column_name='Type')
    blazhko_train     = stars_label(X_training, true_class_4, column_name='Type')
    contact_Bi_train  = stars_label(X_training, true_class_5, column_name='Type')
    semi_det_Bi_train = stars_label(X_training, true_class_6, column_name='Type')
    rot_train         = stars_label(X_training, true_class_7, column_name='Type')
    LPV_train         = stars_label(X_training, true_class_8, column_name='Type')
    delta_scuti_train = stars_label(X_training, true_class_9, column_name='Type')
    ACEP_train        = stars_label(X_training, true_class_10, column_name='Type')
    cep_ii_train      = stars_label(X_training, true_class_12, column_name='Type')

    # Per-class subsets of the test set, selected on the 'Type' column.
    RRab_test        = stars_label(X_testing, true_class_1, column_name='Type')
    RRc_test         = stars_label(X_testing, true_class_2, column_name='Type')
    RRd_test         = stars_label(X_testing, true_class_3, column_name='Type')
    blazhko_test     = stars_label(X_testing, true_class_4, column_name='Type')
    contact_Bi_test  = stars_label(X_testing, true_class_5, column_name='Type')
    semi_det_Bi_test = stars_label(X_testing, true_class_6, column_name='Type')
    rot_test         = stars_label(X_testing, true_class_7, column_name='Type')
    LPV_test         = stars_label(X_testing, true_class_8, column_name='Type')
    delta_scuti_test = stars_label(X_testing, true_class_9, column_name='Type')
    ACEP_test        = stars_label(X_testing, true_class_10, column_name='Type')
    cep_ii_test      = stars_label(X_testing, true_class_12, column_name='Type')

    # -------------------------------------------------------------------------
    # FIRST LAYER: eclipsing / rotational / pulsating
    # -------------------------------------------------------------------------
    training_data_FL, testing_data_FL, y_FL_training_counts = first_layer(
        contact_Bi_train, semi_det_Bi_train, rot_train, RRab_train, RRc_train,
        RRd_train, blazhko_train, LPV_train, delta_scuti_train, ACEP_train,
        cep_ii_train,
        contact_Bi_test, semi_det_Bi_test, rot_test, RRab_test, RRc_test,
        RRd_test, blazhko_test, LPV_test, delta_scuti_test, ACEP_test,
        cep_ii_test,
        eclipsing_label, rotational_label, pulsating_label)

    ns_FL, augmentation_data, feature_file, feature_file_testSet = \
        _augment_and_extract_layer('layer1_EclRotPul',
                                   training_data_FL, testing_data_FL,
                                   y_FL_training_counts, n_augmentation=17000)

    # -------------------------------------------------------------------------
    # SECOND LAYER: ECLIPSING BINARY
    # -------------------------------------------------------------------------
    training_data_SL_EB, testing_data_SL_EB, y_SL_EB_training_counts = second_layer_EB(
        contact_Bi_train, semi_det_Bi_train,
        contact_Bi_test, semi_det_Bi_test,
        true_class_5, true_class_6)

    ns_SL_EB, augmentation_data_SL_EB, feature_file_SL_EB, feature_file_testSet_SL_EB = \
        _augment_and_extract_layer('layer2_EB',
                                   training_data_SL_EB, testing_data_SL_EB,
                                   y_SL_EB_training_counts, n_augmentation=10000)

    # -------------------------------------------------------------------------
    # SECOND LAYER: RR LYRAE / LPV / CEPHEIDS / DELTA SCUTI
    # -------------------------------------------------------------------------
    training_data_SL_RLCD, testing_data_SL_RLCD, y_SL_RLCD_training_counts = second_layer_RLCD(
        RRab_train, RRc_train, RRd_train, blazhko_train, LPV_train,
        ACEP_train, cep_ii_train, delta_scuti_train,
        RRab_test, RRc_test, RRd_test, blazhko_test, LPV_test,
        ACEP_test, cep_ii_test, delta_scuti_test,
        RR_Lyrae_label, LPV_label, cepheids_label, delta_scuti_label)

    ns_SL_RLCD, augmentation_data_SL_RLCD, feature_file_SL_RLCD, feature_file_testSet_SL_RLCD = \
        _augment_and_extract_layer('layer2_RLCD',
                                   training_data_SL_RLCD, testing_data_SL_RLCD,
                                   y_SL_RLCD_training_counts, n_augmentation=17000)

    # -------------------------------------------------------------------------
    # THIRD LAYER RR LYRAE: RRab, RRc, RRd, Blazhko
    # -------------------------------------------------------------------------
    training_data_TL_RRLyrae, testing_data_TL_RRLyrae, y_TL_RRLyrae_training_counts = third_layer_RRLyrae(
        RRab_train, RRc_train, RRd_train, blazhko_train,
        RRab_test, RRc_test, RRd_test, blazhko_test,
        true_class_1, true_class_2, true_class_3, true_class_4)

    ns_TL_RRLyrae, augmentation_data_TL_RRLyrae, feature_file_TL_RRLyrae, feature_file_testSet_TL_RRLyrae = \
        _augment_and_extract_layer('layer3_RRLyrae',
                                   training_data_TL_RRLyrae, testing_data_TL_RRLyrae,
                                   y_TL_RRLyrae_training_counts, n_augmentation=10000)

    # -------------------------------------------------------------------------
    # THIRD LAYER Cepheids: ACEP and Cep-II
    # -------------------------------------------------------------------------
    training_data_TL_cep, testing_data_TL_cep, y_TL_cep_training_counts = third_layer_Cepheids(
        ACEP_train, cep_ii_train,
        ACEP_test, cep_ii_test,
        true_class_10, true_class_12)

    # The augmentation counts of this layer are now printed like those of
    # every other layer (the print was missing here before).
    ns_TL_cep, augmentation_data_TL_cep, feature_file_TL_cep, feature_file_testSet_TL_cep = \
        _augment_and_extract_layer('layer3_Cepheids',
                                   training_data_TL_cep, testing_data_TL_cep,
                                   y_TL_cep_training_counts, n_augmentation=5000)



ACEP train has (107, 2)
Cep-II train has (107, 2)
ACEP test has (46, 2)
Cep-II test has (46, 2)
The number of sample in Class 0 is 107 and is now augmented by 2 times. The augmented samples are 214
The number of sample in Class 1 is 107 and is now augmented by 2 times. The augmented samples are 214
