In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import logging


def create_logger():
    logger_ = logging.getLogger('main')
    logger_.setLevel(logging.DEBUG)
    fh = logging.FileHandler('simple_lightgbm.log')
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter('[%(levelname)s]%(asctime)s:%(name)s:%(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    # add the handlers to the logger
    logger_.addHandler(fh)
    logger_.addHandler(ch)


def get_logger():
    return logging.getLogger('main')


def lgb_multi_weighted_logloss(y_true, y_preds):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge
    """
    # class_weights taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboone
    
    
    if len(np.unique(y_true)) == 14:
        classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
        class_weight = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}
#     if len(np.unique(y_true)) > 14:
#         classes.append(99)
#         class_weight[99] = 2
    
    # Galaxy Case
    if len(np.unique(y_true)) == 5:
        classes = [6, 16, 53, 65, 92]
        class_weight = {6: 1, 16: 1, 53: 1, 65: 1, 92: 1}
        
    # Out of Galaxy Case
    if len(np.unique(y_true)) == 9:
        classes = [15, 42, 52, 62, 64, 67, 88, 90, 95]
        class_weight = {15: 2, 42: 1, 52: 1, 62: 1, 64: 2, 67: 1, 88: 1, 90: 1, 95: 1}
        
        
    if len(np.unique(y_true)) > 14:
        classes.append(99)
        class_weight[99] = 2
    y_p = y_preds.reshape(y_true.shape[0], len(classes), order='F')

    # Trasform y_true in dummies
    y_ohe = pd.get_dummies(y_true)
    # Normalize rows and limit y_preds to 1e-15, 1-1e-15
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
    # Transform to log
    y_p_log = np.log(y_p)
    # Get the log for ones, .values is used to drop the index of DataFrames
    # Exclude class 99 for now, since there is no class99 in the training set
    # we gave a special process for that class
    y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
    # Get the number of positives for each class
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    # Weight average and divide by the number of positives
    class_arr = np.array([class_weight[k] for k in sorted(class_weight.keys())])
    y_w = y_log_ones * class_arr / nb_pos

    loss = - np.sum(y_w) / np.sum(class_arr)
    return 'wloss', loss, False


def multi_weighted_logloss(y_true, y_preds):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge
    """
    # class_weights taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboon
    
    if len(np.unique(y_true)) == 14:
        classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
        class_weight = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}
#     if len(np.unique(y_true)) > 14:
#         classes.append(99)
#         class_weight[99] = 2
    
    # Galaxy Case
    if len(np.unique(y_true)) == 5:
        classes = [6, 16, 53, 65, 92]
        class_weight = {6: 1, 16: 1, 53: 1, 65: 1, 92: 1}
        
    # Out of Galaxy Case
    if len(np.unique(y_true)) == 9:
        classes = [15, 42, 52, 62, 64, 67, 88, 90, 95]
        class_weight = {15: 2, 42: 1, 52: 1, 62: 1, 64: 2, 67: 1, 88: 1, 90: 1, 95: 1}
    
    if len(np.unique(y_true)) > 14:
        classes.append(99)
        class_weight[99] = 2
    y_p = y_preds
    # Trasform y_true in dummies
    y_ohe = pd.get_dummies(y_true)
    # Normalize rows and limit y_preds to 1e-15, 1-1e-15
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
    # Transform to log
    y_p_log = np.log(y_p)
    # Get the log for ones, .values is used to drop the index of DataFrames
    # Exclude class 99 for now, since there is no class99 in the training set
    # we gave a special process for that class
    y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
    # Get the number of positives for each class
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    # Weight average and divide by the number of positives
    class_arr = np.array([class_weight[k] for k in sorted(class_weight.keys())])
    y_w = y_log_ones * class_arr / nb_pos

    loss = - np.sum(y_w) / np.sum(class_arr)
    return loss


def set_df(arr, col_names):
    df = pd.DataFrame(arr)
    df.columns = col_names
    return df
    

def predict_chunk(df_, clfs_, meta_, features, train_mean):
    
    print('Chunk size',df_.shape[0])
    

    full_test_in_gal, full_test_out_gal = fabriquer_feat(df_, meta_)
        


    in_classes = [6, 16, 53, 65, 92]
    out_classes = [15, 42, 52, 62, 64, 67, 88, 90, 95] 

    if full_test_in_gal.shape[0] == 0:
        in_df = pd.DataFrame(columns=in_classes)
        in_ids = []
        
    
    else :
        in_ids = full_test_in_gal['object_id'].astype(np.int64).values
        
        del full_test_in_gal['object_id']
        df_mean = full_test_in_gal.mean(axis=0)
        full_test_in_gal.fillna(df_mean, inplace=True)
    
        # Make predictions in galaxy
        preds_in_gal = None
        for clf in clfs_[0]:
            if preds_in_gal is None:
                preds_in_gal = clf.predict_proba(full_test_in_gal[features[0]]) / len(clfs_[0])
            else:
                preds_in_gal += clf.predict_proba(full_test_in_gal[features[0]]) / len(clfs_[0])
        
        in_df = set_df(preds_in_gal, in_classes)
        
            

    if full_test_out_gal.shape[0] == 0:
        out_df = pd.DataFrame(columns=out_classes)    
        out_ids = []
    
    else :
        out_ids = full_test_out_gal['object_id'].astype(np.int64).values
        
        del full_test_out_gal['object_id']
        df_mean = full_test_out_gal.mean(axis=0)
        full_test_out_gal.fillna(df_mean, inplace=True)
                
        # Make predictions out of galaxy
        preds_out_gal = None
        for clf in clfs_[1]:
            if preds_out_gal is None:
                preds_out_gal = clf.predict_proba(full_test_out_gal[features[1]]) / len(clfs_[1])
            else:
                preds_out_gal += clf.predict_proba(full_test_out_gal[features[1]]) / len(clfs_[1])
        
        out_df = set_df(preds_out_gal, out_classes)
        
    # Merge predictions
    in_out_df = pd.concat([in_df, out_df], axis=0).fillna(0)
    print(in_out_df.shape)
    
    preds_ = in_out_df.values
    
            
    # Compute preds_99 as the proba of class not being any of the others
    # preds_99 = 0.1 gives 1.769
    preds_99 = np.ones(preds_.shape[0])
    for i in range(preds_.shape[1]):
        preds_99 *= (1 - preds_[:, i])

    # Create DataFrame from predictions
    classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
    
#     preds_df_ = pd.DataFrame(preds_, columns=['class_' + str(s) for s in clfs_[0].classes_])
    preds_df_ = pd.DataFrame(preds_, columns=['class_' + str(s) for s in classes])
    preds_df_['object_id'] = np.concatenate((in_ids,out_ids), axis=0)
    preds_df_['class_99'] = 0.14 * preds_99 / np.mean(preds_99) 

    print(preds_df_['class_99'].mean())

    del full_test_in_gal, full_test_out_gal, preds_, in_out_df
    gc.collect()

    return preds_df_


def save_importances(importances_):
    mean_gain = importances_[['gain', 'feature']].groupby('feature').mean()
    importances_['mean_gain'] = importances_['feature'].map(mean_gain['gain'])
    plt.figure(figsize=(8, 12))
    sns.barplot(x='gain', y='feature', data=importances_.sort_values('mean_gain', ascending=False))
    plt.tight_layout()
    plt.savefig('importances.png')


def train_classifiers(full_train=None, y=None):

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    clfs = []
    importances = pd.DataFrame()
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': 14,
        'metric': 'multi_logloss',
        'learning_rate': 0.03,
        'subsample': .9,
        'colsample_bytree': .7,
        'reg_alpha': .01,
        'reg_lambda': .01,
        'min_split_gain': 0.01,
        'min_child_weight': 10,
        'n_estimators': 1500,
        'silent': -1,
        'verbose': -1,
        'max_depth': 3
    }
    oof_preds = np.zeros((len(full_train), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = full_train.iloc[trn_], y.iloc[trn_]
        val_x, val_y = full_train.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=100,
            early_stopping_rounds=50
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        get_logger().info(multi_weighted_logloss(val_y, clf.predict_proba(val_x, num_iteration=clf.best_iteration_)))

        imp_df = pd.DataFrame()
        imp_df['feature'] = full_train.columns
        imp_df['gain'] = clf.feature_importances_
        imp_df['fold'] = fold_ + 1
        importances = pd.concat([importances, imp_df], axis=0, sort=False)

        clfs.append(clf)

    get_logger().info('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))

    return clfs, importances, oof_preds




def get_new_columns(aggs):
    return [k + '_' + agg for k in aggs.keys() for agg in aggs[k]]



    

def add_band_feats(df, db):
    
    
### 均值
    print('Adding feats for the flux mean per band...')
    stats = db.groupby(['object_id','passband'])['flux'].mean().unstack()
    stats.columns = ['band_' + str(col) + '_flux_mean' for col in stats.columns.tolist()]
    
    # band_#_flux_mean互相减去
    mean_cols = stats.columns.tolist()
    for col in mean_cols:
        subtract_cols = [col_ for col_ in mean_cols if col_ < col]
        for sub_col in subtract_cols:
            stats['{}_minus_{}'.format(col, sub_col)] = stats[col] - stats[sub_col]
      
    print('Feats added:',stats.columns.tolist())
    stats['object_id'] = stats.index    
    df = df.merge(stats, on='object_id', how='left').fillna(0)
    
    
### 标准差    
    print('Adding feats for the flux std per band...')
    stats = db.groupby(['object_id','passband'])['flux'].std().unstack()
    stats.columns = ['band_' + str(col) + '_flux_std' for col in stats.columns.tolist()]
    print('Feats added:',stats.columns.tolist())
    stats['object_id'] = stats.index    
    df = df.merge(stats, on='object_id', how='left').fillna(0)

    
### 偏度    
    print('Adding feats for the flux skew per band...')
    stats = db.groupby(['object_id','passband'])['flux'].skew().unstack()
    stats.columns = ['band_' + str(col) + '_flux_skew' for col in stats.columns.tolist()]
    print('Feats added:',stats.columns.tolist())
    stats['object_id'] = stats.index    
    df = df.merge(stats, on='object_id', how='left').fillna(0) 
    

### 最大值
    print('Adding feats for the flux max per band...')
    stats = db.groupby(['object_id','passband'])['flux'].max().unstack()
    stats.columns = ['band_' + str(col) + '_flux_max' for col in stats.columns.tolist()]
    # band_#_flux_max互相减去
    max_cols = stats.columns.tolist()
    for col in max_cols:
        subtract_cols = [col_ for col_ in max_cols if col_ < col]
        for sub_col in subtract_cols:
            stats['{}_minus_{}'.format(col, sub_col)] = stats[col] - stats[sub_col]
            
    print('Feats added:',stats.columns.tolist())
    stats['object_id'] = stats.index    
    df = df.merge(stats, on='object_id', how='left').fillna(0) 
    

    
### 最小值    
    print('Adding feats for the flux min per band...')
    stats = db.groupby(['object_id','passband'])['flux'].min().unstack()
    stats.columns = ['band_' + str(col) + '_flux_min' for col in stats.columns.tolist()]
    # band_#_flux_min互相做差
    min_cols = stats.columns.tolist()
    for col in min_cols:
        subtract_cols = [col_ for col_ in min_cols if col_ < col]
        for sub_col in subtract_cols:
            stats['{}_minus_{}'.format(col, sub_col)] = stats[col] - stats[sub_col]
    print('Feats added:',stats.columns.tolist())
    stats['object_id'] = stats.index    
    df = df.merge(stats, on='object_id', how='left').fillna(0) 
    

    
    
# ### ......  Mean
#     print('Adding feats for the flux_err mean per band...')
#     stats = db.groupby(['object_id','passband'])['flux_err'].mean().unstack()
#     stats.columns = ['band_' + str(col) + '_flux_err_mean' for col in stats.columns.tolist()]      
#     print('Feats added:',stats.columns.tolist())
#     stats['object_id'] = stats.index    
#     df = df.merge(stats, on='object_id', how='left').fillna(0)
    
    
    
    
### 遍历band_list计算 
    print('Adding feats for the flux (max-min)/mean per band...')
    for band_n in range(6):
        df['band_' + str(band_n) + '_flux_diff1'] = df['band_' + str(band_n) + '_flux_max'] - df['band_' + str(band_n) + '_flux_min']
        df['band_' + str(band_n) + '_flux_diff2'] = df['band_' + str(band_n) + '_flux_diff1']/df['band_' + str(band_n) + '_flux_mean']
        print('Feature added: band_' + str(band_n) + '_flux_diff2')
        
#         df['band_' + str(band_n) + '_flux_err_ratio'] = df['band_' + str(band_n) + '_flux_err_mean']/df['band_' + str(band_n) + '_flux_mean']
#         print('Feature added: band_' + str(band_n) + '_flux_err_ratio')

    return df
    


def add_feats_within_time_interval_out(int_n, df, db):
    print('Number of Intervals :', int_n)
    t_min = db.mjd.min()
    t_max = db.mjd.max()
    print('Min and Max MJD time : {}, {}'.format(t_min, t_max))    
    int_dur = (t_max - t_min)/int_n
    for i in range(int_n):
        
        db_fil = db[(db.mjd>=(t_min+i*int_dur))&(db.mjd<(t_min + (i+1)*int_dur))][['object_id','flux','passband']]
        print('Interval #{}, record quantity: {}'.format(i+1, db_fil.shape[0]))
        
        # interval_#_flux_？
        stats = db_fil.groupby('object_id', as_index=False)['flux'].agg({'interval_{}_flux_mean'.format(i+1):'mean',
#                                                                           'interval_{}_flux_std'.format(i+1):'std',
                                                                          'interval_{}_flux_min'.format(i+1):'min',
                                                                          'interval_{}_flux_max'.format(i+1):'max',
#                                                                          'interval_{}_flux_skew'.format(i+1):'skew'
                                                                        })
        print('New features added: ',stats.columns.tolist())
        df = df.merge(stats, on='object_id', how='left')
        
        
#         # interval_#_band_#_flux_？
#         stats = db_fil.groupby(['object_id','passband'])['flux'].skew().unstack()
#         stats.columns = ['interval_{}_band_{}_flux_skew'.format(i+1, str(col)) for col in stats.columns.tolist()]
#         print('Feats added:',stats.columns.tolist())
#         stats['object_id'] = stats.index    
#         df = df.merge(stats, on='object_id', how='left').fillna(0) 
        
                
#     # interval_#_flux_？ 互相做差
#     for key in ['max', 'min', 'mean']:
# #     for key in ['mean']:
#         key_cols = ['interval_{}_flux_{}'.format(i, key) for i in range(1, int_n+1)]
#         for col in key_cols:
#             subtract_cols = [col_ for col_ in key_cols if col_ < col]
#             for sub_col in subtract_cols:
#                 df['{}_minus_{}'.format(col, sub_col)] = df[col] - df[sub_col]
#                 print('Feature added:', '{}_minus_{}'.format(col, sub_col))
        

    
    print('Dimension of data after adding features relevant to time intervals', df.shape)
    
    return df
    
    
def add_feats_within_time_interval(int_n, df, db):
    print('Number of Intervals :', int_n)
    t_min = db.mjd.min()
    t_max = db.mjd.max()
    print('Min and Max MJD time : {}, {}'.format(t_min, t_max))    
    int_dur = (t_max - t_min)/int_n
    for i in range(int_n):
        
        db_fil = db[(db.mjd>=(t_min+i*int_dur))&(db.mjd<(t_min + (i+1)*int_dur))][['object_id','flux','passband']]
        print('Interval #{}, record quantity: {}'.format(i+1, db_fil.shape[0]))
        
        # interval_#_flux_？
        stats = db_fil.groupby('object_id', as_index=False)['flux'].agg({'interval_{}_flux_mean'.format(i+1):'mean',
                                                                          'interval_{}_flux_std'.format(i+1):'std',
                                                                          'interval_{}_flux_min'.format(i+1):'min',
                                                                          'interval_{}_flux_max'.format(i+1):'max',
                                                                         'interval_{}_flux_skew'.format(i+1):'skew'})
        print('New features added: ',stats.columns.tolist())
        df = df.merge(stats, on='object_id', how='left')
        
        
#         # interval_#_band_#_flux_？
#         stats = db_fil.groupby(['object_id','passband'])['flux'].skew().unstack()
#         stats.columns = ['interval_{}_band_{}_flux_skew'.format(i+1, str(col)) for col in stats.columns.tolist()]
#         print('Feats added:',stats.columns.tolist())
#         stats['object_id'] = stats.index    
#         df = df.merge(stats, on='object_id', how='left').fillna(0) 
        
                
    # interval_#_flux_？ 互相做差
    for key in ['max', 'min', 'mean']:
#     for key in ['max']:
        key_cols = ['interval_{}_flux_{}'.format(i, key) for i in range(1, int_n+1)]
        for col in key_cols:
            subtract_cols = [col_ for col_ in key_cols if col_ < col]
            for sub_col in subtract_cols:
                df['{}_minus_{}'.format(col, sub_col)] = df[col] - df[sub_col]
                print('Feature added:', '{}_minus_{}'.format(col, sub_col))
        

    
    print('Dimension of data after adding features relevant to time intervals', df.shape)
    
    return df
    

def agg_by_flux_feats(df):
    
    df['flux_ratio'] = df['flux'] / df['flux_err']
    
    df['flux_ratio_sq'] = np.power(df['flux'] / df['flux_err'], 2.0)
    df['flux_by_flux_ratio_sq'] = df['flux'] * df['flux_ratio_sq']
    
    aggs = {
#         'mjd': ['min', 'max', 'size'],
#         'passband': ['mean', 'std', 'var'],  
        'flux': ['min', 'max', 'mean', 'median', 'std', 'skew'],
        'flux_err': ['min', 'max', 'mean', 'median', 'std'],
        'flux_ratio': ['min', 'max', 'mean', 'std'],
        'detected': ['mean'],  # ''min', 'max', 'mean', 'median', 'std'],
    }   

#     aggs['flux_ratio_sq'] = ['sum']
#     aggs['flux_by_flux_ratio_sq'] = ['sum']

    
    agg_df = df.groupby('object_id').agg(aggs)
    new_columns = get_new_columns(aggs)
    agg_df.columns = new_columns

    agg_df = add_flux_second_order_features_to_agg(df=agg_df)
    
    return agg_df
    

def get_by_galaxy(df):
    df_in_gal = df[df['in_galaxy']==1]
    objects_in_gal = df_in_gal['object_id'].unique().tolist()
    print('Number of objects in galaxy :',len(objects_in_gal))
    
    df_out_gal = df[df['in_galaxy']==0]
    objects_out_gal = df_out_gal['object_id'].unique().tolist()
    print('Number of objects out of galaxy :',len(objects_out_gal))
    print('Just to check, sum of objects :', len(objects_in_gal) + len(objects_out_gal))
    print('Total number should be', len(df['object_id'].unique().tolist()))
    
    return df_in_gal, df_out_gal
    
def add_photo_feats(df):
    df['hostgal_photoz_ratio'] = df['hostgal_photoz']/df['hostgal_photoz_err']
    
    return df
    
    
def fabriquer_feat(db, meta):
    

    
    # META数据提供划分银河系内外的依据
    # 增加是否属于银河系的特征
    meta.distmod.fillna(0,inplace=True)
    meta['in_galaxy'] = 0
    meta.loc[(meta.distmod == 0), 'in_galaxy'] = 1
    
    # 时序数据和META数据融合，形成以mjd为行的数据
    db_meta = db.merge(meta, on='object_id', how='left')
    print('Dimension of merge data for MJD relevant data and META data ', db_meta.shape)
    
    # 对时序融合数据进行分割
    db_in_gal, db_out_gal = get_by_galaxy(db_meta)
    print('Dimension of merge data for that in galaxy and that out of galaxy ', db_in_gal.shape, db_out_gal.shape)
    
    # 基本特征聚合
    agg_df = agg_by_flux_feats(db)
    print('Dimension of aggregated data on flux features', agg_df.shape)
    
    # 聚合数据和META数据融合，形成以object_id为行的数据
    agg_df_meta = agg_df.merge(meta, on='object_id', how='left')
    print('Dimension of merge data for Object relevant data and META data', agg_df_meta.shape)

    # 对object融合数据进行分割
    df_in_gal, df_out_gal = get_by_galaxy(agg_df_meta)
    print('Dimension of merge data for that in galaxy and that out of galaxy ', df_in_gal.shape, df_out_gal.shape)

    # 对银河系内外数据分别提取特征
    print('Features extraction begins...')
    
    # 特征提取前先校验数据行数
    if df_in_gal.shape[0] == 0:
        print('Object relevant data in the Galaxy has no data, nothing to predict.')
    
    else :
        # 银河系内
        print('In terms of that in the Galaxy...')
        
        # 增加band相关特征
        df_in_gal = add_band_feats(df_in_gal, db_in_gal)    
        
        # 增加按MJD划分时间统计得到的特征
        df_in_gal = add_feats_within_time_interval(6, df_in_gal, db_in_gal)        
    
    if df_out_gal.shape[0] == 0:
        print('Object relevant data out of the Galaxy has no data, nothing to predict.')    
    
    else :       
        # 银河系外
        print('In terms of that out of the Galaxy...')
        
        # 增加hostgal_photoz相关特征
        df_out_gal = add_photo_feats(df_out_gal)
        
        # 增加band相关特征
        df_out_gal = add_band_feats(df_out_gal, db_out_gal)    
        
        # 增加按MJD划分时间统计得到的特征
        df_out_gal = add_feats_within_time_interval_out(6, df_out_gal, db_out_gal) 
    
    return df_in_gal, df_out_gal
    
    

def add_flux_second_order_features_to_agg(df):
#     df['mjd_diff'] = df['mjd_max'] - df['mjd_min']
    df['flux_diff'] = df['flux_max'] - df['flux_min']
    df['flux_dif2'] = (df['flux_max'] - df['flux_min']) / df['flux_mean']
#     df['flux_w_mean'] = df['flux_by_flux_ratio_sq_sum'] / df['flux_ratio_sq_sum']
#     df['flux_dif3'] = (df['flux_max'] - df['flux_min']) / df['flux_w_mean']

#     del df['mjd_max'], df['mjd_min']

    return df
    
    

def main():

    train = pd.read_csv('../input/training_set.csv')

    meta_train = pd.read_csv('../input/training_set_metadata.csv')
    # 去除无效特征
    del meta_train['hostgal_specz']
    print('Feature hostgal_specz is removed')
    
    
    full_train_in_gal, full_train_out_gal = fabriquer_feat(train, meta_train)
    
    del train
    
    print('Training begins...')
    

    y_list = []
    preds_list = []
    clf_list = []
    for df in [full_train_in_gal, full_train_out_gal]:
        del df['object_id']
        df_mean = df.mean(axis=0)
        df.fillna(df_mean, inplace=True)
        y = df['target']
        y_list.append(y)
        del df['target']
        gc.collect()
        get_logger().info(df.columns)
    
        clfs, importances, preds = train_classifiers(df, y)
        save_importances(importances_=importances)
        preds_list.append(preds)
        clf_list.append(clfs)
    
        
    
        

    
    
    
    
    
    
    
    
    
    
    meta_test = pd.read_csv('../input/test_set_metadata.csv')
    # 去除无效特征
    del meta_test['hostgal_specz']
    print('Feature hostgal_specz is removed')
    
    import time
    
    start = time.time()
#     chunks = 5000000
    chunks = 500000
    remain_df = None
    
    for i_c, df in enumerate(pd.read_csv('../input/test_set_sample.csv', chunksize=chunks, iterator=True)):
        # Check object_ids
        # I believe np.unique keeps the order of group_ids as they appear in the file
        unique_ids = np.unique(df['object_id'])
        # 最后一个ID的内容
        new_remain_df = df.loc[df['object_id'] == unique_ids[-1]].copy()
    
        if remain_df is None:
            #  除最后一个ID外的内容
            df = df.loc[df['object_id'].isin(unique_ids[:-1])].copy()
        else:
            df = pd.concat([remain_df, df.loc[df['object_id'].isin(unique_ids[:-1])]], axis=0)
    
        # Create remaining samples df
        remain_df = new_remain_df
        
        
    
        preds_df = predict_chunk(df_=df,
                                 clfs_=clf_list,
                                 meta_=meta_test,
                                 features=[full_train_in_gal.columns, full_train_out_gal.columns],
                                 train_mean=None)
    
        if i_c == 0:
            preds_df.to_csv('predictions_v3.csv', header=True, index=False, float_format='%.6f')
        else:
            preds_df.to_csv('predictions_v3.csv', header=False, mode='a', index=False, float_format='%.6f')
    
        del preds_df
        gc.collect()
    
        if (i_c + 1) % 5 == 0:
            # get_logger().info('%15d done in %5.1f' % (chunks * (i_c + 1), (time.time() - start) / 60))
            # print('%15d done in %5.1f' % (chunks * (i_c + 1), (time.time() - start) / 60))
            get_logger().info('%15d done in %5.1f' % (chunks * (i_c + 1), (time.time() - start) / 60))
            get_logger().info('Progress percentage : %5.2f' % (chunks * (i_c + 1)/500000000))
            get_logger().info('Time estimated left : %5.2f' % ((time.time() - start) / 60 * (500000000-chunks * (i_c + 1))/(chunks * (i_c + 1))))
            print('%15d done in %5.1f' % (chunks * (i_c + 1), (time.time() - start) / 60))
            print('Progress percentage : %5.2f' % (chunks * (i_c + 1)/500000000))
            print('Time estimated left : %5.2f' % ((time.time() - start) / 60 * (500000000-chunks * (i_c + 1))/(chunks * (i_c + 1))))
    
    # Compute last object in remain_df
    
    preds_df = predict_chunk(df_=remain_df,
                             clfs_=clf_list,
                             meta_=meta_test,
                             features=[full_train_in_gal.columns, full_train_out_gal.columns],
                             train_mean=None)
    
    preds_df.to_csv('predictions_v3.csv', header=False, mode='a', index=False, float_format='%.6f')
    
    z = pd.read_csv('predictions_v3.csv')
    
    z = z.groupby('object_id').mean()
    
    z.to_csv('single_predictions_v3.csv', index=True, float_format='%.6f')
    
    z = z.astype(np.float32)
    
    z['object_id'] = z.index.astype(np.int32)
    
    z = z.drop_duplicates(subset=['object_id'], keep='first')

    z.to_csv('single_predictions_v3.gz', index=False, float_format='%.6f', compression='gzip')



    
    


if __name__ == '__main__':
    gc.enable()
    create_logger()
    try:
        main()
    except Exception:
        get_logger().exception('Unexpected Exception Occured')
        raise


Feature hostgal_specz is removed
Dimension of merge data for MJD relevant data and META data  (1421705, 17)
Number of objects in galaxy : 2325
Number of objects out of galaxy : 5523
Just to check, sum of objects : 7848
Total number should be 7848
Dimension of merge data for that in galaxy and that out of galaxy  (400574, 17) (1021131, 17)
Dimension of aggregated data on flux features (7848, 18)
Dimension of merge data for Object relevant data and META data (7848, 30)
Number of objects in galaxy : 2325
Number of objects out of galaxy : 5523
Just to check, sum of objects : 7848
Total number should be 7848
Dimension of merge data for that in galaxy and that out of galaxy  (2325, 30) (5523, 30)
Features extraction begins...
In terms of that in the Galaxy...
Adding feats for the flux mean per band...
Feats added: ['band_0_flux_mean', 'band_1_flux_mean', 'band_2_flux_mean', 'band_3_flux_mean', 'band_4_flux_mean', 'band_5_flux_mean', 'band_1_flux_mean_minus_band_0_flux_mean', 'band_2_flux_mea

Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version


Feats added: ['band_0_flux_skew', 'band_1_flux_skew', 'band_2_flux_skew', 'band_3_flux_skew', 'band_4_flux_skew', 'band_5_flux_skew']
Adding feats for the flux max per band...
Feats added: ['band_0_flux_max', 'band_1_flux_max', 'band_2_flux_max', 'band_3_flux_max', 'band_4_flux_max', 'band_5_flux_max', 'band_1_flux_max_minus_band_0_flux_max', 'band_2_flux_max_minus_band_0_flux_max', 'band_2_flux_max_minus_band_1_flux_max', 'band_3_flux_max_minus_band_0_flux_max', 'band_3_flux_max_minus_band_1_flux_max', 'band_3_flux_max_minus_band_2_flux_max', 'band_4_flux_max_minus_band_0_flux_max', 'band_4_flux_max_minus_band_1_flux_max', 'band_4_flux_max_minus_band_2_flux_max', 'band_4_flux_max_minus_band_3_flux_max', 'band_5_flux_max_minus_band_0_flux_max', 'band_5_flux_max_minus_band_1_flux_max', 'band_5_flux_max_minus_band_2_flux_max', 'band_5_flux_max_minus_band_3_flux_max', 'band_5_flux_max_minus_band_4_flux_max']
Adding feats for the flux min per band...
Feats added: ['band_0_flux_min', 'band_

Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version


New features added:  ['object_id', 'interval_1_flux_mean', 'interval_1_flux_std', 'interval_1_flux_min', 'interval_1_flux_max', 'interval_1_flux_skew']
Interval #2, record quantity: 82096
New features added:  ['object_id', 'interval_2_flux_mean', 'interval_2_flux_std', 'interval_2_flux_min', 'interval_2_flux_max', 'interval_2_flux_skew']
Interval #3, record quantity: 61348
New features added:  ['object_id', 'interval_3_flux_mean', 'interval_3_flux_std', 'interval_3_flux_min', 'interval_3_flux_max', 'interval_3_flux_skew']
Interval #4, record quantity: 73604
New features added:  ['object_id', 'interval_4_flux_mean', 'interval_4_flux_std', 'interval_4_flux_min', 'interval_4_flux_max', 'interval_4_flux_skew']
Interval #5, record quantity: 56390
New features added:  ['object_id', 'interval_5_flux_mean', 'interval_5_flux_std', 'interval_5_flux_min', 'interval_5_flux_max', 'interval_5_flux_skew']
Interval #6, record quantity: 88858
New features added:  ['object_id', 'interval_6_flux_mean', '

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Feats added: ['band_0_flux_std', 'band_1_flux_std', 'band_2_flux_std', 'band_3_flux_std', 'band_4_flux_std', 'band_5_flux_std']
Adding feats for the flux skew per band...
Feats added: ['band_0_flux_skew', 'band_1_flux_skew', 'band_2_flux_skew', 'band_3_flux_skew', 'band_4_flux_skew', 'band_5_flux_skew']
Adding feats for the flux max per band...
Feats added: ['band_0_flux_max', 'band_1_flux_max', 'band_2_flux_max', 'band_3_flux_max', 'band_4_flux_max', 'band_5_flux_max', 'band_1_flux_max_minus_band_0_flux_max', 'band_2_flux_max_minus_band_0_flux_max', 'band_2_flux_max_minus_band_1_flux_max', 'band_3_flux_max_minus_band_0_flux_max', 'band_3_flux_max_minus_band_1_flux_max', 'band_3_flux_max_minus_band_2_flux_max', 'band_4_flux_max_minus_band_0_flux_max', 'band_4_flux_max_minus_band_1_flux_max', 'band_4_flux_max_minus_band_2_flux_max', 'band_4_flux_max_minus_band_3_flux_max', 'band_5_flux_max_minus_band_0_flux_max', 'band_5_flux_max_minus_band_1_flux_max', 'band_5_flux_max_minus_band_2_flu

[INFO]2018-10-30 11:05:52,464:main:Index(['flux_min', 'flux_max', 'flux_mean', 'flux_median', 'flux_std',
       'flux_skew', 'flux_err_min', 'flux_err_max', 'flux_err_mean',
       'flux_err_median',
       ...
       'interval_4_flux_mean_minus_interval_3_flux_mean',
       'interval_5_flux_mean_minus_interval_1_flux_mean',
       'interval_5_flux_mean_minus_interval_2_flux_mean',
       'interval_5_flux_mean_minus_interval_3_flux_mean',
       'interval_5_flux_mean_minus_interval_4_flux_mean',
       'interval_6_flux_mean_minus_interval_1_flux_mean',
       'interval_6_flux_mean_minus_interval_2_flux_mean',
       'interval_6_flux_mean_minus_interval_3_flux_mean',
       'interval_6_flux_mean_minus_interval_4_flux_mean',
       'interval_6_flux_mean_minus_interval_5_flux_mean'],
      dtype='object', length=190)


Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.169033	training's wloss: 0.301752	valid_1's multi_logloss: 0.202966	valid_1's wloss: 0.348914
[200]	training's multi_logloss: 0.0541496	training's wloss: 0.122061	valid_1's multi_logloss: 0.100286	valid_1's wloss: 0.202674
[300]	training's multi_logloss: 0.0307363	training's wloss: 0.0756457	valid_1's multi_logloss: 0.0827752	valid_1's wloss: 0.173819
[400]	training's multi_logloss: 0.0229738	training's wloss: 0.0585223	valid_1's multi_logloss: 0.0762208	valid_1's wloss: 0.163894
[500]	training's multi_logloss: 0.0194245	training's wloss: 0.050665	valid_1's multi_logloss: 0.0734935	valid_1's wloss: 0.159746
[600]	training's multi_logloss: 0.017266	training's wloss: 0.0460914	valid_1's multi_logloss: 0.0718967	valid_1's wloss: 0.157539
[700]	training's multi_logloss: 0.0160478	training's wloss: 0.043496	valid_1's multi_logloss: 0.0707719	valid_1's wloss: 0.155067
[800]	training's multi_loglos

[INFO]2018-10-30 11:05:59,989:main:0.1499498723033605


Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.163047	training's wloss: 0.276593	valid_1's multi_logloss: 0.236771	valid_1's wloss: 0.595959
[200]	training's multi_logloss: 0.0494347	training's wloss: 0.10914	valid_1's multi_logloss: 0.140613	valid_1's wloss: 0.501168
[300]	training's multi_logloss: 0.0284263	training's wloss: 0.069684	valid_1's multi_logloss: 0.127117	valid_1's wloss: 0.470301
[400]	training's multi_logloss: 0.021736	training's wloss: 0.055813	valid_1's multi_logloss: 0.12395	valid_1's wloss: 0.456676
[500]	training's multi_logloss: 0.0184705	training's wloss: 0.0485095	valid_1's multi_logloss: 0.122793	valid_1's wloss: 0.45191
[600]	training's multi_logloss: 0.0168229	training's wloss: 0.0445997	valid_1's multi_logloss: 0.122189	valid_1's wloss: 0.449461
[700]	training's multi_logloss: 0.0156467	training's wloss: 0.0418909	valid_1's multi_logloss: 0.121268	valid_1's wloss: 0.445549
[800]	training's multi_logloss: 0.014

[INFO]2018-10-30 11:06:06,287:main:0.44203528244246587


Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.16921	training's wloss: 0.298611	valid_1's multi_logloss: 0.225788	valid_1's wloss: 0.394495
[200]	training's multi_logloss: 0.0539567	training's wloss: 0.119552	valid_1's multi_logloss: 0.118112	valid_1's wloss: 0.257296
[300]	training's multi_logloss: 0.0304425	training's wloss: 0.0720905	valid_1's multi_logloss: 0.0964898	valid_1's wloss: 0.224019
[400]	training's multi_logloss: 0.022831	training's wloss: 0.0560229	valid_1's multi_logloss: 0.0884485	valid_1's wloss: 0.209887
[500]	training's multi_logloss: 0.0192832	training's wloss: 0.0488656	valid_1's multi_logloss: 0.0850786	valid_1's wloss: 0.207034
[600]	training's multi_logloss: 0.017261	training's wloss: 0.0448409	valid_1's multi_logloss: 0.0832601	valid_1's wloss: 0.205641
[700]	training's multi_logloss: 0.0160516	training's wloss: 0.0421513	valid_1's multi_logloss: 0.0819943	valid_1's wloss: 0.203856
[800]	training's multi_loglos

[INFO]2018-10-30 11:06:16,647:main:0.1972541285358784


Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.172261	training's wloss: 0.304318	valid_1's multi_logloss: 0.20288	valid_1's wloss: 0.417031
[200]	training's multi_logloss: 0.0548965	training's wloss: 0.117153	valid_1's multi_logloss: 0.0921482	valid_1's wloss: 0.273234
[300]	training's multi_logloss: 0.0318718	training's wloss: 0.0707637	valid_1's multi_logloss: 0.0716456	valid_1's wloss: 0.243382
[400]	training's multi_logloss: 0.0236814	training's wloss: 0.0544754	valid_1's multi_logloss: 0.0648693	valid_1's wloss: 0.233342
[500]	training's multi_logloss: 0.0199544	training's wloss: 0.0474491	valid_1's multi_logloss: 0.0613999	valid_1's wloss: 0.229562
[600]	training's multi_logloss: 0.0178056	training's wloss: 0.0436429	valid_1's multi_logloss: 0.0594178	valid_1's wloss: 0.22784
[700]	training's multi_logloss: 0.0165051	training's wloss: 0.0408568	valid_1's multi_logloss: 0.0583852	valid_1's wloss: 0.226314
Early stopping, best iterat

[INFO]2018-10-30 11:06:23,059:main:0.22600977501542485


Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.167726	training's wloss: 0.293084	valid_1's multi_logloss: 0.228952	valid_1's wloss: 0.435358
[200]	training's multi_logloss: 0.0515094	training's wloss: 0.114783	valid_1's multi_logloss: 0.126558	valid_1's wloss: 0.316019
[300]	training's multi_logloss: 0.0288055	training's wloss: 0.071199	valid_1's multi_logloss: 0.110774	valid_1's wloss: 0.291484
[400]	training's multi_logloss: 0.0216337	training's wloss: 0.0559576	valid_1's multi_logloss: 0.105288	valid_1's wloss: 0.287763
[500]	training's multi_logloss: 0.0185206	training's wloss: 0.0490552	valid_1's multi_logloss: 0.103088	valid_1's wloss: 0.284832
[600]	training's multi_logloss: 0.0169532	training's wloss: 0.0455655	valid_1's multi_logloss: 0.101445	valid_1's wloss: 0.282328
[700]	training's multi_logloss: 0.0157822	training's wloss: 0.0428876	valid_1's multi_logloss: 0.100523	valid_1's wloss: 0.281044
[800]	training's multi_logloss: 

[INFO]2018-10-30 11:06:30,098:main:0.28062322449438176
[INFO]2018-10-30 11:06:30,108:main:MULTI WEIGHTED LOG LOSS : 0.25886 
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


MULTI WEIGHTED LOG LOSS : 0.25886 


[INFO]2018-10-30 11:06:36,776:main:Index(['flux_min', 'flux_max', 'flux_mean', 'flux_median', 'flux_std',
       'flux_skew', 'flux_err_min', 'flux_err_max', 'flux_err_mean',
       'flux_err_median',
       ...
       'interval_3_flux_max', 'interval_4_flux_mean', 'interval_4_flux_min',
       'interval_4_flux_max', 'interval_5_flux_mean', 'interval_5_flux_min',
       'interval_5_flux_max', 'interval_6_flux_mean', 'interval_6_flux_min',
       'interval_6_flux_max'],
      dtype='object', length=134)


Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.917817	training's wloss: 1.14406	valid_1's multi_logloss: 1.033	valid_1's wloss: 1.41066
[200]	training's multi_logloss: 0.722588	training's wloss: 0.910475	valid_1's multi_logloss: 0.904551	valid_1's wloss: 1.31387
[300]	training's multi_logloss: 0.624858	training's wloss: 0.766334	valid_1's multi_logloss: 0.866057	valid_1's wloss: 1.26616
[400]	training's multi_logloss: 0.551629	training's wloss: 0.65004	valid_1's multi_logloss: 0.847394	valid_1's wloss: 1.24643
[500]	training's multi_logloss: 0.492913	training's wloss: 0.561163	valid_1's multi_logloss: 0.836207	valid_1's wloss: 1.23775
[600]	training's multi_logloss: 0.443096	training's wloss: 0.489882	valid_1's multi_logloss: 0.830026	valid_1's wloss: 1.23795
Early stopping, best iteration is:
[550]	training's multi_logloss: 0.466955	training's wloss: 0.523276	valid_1's multi_logloss: 0.832682	valid_1's wloss: 1.23586


[INFO]2018-10-30 11:06:49,377:main:1.2358575637252864


Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.918311	training's wloss: 1.15155	valid_1's multi_logloss: 1.01533	valid_1's wloss: 1.31413
[200]	training's multi_logloss: 0.718478	training's wloss: 0.902559	valid_1's multi_logloss: 0.886493	valid_1's wloss: 1.20885
[300]	training's multi_logloss: 0.61689	training's wloss: 0.743767	valid_1's multi_logloss: 0.849332	valid_1's wloss: 1.18164
Early stopping, best iteration is:
[349]	training's multi_logloss: 0.578592	training's wloss: 0.683561	valid_1's multi_logloss: 0.839556	valid_1's wloss: 1.17803


[INFO]2018-10-30 11:06:58,372:main:1.178028945457796


Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.909633	training's wloss: 1.14683	valid_1's multi_logloss: 1.04151	valid_1's wloss: 1.33722
[200]	training's multi_logloss: 0.707384	training's wloss: 0.899418	valid_1's multi_logloss: 0.92188	valid_1's wloss: 1.23414
[300]	training's multi_logloss: 0.605347	training's wloss: 0.737562	valid_1's multi_logloss: 0.890519	valid_1's wloss: 1.19966
[400]	training's multi_logloss: 0.532527	training's wloss: 0.620107	valid_1's multi_logloss: 0.877816	valid_1's wloss: 1.19406
Early stopping, best iteration is:
[362]	training's multi_logloss: 0.557966	training's wloss: 0.66049	valid_1's multi_logloss: 0.881754	valid_1's wloss: 1.19321


[INFO]2018-10-30 11:07:07,710:main:1.193212058918227


Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.917181	training's wloss: 1.14569	valid_1's multi_logloss: 1.01141	valid_1's wloss: 1.34203
[200]	training's multi_logloss: 0.720441	training's wloss: 0.901372	valid_1's multi_logloss: 0.88418	valid_1's wloss: 1.23935
[300]	training's multi_logloss: 0.618191	training's wloss: 0.744526	valid_1's multi_logloss: 0.841344	valid_1's wloss: 1.19938
[400]	training's multi_logloss: 0.544169	training's wloss: 0.632313	valid_1's multi_logloss: 0.82082	valid_1's wloss: 1.1852
[500]	training's multi_logloss: 0.48545	training's wloss: 0.54609	valid_1's multi_logloss: 0.811667	valid_1's wloss: 1.17922
Early stopping, best iteration is:
[479]	training's multi_logloss: 0.49665	training's wloss: 0.562252	valid_1's multi_logloss: 0.812823	valid_1's wloss: 1.17827


[INFO]2018-10-30 11:07:19,463:main:1.1782744783079875


Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.920307	training's wloss: 1.15323	valid_1's multi_logloss: 1.01103	valid_1's wloss: 1.3544
[200]	training's multi_logloss: 0.72458	training's wloss: 0.914955	valid_1's multi_logloss: 0.882937	valid_1's wloss: 1.24197
[300]	training's multi_logloss: 0.623648	training's wloss: 0.754206	valid_1's multi_logloss: 0.842624	valid_1's wloss: 1.19038
[400]	training's multi_logloss: 0.549989	training's wloss: 0.636779	valid_1's multi_logloss: 0.824321	valid_1's wloss: 1.1719
[500]	training's multi_logloss: 0.49327	training's wloss: 0.551999	valid_1's multi_logloss: 0.816096	valid_1's wloss: 1.16691
Early stopping, best iteration is:
[462]	training's multi_logloss: 0.513579	training's wloss: 0.582307	valid_1's multi_logloss: 0.817941	valid_1's wloss: 1.16668


[INFO]2018-10-30 11:07:30,747:main:1.1666816449932116
[INFO]2018-10-30 11:07:30,757:main:MULTI WEIGHTED LOG LOSS : 1.19084 
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


MULTI WEIGHTED LOG LOSS : 1.19084 
Feature hostgal_specz is removed
Chunk size 499895
Dimension of merge data for MJD relevant data and META data  (499895, 16)
Number of objects in galaxy : 13
Number of objects out of galaxy : 1504
Just to check, sum of objects : 1517
Total number should be 1517
Dimension of merge data for that in galaxy and that out of galaxy  (4352, 16) (495543, 16)
Dimension of aggregated data on flux features (1517, 18)
Dimension of merge data for Object relevant data and META data (1517, 29)
Number of objects in galaxy : 13
Number of objects out of galaxy : 1504
Just to check, sum of objects : 1517
Total number should be 1517
Dimension of merge data for that in galaxy and that out of galaxy  (13, 29) (1504, 29)
Features extraction begins...
In terms of that in the Galaxy...
Adding feats for the flux mean per band...
Feats added: ['band_0_flux_mean', 'band_1_flux_mean', 'band_2_flux_mean', 'band_3_flux_mean', 'band_4_flux_mean', 'band_5_flux_mean', 'band_1_flux_mea

Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


 interval_2_flux_max_minus_interval_1_flux_max
Feature added: interval_3_flux_max_minus_interval_1_flux_max
Feature added: interval_3_flux_max_minus_interval_2_flux_max
Feature added: interval_4_flux_max_minus_interval_1_flux_max
Feature added: interval_4_flux_max_minus_interval_2_flux_max
Feature added: interval_4_flux_max_minus_interval_3_flux_max
Feature added: interval_5_flux_max_minus_interval_1_flux_max
Feature added: interval_5_flux_max_minus_interval_2_flux_max
Feature added: interval_5_flux_max_minus_interval_3_flux_max
Feature added: interval_5_flux_max_minus_interval_4_flux_max
Feature added: interval_6_flux_max_minus_interval_1_flux_max
Feature added: interval_6_flux_max_minus_interval_2_flux_max
Feature added: interval_6_flux_max_minus_interval_3_flux_max
Feature added: interval_6_flux_max_minus_interval_4_flux_max
Feature added: interval_6_flux_max_minus_interval_5_flux_max
Feature added: interval_2_flux_min_minus_interval_1_flux_min
Feature added: interval_3_flux_min_min

Chunk size 499845
Dimension of merge data for MJD relevant data and META data  (499845, 16)
Number of objects in galaxy : 4
Number of objects out of galaxy : 1514
Just to check, sum of objects : 1518
Total number should be 1518
Dimension of merge data for that in galaxy and that out of galaxy  (1362, 16) (498483, 16)
Dimension of aggregated data on flux features (1518, 18)
Dimension of merge data for Object relevant data and META data (1518, 29)
Number of objects in galaxy : 4
Number of objects out of galaxy : 1514
Just to check, sum of objects : 1518
Total number should be 1518
Dimension of merge data for that in galaxy and that out of galaxy  (4, 29) (1514, 29)
Features extraction begins...
In terms of that in the Galaxy...
Adding feats for the flux mean per band...
Feats added: ['band_0_flux_mean', 'band_1_flux_mean', 'band_2_flux_mean', 'band_3_flux_mean', 'band_4_flux_mean', 'band_5_flux_mean', 'band_1_flux_mean_minus_band_0_flux_mean', 'band_2_flux_mean_minus_band_0_flux_mean', '

Feats added: ['band_0_flux_skew', 'band_1_flux_skew', 'band_2_flux_skew', 'band_3_flux_skew', 'band_4_flux_skew', 'band_5_flux_skew']
Adding feats for the flux max per band...
Feats added: ['band_0_flux_max', 'band_1_flux_max', 'band_2_flux_max', 'band_3_flux_max', 'band_4_flux_max', 'band_5_flux_max', 'band_1_flux_max_minus_band_0_flux_max', 'band_2_flux_max_minus_band_0_flux_max', 'band_2_flux_max_minus_band_1_flux_max', 'band_3_flux_max_minus_band_0_flux_max', 'band_3_flux_max_minus_band_1_flux_max', 'band_3_flux_max_minus_band_2_flux_max', 'band_4_flux_max_minus_band_0_flux_max', 'band_4_flux_max_minus_band_1_flux_max', 'band_4_flux_max_minus_band_2_flux_max', 'band_4_flux_max_minus_band_3_flux_max', 'band_5_flux_max_minus_band_0_flux_max', 'band_5_flux_max_minus_band_1_flux_max', 'band_5_flux_max_minus_band_2_flux_max', 'band_5_flux_max_minus_band_3_flux_max', 'band_5_flux_max_minus_band_4_flux_max']
Adding feats for the flux min per band...
Feats added: ['band_0_flux_min', 'band_

Dimension of merge data for MJD relevant data and META data  (260, 16)
Number of objects in galaxy : 0
Number of objects out of galaxy : 1
Just to check, sum of objects : 1
Total number should be 1
Dimension of merge data for that in galaxy and that out of galaxy  (0, 16) (260, 16)
Dimension of aggregated data on flux features (1, 18)
Dimension of merge data for Object relevant data and META data (1, 29)
Number of objects in galaxy : 0
Number of objects out of galaxy : 1
Just to check, sum of objects : 1
Total number should be 1
Dimension of merge data for that in galaxy and that out of galaxy  (0, 29) (1, 29)
Features extraction begins...
Object relevant data in the Galaxy has no data, nothing to predict.
In terms of that out of the Galaxy...
Adding feats for the flux mean per band...
Feats added: ['band_0_flux_mean', 'band_1_flux_mean', 'band_2_flux_mean', 'band_3_flux_mean', 'band_4_flux_mean', 'band_5_flux_mean', 'band_1_flux_mean_minus_band_0_flux_mean', 'band_2_flux_mean_minus_ba