## 准备

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import gc
import os
import matplotlib.pyplot as plt
import seaborn as sns 
import lightgbm as lgb
from catboost import Pool, CatBoostClassifier
import itertools
import pickle, gzip
import glob
from sklearn.preprocessing import StandardScaler
import datetime


from keras.models import Sequential
from keras.layers import Dense,BatchNormalization,Dropout
from keras.callbacks import ReduceLROnPlateau,ModelCheckpoint
from keras.utils import to_categorical
import tensorflow as tf
from keras import backend as K
import keras
from keras import regularizers
from collections import Counter
from sklearn.metrics import confusion_matrix

Using TensorFlow backend.


In [3]:
def multi_weighted_logloss(y_ohe, y_p):
    """
    Weighted multi-class log loss for the PLAsTiCC challenge.

    Adapted from olivier's kernel (https://www.kaggle.com/ogrellier).

    The class-weight table is selected from the number of columns of
    ``y_ohe``: 14 columns use the full class list, 5 the in-galaxy subset
    and 9 the out-of-galaxy subset. Weights are applied in ascending class
    id order, so the columns of ``y_ohe`` / ``y_p`` must follow that order.

    Parameters
    ----------
    y_ohe : numpy array, one row per sample
        One-hot encoded true labels.
    y_p : numpy array with the same shape as ``y_ohe``
        Predicted class probabilities.

    Returns
    -------
    float
        The weighted log loss. A class without any positive row in
        ``y_ohe`` produces a 0/0 term, so the result is NaN in that case.

    Raises
    ------
    ValueError
        If the number of columns is not 5, 9 or 14.
    """
    weights_by_n_classes = {
        # All classes
        14: {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1},
        # Galaxy case
        5: {6: 1, 16: 1, 53: 1, 65: 1, 92: 1},
        # Out of galaxy case
        9: {15: 2, 42: 1, 52: 1, 62: 1, 64: 2, 67: 1, 88: 1, 90: 1, 95: 1},
    }

    n_classes = len(y_ohe[0])
    print('Number of classes :', n_classes)

    if n_classes not in weights_by_n_classes:
        raise ValueError(
            'Unsupported number of classes: {} (expected one of {})'.format(
                n_classes, sorted(weights_by_n_classes)))
    class_weight = weights_by_n_classes[n_classes]

    # Limit y_p to [1e-15, 1 - 1e-15] so the log below stays finite
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1-1e-15)
    # Transform to log
    y_p_log = np.log(y_p)
    # Per-class sum of the log-probabilities assigned to the true class.
    # Class 99 is excluded since it does not appear in the training set.
    y_log_ones = np.sum(y_ohe * y_p_log, axis=0)
    # Get the number of positives for each class
    nb_pos = y_ohe.sum(axis=0).astype(float)
    # Weight average and divide by the number of positives
    class_arr = np.array([class_weight[k] for k in sorted(class_weight.keys())])
    y_w = y_log_ones * class_arr / nb_pos
    loss = - np.sum(y_w) / np.sum(class_arr)
    return loss

## 特征工程

In [None]:
def set_df(arr, col_names):
    """Wrap ``arr`` in a DataFrame and label its columns with ``col_names``."""
    frame = pd.DataFrame(arr)
    # Assign the labels after construction so they replace whatever default
    # column labels the constructor produced.
    frame.columns = col_names
    return frame

def get_new_columns(aggs):
    """
    Build flat column names for an aggregation spec.

    ``aggs`` maps a column name to a list of aggregation names; the result
    lists ``'<column>_<aggregation>'`` in the dict's iteration order.
    """
    names = []
    for column, funcs in aggs.items():
        for func in funcs:
            names.append(column + '_' + func)
    return names

def agg_by_flux_feats(df):
    """
    Aggregate the per-observation flux columns to one row per ``object_id``.

    Side effect: the columns ``flux_ratio``, ``flux_ratio_sq`` and
    ``flux_by_flux_ratio_sq`` are written onto ``df`` itself.

    Returns the grouped frame with columns named ``<column>_<aggregation>``
    plus the second-order features added by
    ``add_flux_second_order_features_to_agg``.
    """
    ratio = df['flux'] / df['flux_err']
    df['flux_ratio'] = ratio
    df['flux_ratio_sq'] = np.power(ratio, 2.0)
    df['flux_by_flux_ratio_sq'] = df['flux'] * df['flux_ratio_sq']

    # Only these aggregations are active; the squared-ratio columns above
    # are computed but not aggregated.
    aggs = {
        'flux': ['min', 'max', 'mean', 'median', 'std', 'skew'],
        'flux_err': ['min', 'max', 'mean', 'median', 'std'],
        'flux_ratio': ['min', 'max', 'mean', 'std'],
        'detected': ['mean'],
    }

    grouped = df.groupby('object_id').agg(aggs)
    # Flatten the (column, aggregation) labels into '<column>_<aggregation>'
    grouped.columns = get_new_columns(aggs)

    return add_flux_second_order_features_to_agg(df=grouped)

def add_flux_second_order_features_to_agg(df):
    """
    Add flux-range features computed from existing aggregate columns.

    Reads ``flux_max``, ``flux_min`` and ``flux_mean``; writes ``flux_diff``
    (max - min) and ``flux_dif2`` (range divided by the mean) onto ``df``
    in place and returns the same object.
    """
    flux_range = df['flux_max'] - df['flux_min']
    df['flux_diff'] = flux_range
    df['flux_dif2'] = flux_range / df['flux_mean']
    return df


def get_by_galaxy(df):
    """
    Split ``df`` on its ``in_galaxy`` flag.

    Prints the number of distinct ``object_id`` values on each side as a
    sanity check and returns ``(rows with in_galaxy == 1, rows with
    in_galaxy == 0)``.
    """
    inside_mask = df['in_galaxy'] == 1
    outside_mask = df['in_galaxy'] == 0

    df_in_gal = df[inside_mask]
    n_inside = len(df_in_gal['object_id'].unique())
    print('Number of objects in galaxy :',n_inside)

    df_out_gal = df[outside_mask]
    n_outside = len(df_out_gal['object_id'].unique())
    print('Number of objects out of galaxy :',n_outside)

    print('Just to check, sum of objects :', n_inside + n_outside)
    print('Total number should be', len(df['object_id'].unique()))
    return df_in_gal, df_out_gal


def _add_interval_flux_feats(int_n, df, db, flux_aggs, log_diffs):
    """
    Shared implementation of the two time-interval feature builders.

    The MJD range of ``db`` is cut into ``int_n`` equal-width windows. For
    each window, per-object flux statistics (``flux_aggs``) and per-object,
    per-passband flux skew are merged onto ``df`` by ``object_id``. After the
    loop, pairwise differences between windows are added for the max, min
    and mean statistics.

    Parameters
    ----------
    int_n : int
        Number of windows.
    df : DataFrame
        One row per object; must have an ``object_id`` column.
    db : DataFrame
        Observations with ``object_id``, ``mjd``, ``flux``, ``passband``.
    flux_aggs : sequence of str
        Aggregation names applied to flux inside each window; must include
        'max', 'min' and 'mean' for the pairwise differences.
    log_diffs : bool
        Whether to print the name of every difference feature added.

    Returns
    -------
    DataFrame
        A new frame (``df`` is not modified) with the extra columns.
    """
    print('Number of Intervals :', int_n)
    t_min = db.mjd.min()
    t_max = db.mjd.max()
    print('Min and Max MJD time : {}, {}'.format(t_min, t_max))
    int_dur = (t_max - t_min)/int_n

    for i in range(int_n):
        in_window = db.mjd >= (t_min + i*int_dur)
        if i < int_n - 1:
            in_window = in_window & (db.mjd < (t_min + (i+1)*int_dur))
        # The last window has no strict upper bound: with `<` every record
        # observed exactly at t_max would fall outside all windows.
        db_fil = db.loc[in_window, ['object_id', 'flux', 'passband']]
        print('Interval #{}, record quantity: {}'.format(i+1, db_fil.shape[0]))

        # interval_#_flux_<agg>: a list of aggregations followed by a rename
        # works on every pandas version (dict-renaming was removed in 1.0).
        stats = db_fil.groupby('object_id')['flux'].agg(list(flux_aggs))
        stats.columns = ['interval_{}_flux_{}'.format(i+1, agg) for agg in flux_aggs]
        # reset_index() yields object_id as a plain column, avoiding the
        # "both an index level and a column label" merge ambiguity.
        df = df.merge(stats.reset_index(), on='object_id', how='left')

        # interval_#_band_#_flux_skew (kept to see how the neural network
        # behaves with a large number of features)
        stats = db_fil.groupby(['object_id','passband'])['flux'].skew().unstack()
        stats.columns = ['interval_{}_band_{}_flux_skew'.format(i+1, str(col)) for col in stats.columns.tolist()]
        print('Feats added:',stats.columns.tolist())
        df = df.merge(stats.reset_index(), on='object_id', how='left').fillna(0)

    # Pairwise differences between the interval_#_flux_<key> columns.
    # The pair direction is decided by string comparison of column names.
    for key in ['max', 'min', 'mean']:
        key_cols = ['interval_{}_flux_{}'.format(i, key) for i in range(1, int_n+1)]
        for col in key_cols:
            for sub_col in key_cols:
                if sub_col < col:
                    diff_name = '{}_minus_{}'.format(col, sub_col)
                    df[diff_name] = df[col] - df[sub_col]
                    if log_diffs:
                        print('Feature added:', diff_name)

    print('Dimension of data after adding features relevant to time intervals', df.shape)

    return df


def add_feats_within_time_interval_out(int_n, df, db):
    """
    Add time-interval flux features using the reduced statistic set.

    Per window the flux mean, min and max are computed (no std / skew),
    plus the per-band skew and the pairwise window differences; each
    difference feature name is printed.
    """
    return _add_interval_flux_feats(int_n, df, db,
                                    flux_aggs=('mean', 'min', 'max'),
                                    log_diffs=True)


def add_feats_within_time_interval(int_n, df, db):
    """
    Add time-interval flux features using the full statistic set.

    Per window the flux mean, std, min, max and skew are computed, plus the
    per-band skew and the pairwise window differences.
    """
    return _add_interval_flux_feats(int_n, df, db,
                                    flux_aggs=('mean', 'std', 'min', 'max', 'skew'),
                                    log_diffs=False)




def _band_flux_stat(db, stat, with_pairwise_diffs=False):
    """
    Compute one flux statistic per (object_id, passband) in wide form.

    Returns a frame with an ``object_id`` column and one
    ``band_<passband>_flux_<stat>`` column per passband. When
    ``with_pairwise_diffs`` is true, ``<a>_minus_<b>`` columns are added for
    every pair of band columns (direction chosen by string comparison of
    the column names).
    """
    stats = db.groupby(['object_id','passband'])['flux'].agg(stat).unstack()
    stats.columns = ['band_' + str(col) + '_flux_' + stat for col in stats.columns.tolist()]

    if with_pairwise_diffs:
        base_cols = stats.columns.tolist()
        for col in base_cols:
            for sub_col in base_cols:
                if sub_col < col:
                    stats['{}_minus_{}'.format(col, sub_col)] = stats[col] - stats[sub_col]

    # reset_index() exposes object_id as a plain column; copying the index
    # into a same-named column makes the merge key ambiguous in pandas.
    return stats.reset_index()


def add_band_feats(df, db):
    """
    Add per-passband flux features to the per-object frame ``df``.

    For each passband found in ``db`` this adds the flux mean, std, skew,
    max and min, pairwise band differences for the mean and the max, and
    the per-band range (``diff1``) and range-over-mean (``diff2``).
    Missing values are filled with 0 after every merge, so ``diff2`` can be
    infinite or NaN where a band mean is 0.

    Parameters
    ----------
    df : DataFrame
        One row per object; must have an ``object_id`` column.
    db : DataFrame
        Observations with ``object_id``, ``passband`` and ``flux``.

    Returns
    -------
    DataFrame
        A new frame with the added columns.
    """
    # (statistic, whether to add pairwise band differences)
    stat_specs = (
        ('mean', True),
        ('std', False),
        ('skew', False),
        ('max', True),
        ('min', False),
    )
    for stat, with_diffs in stat_specs:
        print('Adding feats for the flux {} per band...'.format(stat))
        stats = _band_flux_stat(db, stat, with_pairwise_diffs=with_diffs)
        df = df.merge(stats, on='object_id', how='left').fillna(0)

    # Per-band range features; iterate over the passbands actually present
    # instead of assuming exactly bands 0..5 exist.
    print('Adding feats for the flux (max-min)/mean per band...')
    for band_n in sorted(db['passband'].dropna().unique()):
        prefix = 'band_' + str(band_n) + '_flux_'
        df[prefix + 'diff1'] = df[prefix + 'max'] - df[prefix + 'min']
        df[prefix + 'diff2'] = df[prefix + 'diff1']/df[prefix + 'mean']

    print('Dimension of data after adding features relevant to bands', df.shape)

    return df

    

    
    

def add_flux_second_order_features_to_agg(df):
    """
    Derive flux-range features from the aggregated flux columns.

    Writes ``flux_diff`` (``flux_max - flux_min``) and ``flux_dif2``
    (that range divided by ``flux_mean``) onto ``df`` in place and returns
    ``df``.
    """
    # NOTE(review): a function with this exact name is also defined earlier
    # in this file; being later, this definition is the one in effect once
    # the whole file has run.
    df['flux_diff'] = df['flux_max'].sub(df['flux_min'])
    df['flux_dif2'] = df['flux_max'].sub(df['flux_min']).div(df['flux_mean'])
    return df

def add_photo_feats(df):
    """
    Add ``hostgal_photoz_ratio`` (``hostgal_photoz / hostgal_photoz_err``).

    The column is written onto ``df`` in place and ``df`` is returned.
    """
    photoz = df['hostgal_photoz']
    photoz_err = df['hostgal_photoz_err']
    df['hostgal_photoz_ratio'] = photoz / photoz_err
    return df



def fabriquer_feat(db, meta):
    """
    Build the per-object feature tables for the two models.

    Objects are split on an ``in_galaxy`` flag derived from ``distmod``
    (missing ``distmod`` is treated as 0, and ``distmod == 0`` is flagged
    in-galaxy). Flux aggregates are computed for all objects, merged with
    the metadata, split by the flag, and then enriched with band and
    time-interval features; out-of-galaxy objects additionally get the
    ``hostgal_photoz`` ratio.

    Side effects: ``meta`` gets its ``distmod`` NaNs filled and an
    ``in_galaxy`` column added, and ``agg_by_flux_feats`` adds helper
    columns to ``db``.

    Parameters
    ----------
    db : DataFrame
        Time-series observations keyed by ``object_id``.
    meta : DataFrame
        Per-object metadata with ``object_id`` and ``distmod``.

    Returns
    -------
    (DataFrame, DataFrame)
        ``(df_in_gal, df_out_gal)``, one row per object each.
    """
    # The metadata provides the basis for the galaxy split: add a flag
    # telling whether the object belongs to the galaxy.
    # Assign the filled column back explicitly; a chained
    # `meta.distmod.fillna(..., inplace=True)` is not guaranteed to update
    # `meta` under pandas Copy-on-Write.
    meta['distmod'] = meta['distmod'].fillna(0)
    meta['in_galaxy'] = 0
    meta.loc[(meta.distmod == 0), 'in_galaxy'] = 1

    # Merge time series and metadata: one row per observation
    db_meta = db.merge(meta, on='object_id', how='left')
    print('Dimension of merge data for MJD relevant data and META data ', db_meta.shape)

    # Split the merged time series by galaxy membership
    db_in_gal, db_out_gal = get_by_galaxy(db_meta)
    print('Dimension of merge data for that in galaxy and that out of galaxy ', db_in_gal.shape, db_out_gal.shape)

    # Basic flux aggregates
    agg_df = agg_by_flux_feats(db)
    print('Dimension of aggregated data on flux features', agg_df.shape)

    # Merge aggregates and metadata: one row per object_id
    agg_df_meta = agg_df.merge(meta, on='object_id', how='left')
    print('Dimension of merge data for Object relevant data and META data', agg_df_meta.shape)

    # Split the per-object data by galaxy membership
    df_in_gal, df_out_gal = get_by_galaxy(agg_df_meta)
    print('Dimension of merge data for that in galaxy and that out of galaxy ', df_in_gal.shape, df_out_gal.shape)

    if df_in_gal.shape[0] >0:
        # Feature extraction for in-galaxy objects
        print('Features extraction for objects in the Galaxy begins...')

        # Band-related features
        df_in_gal = add_band_feats(df_in_gal, db_in_gal)

        # Statistics over MJD time intervals
        df_in_gal = add_feats_within_time_interval(6, df_in_gal, db_in_gal)

    if df_out_gal.shape[0] >0:
        # Feature extraction for out-of-galaxy objects
        print('Features extraction for objects out of the Galaxy begins...')

        # hostgal_photoz-related features
        df_out_gal = add_photo_feats(df_out_gal)

        # Band-related features
        df_out_gal = add_band_feats(df_out_gal, db_out_gal)

        # Statistics over MJD time intervals
        df_out_gal = add_feats_within_time_interval_out(6, df_out_gal, db_out_gal)

    return df_in_gal, df_out_gal

## 神经网络

In [4]:
def plot_loss_acc(history):
    """
    Plot training vs validation loss and accuracy curves.

    ``history`` is the object returned by ``model.fit``; its ``history``
    dict is read. The first epoch is skipped in both plots. Two figures
    are shown, one for the loss and one for the accuracy.
    """
    logs = history.history
    # The accuracy metric is logged as 'acc' by some Keras versions and as
    # 'accuracy' by others; use whichever key is present.
    acc_key = 'acc' if 'acc' in logs else 'accuracy'

    # (history key, figure title, y-axis label)
    curves = (
        ('loss', 'model loss', 'val_loss'),
        (acc_key, 'model Accuracy', 'val_acc'),
    )
    for key, title, ylabel in curves:
        plt.plot(logs[key][1:])
        plt.plot(logs['val_' + key][1:])
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('epoch')
        plt.legend(['train','Validation'], loc='upper left')
        plt.show()
    

def to_cat(y):
    """
    One-hot encode the labels in ``y``.

    Labels are mapped to 0..k-1 following the sorted order of the distinct
    values (``np.unique``), then passed to Keras ``to_categorical``. Column
    ``i`` of the result therefore corresponds to the i-th smallest label.
    """
    class_map = {val: i for i, val in enumerate(np.unique(y))}
    y_map = np.array([class_map[val] for val in y])
    return to_categorical(y_map)
    
    
    
def train_by_nn(full_train, y):
    """
    Train one dense softmax network per CV fold and collect OOF predictions.

    Features are standardised with a ``StandardScaler`` fitted on all of
    ``full_train``. Labels are mapped to 0..k-1 in sorted order and one-hot
    encoded. The loss is a log loss in which each class term is divided by
    that class's frequency in ``y``.

    NOTE: the fold splitter is read from a module-level variable ``folds``
    that must exist before this function is called.

    Parameters
    ----------
    full_train : DataFrame
        Feature table, one row per object.
    y : pandas Series
        Class labels aligned with ``full_train``.

    Returns
    -------
    (ss, y_categorical, oof_preds, clfs)
        The fitted scaler, the one-hot labels, the out-of-fold predicted
        probabilities and the list of per-fold models.
    """
    ss = StandardScaler()
    full_train_ss = ss.fit_transform(full_train.copy())

    classes = sorted(y.unique())

    unique_y = np.unique(y)
    class_map = {val: i for i, val in enumerate(unique_y)}
    y_map = np.array([class_map[val] for val in y])
    y_categorical = to_categorical(y_map)

    # Relative frequency of each class, used to re-weight the loss
    y_count = Counter(y_map)
    wtable = np.zeros((len(unique_y),))
    for i in range(len(unique_y)):
        wtable[i] = y_count[i]/y_map.shape[0]

    def mywloss(y_true, y_pred):
        # Clip to keep the log finite, average the log-likelihood per class
        # over the batch, then divide by the class frequency.
        yc=tf.clip_by_value(y_pred,1e-15,1-1e-15)
        loss=-(tf.reduce_mean(tf.reduce_mean(y_true*tf.log(yc),axis=0)/wtable))
        return loss

    def build_model(dropout_rate=0.25,activation='relu'):
        start_neurons = 512
        # create model
        model = Sequential()
        model.add(Dense(start_neurons, input_dim=full_train_ss.shape[1], activation=activation))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))

        model.add(Dense(start_neurons//2,activation=activation))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))

        model.add(Dense(start_neurons//4,activation=activation))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))

        model.add(Dense(start_neurons//8,activation=activation))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate/2))

        model.add(Dense(len(classes), activation='softmax'))
        return model

    clfs = []
    oof_preds = np.zeros((len(full_train_ss), len(classes)))
    epochs = 600
    batch_size = 100
    for fold_, (trn_, val_) in enumerate(folds.split(y_map, y_map)):
        x_train, y_train = full_train_ss[trn_], y_categorical[trn_]
        x_valid, y_valid = full_train_ss[val_], y_categorical[val_]

        # Create a fresh checkpoint for every fold. A single shared callback
        # keeps its best-so-far score between fits, so a later fold that
        # never beats an earlier fold's score would not save anything and
        # the weights reloaded below would belong to the earlier fold.
        checkPoint = ModelCheckpoint("./keras.model",monitor='val_loss',mode = 'min', save_best_only=True, verbose=0)

        model = build_model(dropout_rate=0.5,activation='tanh')
        model.compile(loss=mywloss, optimizer='adam', metrics=['accuracy'])
        history = model.fit(x_train, y_train,
                        validation_data=(x_valid, y_valid),
                        epochs=epochs,
                        batch_size=batch_size,shuffle=True,verbose=0,callbacks=[checkPoint])

        plot_loss_acc(history)

        print('Loading Best Model')
        model.load_weights('./keras.model')
        # Predicted probabilities for each class, computed once and reused
        # for both the OOF matrix and the fold score.
        val_preds = model.predict(x_valid,batch_size=batch_size)
        oof_preds[val_, :] = val_preds
        print(multi_weighted_logloss(y_valid, val_preds))
        clfs.append(model)

    return ss, y_categorical, oof_preds, clfs

### 结果分析

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print the confusion matrix, draw it as a heat map and save it as a PDF.

    With ``normalize=True`` every row is divided by its sum before printing
    and plotting. The figure is written to ``../feat/`` with a timestamped
    file name.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    print(cm)

    fig = plt.figure(figsize=(20,10))

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Cell annotations: light text on dark cells, dark text on light cells
    cell_fmt = '.2f' if normalize else 'd'
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            text_color = "white" if cm[row, col] > half_max else "black"
            plt.text(col, row, format(cm[row, col], cell_fmt),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

    stamp = datetime.datetime.now().strftime('%m%d_%H%M')
    fig.savefig(r'../feat/confusion_matrix_{}.pdf'.format(stamp))
    
    

def get_confusion_matrix(y, preds):
    """
    Compute and plot the normalized confusion matrix of ``preds`` against ``y``.

    ``y`` holds the raw class labels; they are mapped to 0..k-1 in sorted
    order and compared with the arg-max column of ``preds``. The tick labels
    are the column names of ``../input/sample_submission.csv`` without the
    first and the last column.

    Side effect: sets NumPy's global print precision to 2.
    """
    class_map = {val: i for i, val in enumerate(np.unique(y))}
    y_map = np.array([class_map[val] for val in y])

    cnf_matrix = confusion_matrix(y_map, np.argmax(preds, axis=-1))
    np.set_printoptions(precision=2)

    # Only the header row is needed, so do not load any data rows
    sample_sub = pd.read_csv('../input/sample_submission.csv', nrows=0)
    class_names = list(sample_sub.columns[1:-1])

    # plot_confusion_matrix creates its own figure, so none is opened here
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                          title='Confusion matrix')

## 主程序

### 数据准备

In [5]:
# Make sure automatic garbage collection is on before loading the data
gc.enable()

# Training observations; passed to fabriquer_feat as the time-series table
train = pd.read_csv('../input/training_set.csv')

# Per-object metadata for the training set
meta_train = pd.read_csv('../input/training_set_metadata.csv')

### 特征提取

In [6]:
# 去除无效特征
del meta_train['hostgal_specz']
print('Feature hostgal_specz is removed')

full_train_in_gal, full_train_out_gal = fabriquer_feat(train, meta_train)

for df in [full_train_in_gal, full_train_out_gal]:
    del df['object_id']
    df_mean = df.mean(axis=0)
    df.fillna(df_mean, inplace=True)

del meta_train， train

Feature hostgal_specz is removed


NameError: name 'fabriquer_feat' is not defined

### 训练

In [None]:
# Fold splitter; train_by_nn reads this module-level variable
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# --- In-galaxy model ---
y_in_gal = full_train_in_gal['target']

train_in_gal = full_train_in_gal.copy()

del train_in_gal['target']

val_score_list = []
clf_list = []

ss_in, y_categorical_in_gal, oof_preds_in_gal, clf_in = train_by_nn(train_in_gal, y_in_gal)

score_in_gal = multi_weighted_logloss(y_categorical_in_gal,oof_preds_in_gal)
print('MULTI WEIGHTED LOG LOSS : %.5f ' % score_in_gal)
val_score_list.append(score_in_gal)
clf_list.append(clf_in)


# --- Out-of-galaxy model ---
y_out_gal = full_train_out_gal['target']

train_out_gal = full_train_out_gal.copy()

del train_out_gal['target']

ss_out, y_categorical_out_gal, oof_preds_out_gal, clf_out = train_by_nn(train_out_gal, y_out_gal)

score_out_gal = multi_weighted_logloss(y_categorical_out_gal,oof_preds_out_gal)
print('MULTI WEIGHTED LOG LOSS : %.5f ' % score_out_gal)
val_score_list.append(score_out_gal)
clf_list.append(clf_out)


# --- Combined score over both models ---
all_y = np.concatenate((y_in_gal.values, y_out_gal.values), axis=0)

in_classes = [6, 16, 53, 65, 92]
out_classes = [15, 42, 52, 62, 64, 67, 88, 90, 95]

in_df = set_df(oof_preds_in_gal, in_classes)
out_df = set_df(oof_preds_out_gal, out_classes)
in_out_df = pd.concat([in_df, out_df], axis=0).fillna(0)
# Put the columns in ascending class order so they line up with the one-hot
# columns produced by to_cat(all_y); concat does not guarantee that order.
in_out_df = in_out_df[sorted(in_out_df.columns)]


# multi_weighted_logloss_OLDVERSION is not defined anywhere in this file;
# run the cross-check only when it has been provided elsewhere.
if 'multi_weighted_logloss_OLDVERSION' in globals():
    print('Just double check:', multi_weighted_logloss_OLDVERSION(y_true=all_y, y_preds=in_out_df.values))

all_y_cat = to_cat(all_y)

tot_score = multi_weighted_logloss(all_y_cat, in_out_df.values)
print('MULTI WEIGHTED LOG LOSS : %.5f ' % tot_score)

val_score_list.append(tot_score)



score_tab = pd.DataFrame({'Model':['Galaxy_Model', 'Extragalaxy_Model','Bi_Model'], 'Score':val_score_list})
print(score_tab)
score_tab.to_csv(r'../feat/validation_scores_{}.csv'.format(datetime.datetime.now().strftime('%m%d_%H%M')), index=False)

### 结果分析

In [None]:
# Plot the normalized confusion matrix of the combined out-of-fold predictions
get_confusion_matrix(all_y, in_out_df.values)   