In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
# Directory that holds the input CSV files; the data paths below are built from it.
datadir = Path('../data/atmacup5')

# Read the three tables in one pass, in the order train / test / fitting.
train_df, test_df, fitting_df = (
    pd.read_csv(datadir / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'fitting.csv')
)

In [3]:
def read_fitting_data():
    """Read ``fitting.csv`` from ``datadir`` and return it as a DataFrame.

    The file is re-read from disk on every call.
    """
    fitting_path = datadir / 'fitting.csv'
    return pd.read_csv(fitting_path)

In [4]:
# Map each raw spectrum file name to a fixed-length (512,) float32 vector.
wave_data = {}

files = datadir.joinpath('spectrum_raw').glob('*.dat')
for p in tqdm(files):
    # Files are read as tab-separated text without a header row.
    _df = pd.read_csv(p, sep='\t', header=None)
    # Zero-initialised buffer: positions past the end of the file stay 0.
    x = np.zeros(512, dtype=np.float32)
    # Only the second column (index 1) is stored.
    x[:_df.shape[0]] = _df.iloc[:, 1].values
    wave_data[p.name] = x

14388it [00:32, 444.04it/s]


In [5]:
# Per-column missing-value counts for each loaded table (train, test, fitting).
for _frame in (train_df, test_df, fitting_df):
    print(_frame.isnull().sum())

spectrum_id          0
spectrum_filename    0
chip_id              0
exc_wl               0
layout_a             0
layout_x             0
layout_y             0
pos_x                0
target               0
dtype: int64
spectrum_id          0
spectrum_filename    0
chip_id              0
exc_wl               0
layout_a             0
layout_x             0
layout_y             0
pos_x                0
dtype: int64
spectrum_id    0
params0        0
params1        0
params2        0
params3        0
params4        0
params5        0
params6        0
rms            0
beta           0
dtype: int64


In [6]:
# Preview the first rows of the training table.
train_df.head()

Unnamed: 0,spectrum_id,spectrum_filename,chip_id,exc_wl,layout_a,layout_x,layout_y,pos_x,target
0,000da4633378740f1ee8,b2e223339f4abce9b400.dat,79ad4647da6de6425abf,850,2,36,140,1313.081,0
1,000ed1a5a9fe0ad2b7dd,e2f150a503244145e7ce.dat,79ad4647da6de6425abf,780,3,0,168,159.415,0
2,0016e3322c4ce0700f9a,3d58b7ccaee157979cf0.dat,c695a1e61e002b34e556,780,1,34,29,-610.7688,0
3,00256bd0f8c6cf5f59c8,ed3641184d3b7c0ae703.dat,c695a1e61e002b34e556,780,2,32,139,1214.618,0
4,003483ee5ae313d37590,4c63418d39f86dfab9bb.dat,c695a1e61e002b34e556,780,0,45,85,-257.6161,0


In [7]:
# List the column names of the training table.
train_df.columns

Index(['spectrum_id', 'spectrum_filename', 'chip_id', 'exc_wl', 'layout_a',
       'layout_x', 'layout_y', 'pos_x', 'target'],
      dtype='object')

In [8]:
# List the column names of the fitting table.
fitting_df.columns

Index(['spectrum_id', 'params0', 'params1', 'params2', 'params3', 'params4',
       'params5', 'params6', 'rms', 'beta'],
      dtype='object')

In [9]:
# Display the training table (the notebook truncates long frames in the rendered output).
train_df

Unnamed: 0,spectrum_id,spectrum_filename,chip_id,exc_wl,layout_a,layout_x,layout_y,pos_x,target
0,000da4633378740f1ee8,b2e223339f4abce9b400.dat,79ad4647da6de6425abf,850,2,36,140,1313.0810,0
1,000ed1a5a9fe0ad2b7dd,e2f150a503244145e7ce.dat,79ad4647da6de6425abf,780,3,0,168,159.4150,0
2,0016e3322c4ce0700f9a,3d58b7ccaee157979cf0.dat,c695a1e61e002b34e556,780,1,34,29,-610.7688,0
3,00256bd0f8c6cf5f59c8,ed3641184d3b7c0ae703.dat,c695a1e61e002b34e556,780,2,32,139,1214.6180,0
4,003483ee5ae313d37590,4c63418d39f86dfab9bb.dat,c695a1e61e002b34e556,780,0,45,85,-257.6161,0
...,...,...,...,...,...,...,...,...,...
7431,ffcc2d0e80130bcd1f66,677582af16aeb72c01df.dat,0b9dbf13f938efd5717f,850,0,8,56,-1441.3620,0
7432,ffd86d57b9d44f10c7d0,6f23369fb8e0d1fde118.dat,84b788fdc5e779f8a0df,850,3,12,2,543.2881,0
7433,ffdc78c1ca0a8c5a689f,825df3fcf8ce0570f0be.dat,6718e7f83c824b1e436d,780,0,41,102,-383.0251,0
7434,ffe1a53afdbab5ebddeb,5a2bd19c41cb6da70b33.dat,84b788fdc5e779f8a0df,850,3,3,124,259.5428,0


In [None]:
def create_main_table_fearue(input_df):
    """Return a copy of the main-table columns that are used directly as features.

    Args:
        input_df: DataFrame containing at least the columns ``exc_wl``,
            ``layout_a``, ``layout_x``, ``layout_y`` and ``pos_x``.

    Returns:
        A new DataFrame with exactly those five columns, in that order.
        It is a copy, so edits to it do not affect ``input_df``.
    """
    selected = ['exc_wl', 'layout_a', 'layout_x', 'layout_y', 'pos_x']
    subset = input_df.loc[:, selected]
    return subset.copy()

In [None]:
# Sanity check: the main-table feature block keeps one row per training sample.
assert len(train_df) == len(create_main_table_fearue(train_df))

In [None]:
def create_fitting_feature(input_df):
    """Look up the fitting-table values for every row of ``input_df``.

    Args:
        input_df: DataFrame with a ``spectrum_id`` column.

    Returns:
        DataFrame with the columns ``params0`` .. ``params6``, ``rms`` and
        ``beta``, left-joined on ``spectrum_id``; the key column itself is
        dropped from the result.
    """
    fit_cols = ['spectrum_id'] + [f'params{i}' for i in range(7)] + ['rms', 'beta']

    fitting_table = read_fitting_data()
    # Left join so rows without a match in the fitting table are kept (values become NaN).
    merged = pd.merge(
        left=input_df[['spectrum_id']],
        right=fitting_table[fit_cols],
        how='left',
        on='spectrum_id',
    )
    return merged.drop(columns=['spectrum_id'])

In [None]:
# The fitting features must keep one row per training sample ...
assert len(create_fitting_feature(train_df)) == len(train_df)
# ... and produce the same number of columns for train and test.
assert len(create_fitting_feature(test_df).columns) == len(create_fitting_feature(train_df).columns)

In [None]:
def create_tsne_feature(input_df):
    """Look up the precomputed t-SNE projection for every row of ``input_df``.

    Args:
        input_df: DataFrame with a ``spectrum_id`` column.

    Returns:
        DataFrame with the columns ``project_0`` and ``project_1``, left-joined
        on ``spectrum_id``; the key column itself is dropped from the result.
    """
    # The join key has to be selected together with the projection columns;
    # without it the two frames share no column and pandas raises MergeError.
    # NOTE(review): assumes project_tsne.csv carries a spectrum_id column, as
    # fitting.csv does -- confirm against the file.
    cols = ['spectrum_id', 'project_0', 'project_1']

    tsne_df = pd.read_csv(datadir / 'project_tsne.csv')
    # Merge only the key from input_df so no main-table column (e.g. the
    # target) ends up in the returned feature block.
    out_df = pd.merge(input_df[['spectrum_id']], tsne_df[cols], on='spectrum_id', how='left')
    out_df = out_df.drop(columns=['spectrum_id'])
    return out_df

In [None]:
# NOTE(review): this cell repeats the create_fitting_feature checks and never
# calls create_tsne_feature -- presumably that was the intended target; confirm.
assert create_fitting_feature(train_df).shape[0] == train_df.shape[0]
assert create_fitting_feature(test_df).shape[1] == create_fitting_feature(train_df).shape[1]

In [None]:
class FeatureTransformer:
    """Apply a sequence of feature functions to a table and join their outputs column-wise."""

    def __init__(self, processors):
        """Store the feature functions.

        Args:
            processors: iterable of callables; each takes the input DataFrame
                and returns a DataFrame of features.
        """
        self.processors = processors

    def to_feature(self, input_df):
        """Run every processor on ``input_df`` and concatenate the results.

        Args:
            input_df: DataFrame handed unchanged to each processor.

        Returns:
            DataFrame with the processors' outputs placed side by side
            (``axis=1``) in processor order; an empty DataFrame when there
            are no processors.
        """
        frames = []
        for func in self.processors:
            # functools.partial objects and callable instances have no
            # __name__; fall back to their repr instead of raising.
            name = getattr(func, '__name__', repr(func))
            print('start {}'.format(name))
            frames.append(func(input_df))

        if not frames:
            return pd.DataFrame()
        # Concatenate once instead of growing a frame inside the loop.
        return pd.concat(frames, axis=1)

In [None]:
# Feature pipeline: fitting-table values first, then the main-table columns.
feature_transformer = FeatureTransformer(
    processors=[
        create_fitting_feature,
        create_main_table_fearue,
    ]
)

In [None]:
# Build the training feature matrix by running every processor on the training table.
feat_train_df = feature_transformer.to_feature(train_df)

In [None]:
# Display the assembled training features.
feat_train_df

In [None]:
# Plain arrays for training: the fold index arrays are applied positionally.
X = feat_train_df.values
y = train_df['target'].values

# Keyword arguments handed to the LightGBM classifier.
params = {
    'objective': 'binary',
    'learning_rate': 0.05,
    'reg_lambda': 1.,
    # Upper bound on boosting rounds; training uses early stopping on the validation fold.
    'n_estimators': 10000,
    'colsample_bytree': .7
}

In [None]:
from sklearn.model_selection import StratifiedKFold

# Five shuffled, stratified folds with a fixed seed so the split is reproducible.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=71)
# Materialise the (train_idx, valid_idx) pairs so the same split can be reused.
cv = [*fold.split(X, y)]

In [None]:
from sklearn.metrics import average_precision_score
import lightgbm as lgbm

def pr_auc(y_true, y_pred):
    """Average-precision metric in the tuple form passed to ``eval_metric``.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores.

    Returns:
        ``("pr_auc", score, True)``. The last element is the
        higher-is-better flag of LightGBM's custom-metric protocol.
    """
    metric_name = "pr_auc"
    higher_is_better = True
    return metric_name, average_precision_score(y_true, y_pred), higher_is_better

def fit_lgbm(X, y, cv, params: dict=None, verbose=10):
    """Train one LightGBM classifier per CV fold and collect out-of-fold predictions.

    Args:
        X: feature array; indexed positionally with the fold index arrays.
        y: target array aligned with ``X``.
        cv: iterable of ``(idx_train, idx_valid)`` index pairs.
        params: keyword arguments for ``lgbm.LGBMClassifier``; ``None`` means
            the library defaults.
        verbose: forwarded to ``fit`` as its ``verbose`` argument.

    Returns:
        ``(oof_pred, models)``: ``oof_pred`` holds, for every sample, the
        positive-class probability predicted by the model of the fold in which
        that sample was held out; ``models`` is the list of fitted
        classifiers in fold order.
    """

    # Fall back to an empty dict when no parameters are given.
    if params is None:
        params = {}

    models = []
    # Zero array with one slot per training target.
    # It must be floating point: an integer buffer would truncate the
    # predicted probabilities. ``np.float`` was removed in NumPy 1.24, so the
    # builtin ``float`` (the same float64 dtype) is used instead.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv):
        # Split the training data into train / valid parts for this fold.
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = lgbm.LGBMClassifier(**params)
        # NOTE(review): ``early_stopping_rounds`` and ``verbose`` were removed
        # from ``fit`` in LightGBM 4.0; move to
        # ``callbacks=[lgbm.early_stopping(...), lgbm.log_evaluation(...)]``
        # when upgrading.
        clf.fit(x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                early_stopping_rounds=100,
                eval_metric=pr_auc,
                verbose=verbose)

        pred_i = clf.predict_proba(x_valid)[:, 1]
        oof_pred[idx_valid] = pred_i
        models.append(clf)

        print(f'Fold {i} PR-AUC: {average_precision_score(y_valid, pred_i):.4f}')

    score = average_precision_score(y, oof_pred)
    # The backslash is escaped so the literal is valid; the printed text is unchanged.
    print('FINISHED \\ whole score: {:.4f}'.format(score))
    return oof_pred, models

In [None]:
# Train one model per fold; keep the out-of-fold predictions and the fitted models.
oof, models = fit_lgbm(X, y, cv=cv, params=params)

In [None]:
def visualize_importance(models, feat_train_df):
    """Plot the per-fold feature importances of the fitted models.

    Args:
        models: non-empty sequence of fitted models exposing
            ``feature_importances_``.
        feat_train_df: feature DataFrame whose columns name the features, in
            the same order as each model's importances.

    Returns:
        ``(fig, ax)`` of a boxen plot showing up to the 50 features with the
        largest importance summed over all models, in descending order.

    Raises:
        ValueError: if ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError('visualize_importance needs at least one fitted model')

    # One small frame per fold, concatenated once (no growing frame in a loop).
    feature_importance_df = pd.concat(
        [
            pd.DataFrame({
                'feature_importance': model.feature_importances_,
                'column': feat_train_df.columns,
                'fold': fold_no,
            })
            for fold_no, model in enumerate(models, start=1)
        ],
        axis=0,
        ignore_index=True,
    )

    # Rank features by total importance; select the column before summing so
    # the unrelated ``fold`` column is not aggregated.
    order = feature_importance_df.groupby('column')['feature_importance']\
        .sum()\
        .sort_values(ascending=False).index[:50]

    # Figure width grows with the number of plotted features.
    fig, ax = plt.subplots(figsize=(len(order) * .4, 7))
    sns.boxenplot(data=feature_importance_df, x='column', y='feature_importance', order=order, ax=ax, palette='viridis')
    ax.tick_params(axis='x', rotation=90)
    fig.tight_layout()
    return fig, ax

In [None]:
# Plot the feature importances of the per-fold models.
visualize_importance(models, feat_train_df)