In [3]:
import os, sys, glob
import numpy as np
import pandas as pd

import time
import datetime

from joblib import Parallel, delayed
from sklearn.metrics import f1_score, log_loss, classification_report
from sklearn.model_selection import StratifiedKFold

import lightgbm as lgb

%pylab inline
%matplotlib inline

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [4]:
def read_feat(path, test_mode=False):
    df = pd.read_csv(path)
    df = df.iloc[::-1]
    
    if test_mode:
        df_feat = [df['渔船ID'].iloc[0], df['type'].iloc[0]]
        df = df.drop(['type'], axis=1)
    else:
        df_feat = [df['渔船ID'].iloc[0]]
        
    df['time'] = df['time'].apply(lambda x: datetime.datetime.strptime(x, "%m%d %H:%M:%S"))
    df_diff = df.diff(1).iloc[1:]
    df_diff['time_seconds'] = df_diff['time'].dt.total_seconds()
    df_diff['dis'] = np.sqrt(df_diff['x']**2 + df_diff['y']**2)
    
    df_feat.append(df['time'].dt.day.nunique())
    df_feat.append(df['time'].dt.hour.min())
    df_feat.append(df['time'].dt.hour.max())
    df_feat.append(df['time'].dt.hour.value_counts().index[0])

    df_feat.append(df['速度'].min())
    df_feat.append(df['速度'].max())
    df_feat.append(df['速度'].mean())

    # df_feat.append(df_diff['time'].min())
    # df_feat.append(df_diff['time'].max())
    # df_feat.append(df_diff['time'].mean())
    
    df_feat.append(df_diff['速度'].min())
    df_feat.append(df_diff['速度'].max())
    df_feat.append(df_diff['速度'].mean())
    df_feat.append((df_diff['速度'] > 0).mean())
    df_feat.append((df_diff['速度'] == 0).mean())

    df_feat.append(df_diff['方向'].min())
    df_feat.append(df_diff['方向'].max())
    df_feat.append(df_diff['方向'].mean())
    df_feat.append((df_diff['方向'] > 0).mean())
    df_feat.append((df_diff['方向'] == 0).mean())

    df_feat.append((df_diff['x'].abs() / df_diff['time_seconds']).min())
    df_feat.append((df_diff['x'].abs() / df_diff['time_seconds']).max())
    df_feat.append((df_diff['x'].abs() / df_diff['time_seconds']).mean())
    df_feat.append((df_diff['x'] > 0).mean())
    df_feat.append((df_diff['x'] == 0).mean())

    df_feat.append((df_diff['y'].abs() / df_diff['time_seconds']).min())
    df_feat.append((df_diff['y'].abs() / df_diff['time_seconds']).max())
    df_feat.append((df_diff['y'].abs() / df_diff['time_seconds']).mean())
    df_feat.append((df_diff['y'] > 0).mean())
    df_feat.append((df_diff['y'] == 0).mean())
    
    df_feat.append(df_diff['dis'].min())
    df_feat.append(df_diff['dis'].max())
    df_feat.append(df_diff['dis'].mean())

    df_feat.append((df_diff['dis']/df_diff['time_seconds']).min())
    df_feat.append((df_diff['dis']/df_diff['time_seconds']).max())
    df_feat.append((df_diff['dis']/df_diff['time_seconds']).mean())
    
    return df_feat

In [7]:
read_feat('./hy_round1_train_20200102/4285.csv', True)

[4285,
 '围网',
 3,
 0,
 23,
 23,
 0.0,
 9.55,
 3.0738539042821165,
 -9.01,
 8.790000000000001,
 -0.005580808080808082,
 0.4444444444444444,
 0.07575757575757576,
 -351.0,
 354.0,
 -0.44696969696969696,
 0.5277777777777778,
 0.027777777777777776,
 0.0,
 4.4185102869974,
 0.7784534697667036,
 0.6313131313131313,
 0.030303030303030304,
 0.0,
 3.0732464128717805,
 1.070365546510406,
 0.47474747474747475,
 0.030303030303030304,
 0.0,
 4320.018562695771,
 992.0288337451307,
 0.0,
 4.542562887633425,
 1.5332574748152]

In [20]:
train_feat = Parallel(n_jobs=10)(delayed(read_feat)(path, True) 
                                 for path in glob.glob('./hy_round1_train_20200102/*')[:])
train_feat = pd.DataFrame(train_feat)

test_feat = Parallel(n_jobs=10)(delayed(read_feat)(path, False) 
                                 for path in glob.glob('./hy_round1_testA_20200102/*')[:])
test_feat = pd.DataFrame(test_feat)


In [21]:
test_feat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,7000,4,0,23,1,0.0,10.09,1.656139,-9.39,9.12,...,4.994467,0.496579,0.236559,0.497312,0.0,6082.455686,593.535312,0.0,5.089298,0.837550
1,7001,3,0,23,14,0.0,10.09,3.074476,-9.39,9.29,...,126.363692,1.499711,0.396061,0.157549,0.0,13958.643844,995.684493,0.0,126.896762,1.903650
2,7002,3,0,23,12,0.0,10.09,2.985488,-7.23,8.09,...,2.991093,0.867465,0.488998,0.031785,0.0,5690.304305,892.238179,0.0,4.761761,1.410623
3,7003,4,0,23,0,0.0,10.09,1.132212,-9.98,8.47,...,3.723221,0.283304,0.132075,0.709906,0.0,4694.982758,338.245548,0.0,5.461123,0.545136
4,7004,4,0,23,20,0.0,10.09,1.473442,-10.09,9.77,...,4.237397,0.441845,0.115869,0.700252,0.0,5556.659805,441.670678,0.0,7.158958,0.654401
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,8995,3,0,23,2,0.0,10.09,1.994945,-9.98,9.39,...,2.746933,0.170127,0.148760,0.696970,0.0,3357.754627,484.767891,0.0,5.586946,0.804945
1996,8996,3,0,23,1,0.0,10.09,3.227839,-7.77,7.88,...,15.554984,1.057452,0.272222,0.486111,0.0,32620.211891,1000.564466,0.0,17.623112,1.274932
1997,8997,3,0,23,14,0.0,0.38,0.064986,-0.27,0.22,...,0.000365,0.000008,0.011364,0.977273,0.0,99.147012,2.253341,0.0,0.165521,0.003716
1998,8998,3,0,23,12,0.0,10.09,1.999184,-7.82,7.72,...,6.498953,0.836696,0.175758,0.372727,0.0,8812.669274,689.943998,0.0,7.331672,1.024509


In [22]:
test_feat = test_feat.sort_values(by=0)

In [23]:
test_feat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,7000,4,0,23,1,0.0,10.09,1.656139,-9.39,9.12,...,4.994467,0.496579,0.236559,0.497312,0.0,6082.455686,593.535312,0.0,5.089298,0.837550
1,7001,3,0,23,14,0.0,10.09,3.074476,-9.39,9.29,...,126.363692,1.499711,0.396061,0.157549,0.0,13958.643844,995.684493,0.0,126.896762,1.903650
2,7002,3,0,23,12,0.0,10.09,2.985488,-7.23,8.09,...,2.991093,0.867465,0.488998,0.031785,0.0,5690.304305,892.238179,0.0,4.761761,1.410623
3,7003,4,0,23,0,0.0,10.09,1.132212,-9.98,8.47,...,3.723221,0.283304,0.132075,0.709906,0.0,4694.982758,338.245548,0.0,5.461123,0.545136
4,7004,4,0,23,20,0.0,10.09,1.473442,-10.09,9.77,...,4.237397,0.441845,0.115869,0.700252,0.0,5556.659805,441.670678,0.0,7.158958,0.654401
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,8995,3,0,23,2,0.0,10.09,1.994945,-9.98,9.39,...,2.746933,0.170127,0.148760,0.696970,0.0,3357.754627,484.767891,0.0,5.586946,0.804945
1996,8996,3,0,23,1,0.0,10.09,3.227839,-7.77,7.88,...,15.554984,1.057452,0.272222,0.486111,0.0,32620.211891,1000.564466,0.0,17.623112,1.274932
1997,8997,3,0,23,14,0.0,0.38,0.064986,-0.27,0.22,...,0.000365,0.000008,0.011364,0.977273,0.0,99.147012,2.253341,0.0,0.165521,0.003716
1998,8998,3,0,23,12,0.0,10.09,1.999184,-7.82,7.72,...,6.498953,0.836696,0.175758,0.372727,0.0,8812.669274,689.943998,0.0,7.331672,1.024509


In [24]:
train_feat[1] = train_feat[1].map({'围网':0,'刺网':1,'拖网':2})

In [25]:
train_feat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,0,2,4,0,23,15,0.00,9.39,0.265966,-6.80,...,2.181406,0.029993,0.019370,0.944310,0.0,4745.887438,87.089644,0.0,5.125652,0.136680
1,1,2,4,0,23,18,0.00,10.47,1.607922,-3.19,...,4.886008,0.462122,0.153646,0.632812,0.0,5828.114792,494.874699,0.0,5.232657,0.740035
2,10,2,4,0,23,22,0.00,10.09,1.313854,-6.80,...,2.536566,0.320474,0.191919,0.608586,0.0,5526.897410,414.501179,0.0,4.694111,0.648485
3,100,2,3,0,23,0,0.00,8.69,2.965864,-5.40,...,4.030897,0.413737,0.456098,0.170732,0.0,3266.637624,930.294733,0.0,4.271592,1.477071
4,1000,0,3,0,23,0,0.00,8.90,2.085570,-5.77,...,3.943120,0.673804,0.313830,0.252660,0.0,10831.412476,679.109667,0.0,4.317332,0.946626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,995,2,3,0,23,1,0.11,10.09,3.321114,-6.47,...,8.811690,1.274997,0.575419,0.008380,0.0,6648.767740,1225.476522,0.0,9.687755,1.733673
6996,996,2,3,0,23,8,0.00,10.09,0.976370,-4.31,...,2.969736,0.250302,0.216867,0.732530,0.0,4513.954050,267.052888,0.0,4.860630,0.425823
6997,997,1,3,0,23,22,0.00,8.31,0.267378,-8.09,...,0.377361,0.012981,0.039755,0.923547,0.0,325.030029,10.173476,0.0,0.545352,0.016506
6998,998,2,3,0,23,0,0.00,1.57,0.040175,-1.19,...,0.244863,0.008702,0.025063,0.949875,0.0,108.983179,5.462816,0.0,0.244906,0.008703


In [29]:
from lightgbm import early_stopping
callbacks = [ early_stopping(stopping_rounds=500)]

In [33]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

n_fold = 10
skf = StratifiedKFold(n_splits = n_fold, shuffle = True)
eval_fun = f1_score

def run_oof(clf, X_train, y_train, X_test, kf):
    print(clf)
    preds_train = np.zeros((len(X_train), 3), dtype = float)
    preds_test = np.zeros((len(X_test), 3), dtype = float)
    train_loss = []; test_loss = []

    i = 1
    for train_index, test_index in kf.split(X_train, y_train):
        x_tr = X_train[train_index]; x_te = X_train[test_index]
        y_tr = y_train[train_index]; y_te = y_train[test_index]
        clf.fit(x_tr, y_tr, eval_set = [(x_te, y_te)], callbacks=callbacks)
        
        train_loss.append(eval_fun(y_tr, np.argmax(clf.predict_proba(x_tr)[:], 1), average='macro'))
        test_loss.append(eval_fun(y_te, np.argmax(clf.predict_proba(x_te)[:], 1), average='macro'))

        preds_train[test_index] = clf.predict_proba(x_te)[:]
        preds_test += clf.predict_proba(X_test)[:]

        print('{0}: Train {1:0.7f} Val {2:0.7f}/{3:0.7f}'.format(i, train_loss[-1], test_loss[-1], np.mean(test_loss)))
        print('-' * 50)
        i += 1
    print('Train: ', train_loss)
    print('Val: ', test_loss)
    print('-' * 50)
    print('Train{0:0.5f}_Test{1:0.5f}\n\n'.format(np.mean(train_loss), np.mean(test_loss)))
    preds_test /= n_fold
    return preds_train, preds_test

params = {
    'learning_rate': 0.01,
    'min_child_samples': 5,
    'max_depth': 7,
    'lambda_l1': 2,
    'boosting': 'gbdt',
    'objective': 'multiclass',
    'n_estimators': 2000,
    'metric': 'multi_error',
    'num_class': 3,
    'feature_fraction': .75,
    'bagging_fraction': .85,
    'seed': 99,
    'num_threads': 20,
    'verbose': -1
}

train_pred, test_pred = run_oof(lgb.LGBMClassifier(**params), 
                                train_feat.iloc[:, 2:].values, 
                                train_feat.iloc[:, 1].values, 
                                test_feat.iloc[:, 1:].values, 
                                skf)


LGBMClassifier(bagging_fraction=0.85, boosting='gbdt', feature_fraction=0.75,
               lambda_l1=2, learning_rate=0.01, max_depth=7,
               metric='multi_error', min_child_samples=5, n_estimators=2000,
               num_class=3, num_threads=20, objective='multiclass', seed=99,
               verbose=-1)
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[1595]	valid_0's multi_error: 0.24
1: Train 0.9683795 Val 0.6720877/0.6720877
--------------------------------------------------
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[1982]	valid_0's multi_error: 0.252857
2: Train 0.9822805 Val 0.6555483/0.6638180
--------------------------------------------------
Training until validation scores don't improve for 500 rounds
Early stopping, best iteration is:
[1041]	valid_0's multi_error: 0.255714
3: Train 0.9211861 Val 0.6435518/0.6570626
-------------------

In [34]:
test_feat['label'] = np.argmax(test_pred, 1)
test_feat['label'] = test_feat['label'].map({0:'围网',1:'刺网',2:'拖网'})
test_feat[[0, 'label']].to_csv('baseline.csv',index=None, header=None)