# 第三届阿里云安全大赛

- 因为数据序列有点长，每个pid最长有5000个调用序列，所以一开始接触赛题的时候还是想用TF-IDF这种比较传统比较快的特征提取做法来试一下。
- 特征主要包括三部分：① 一些手动的统计特征，比如最常出现的api名字的调用比例和数量、不同返回值的数量比例等。②TF-IDF特征，将调用序列看成文本，计算词频逆词频。③Doc2Vec特征，用训练好的doc2vec向量进行聚类，统计不同类别的数量及特则。
- 由于只花了一个星期来做这道题，而这一个星期也是心不在焉....所以很多东西没有时间去试一下，比如FB的fasttext。深度模型也是简单做一个RNN....
- 最后排名是38/622....还得继续努力

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import os
from scipy import stats
from collections import Counter
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA

%matplotlib inline

In [3]:
# 读取数据
train_data = pd.read_csv('input/train.csv')

test = pd.read_csv('input/test.csv')

In [6]:
train_data.head()

Unnamed: 0,file_id,label,api,tid,return_value,index
0,0,0,GetSystemTimeAsFileTime,2644,0,0
1,0,0,NtAllocateVirtualMemory,2644,0,1
2,0,0,NtFreeVirtualMemory,2644,0,2
3,0,0,NtAllocateVirtualMemory,2644,0,3
4,0,0,NtAllocateVirtualMemory,2644,0,4


In [5]:
# 创建特征DF
df_train = train_data.groupby('file_id').label.agg('first').reset_index()

## 一些统计特征

In [7]:
def processing_stat_fea(df, data, col):
    # 一些统计特征
    df = pd.merge(df, data[['file_id', col]].groupby(['file_id'])[col].nunique().reset_index().rename(columns={col: 'nunique_'+col}), on='file_id', how='left')

    t = data[['file_id', col]]
    t['num'] = 1
    t = t.groupby(['file_id', col]).num.agg('sum').reset_index()
    df = pd.merge(df, t.groupby(['file_id']).num.agg('mean').reset_index().rename(columns={'num': 'mean_num_'+col}), on='file_id', how='left')
    df = pd.merge(df, t.groupby(['file_id']).num.agg('max').reset_index().rename(columns={'num': 'max_num_'+col}),on='file_id', how='left')
    df = pd.merge(df, t.groupby(['file_id']).num.agg('min').reset_index().rename(columns={'num': 'min_num_'+col}),on='file_id', how='left')
    df = pd.merge(df, t.groupby(['file_id']).num.agg('var').reset_index().rename(columns={'num': 'var_num_'+col}),on='file_id', how='left')
    df = pd.merge(df, t.groupby(['file_id']).num.agg('skew').reset_index().rename(columns={'num': 'skew_num_'+col}),on='file_id', how='left')
    df = pd.merge(df, t.groupby(['file_id']).num.apply(stats.kurtosis).reset_index().rename(columns={'num': 'kurt_num_'+col}),on='file_id', how='left')
    df['max_min_'+col] = df['max_num_'+col] - df['min_num_'+col]

    df = pd.merge(df, t.groupby(['file_id']).apply(lambda x:x['num'].diff().mean()).reset_index().rename(columns={0: 'diff_mean_num_'+col}), on='file_id', how='left')
    df = pd.merge(df, t.groupby(['file_id']).apply(lambda x:x['num'].diff().max()).reset_index().rename(columns={0: 'diff_max_num_'+col}), on='file_id', how='left')
    df = pd.merge(df, t.groupby(['file_id']).apply(lambda x:x['num'].diff().min()).reset_index().rename(columns={0: 'diff_min_num_'+col}), on='file_id', how='left')
    df = pd.merge(df, t.groupby(['file_id']).apply(lambda x:x['num'].diff().var()).reset_index().rename(columns={0: 'diff_var_num_'+col}), on='file_id', how='left')
    df = pd.merge(df, t.groupby(['file_id']).apply(lambda x:x['num'].diff().skew()).reset_index().rename(columns={0: 'diff_skew_num_'+col}), on='file_id', how='left')
    df = pd.merge(df, t.groupby(['file_id']).apply(lambda x:x['num'].diff().kurt()).reset_index().rename(columns={0: 'diff_kurt_num_'+col}), on='file_id', how='left')
    df = pd.merge(df, t.groupby(['file_id']).apply(lambda x:x['num'].diff().mad()).reset_index().rename(columns={0: 'diff_mad_num_'+col}), on='file_id', how='left')
    df = pd.merge(df, t.groupby(['file_id']).apply(lambda x:x['num'].diff().max()-x['num'].sort_values().diff().min()).reset_index().rename(columns={0:'diff_seq_max_gap_min_num_'+col}),on=['file_id'],how='left')
    df = pd.merge(df, t.groupby(['file_id']).apply(lambda x:x['num'].diff().diff().min()).reset_index().rename(columns={0:'diff2_min_gap_num_'+col}),on=['file_id'],how='left')
    df = pd.merge(df, t.groupby(['file_id']).apply(lambda x:x['num'].diff().diff().max()).reset_index().rename(columns={0:'diff2_max_gap_num_'+col}),on=['file_id'],how='left')
    
    return df

In [8]:
df_train = pd.merge(df_train, train_data[['file_id']].groupby(['file_id']).size().reset_index().rename(columns={0: 'num_invoke'}), on='file_id', how='left')

In [9]:
df_train = processing_stat_fea(df_train, train_data, 'api')
df_train = processing_stat_fea(df_train, train_data, 'tid')
df_train = processing_stat_fea(df_train, train_data, 'return_value')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
df_train.head()

Unnamed: 0,file_id,label,num_invoke,nunique_api,mean_num_api,max_num_api,min_num_api,var_num_api,skew_num_api,kurt_num_api,...,diff_mean_num_return_value,diff_max_num_return_value,diff_min_num_return_value,diff_var_num_return_value,diff_skew_num_return_value,diff_kurt_num_return_value,diff_mad_num_return_value,diff_seq_max_gap_min_num_return_value,diff2_min_gap_num_return_value,diff2_max_gap_num_return_value
0,0,0,424,19,22.315789,99,1,708.672515,1.346982,1.368728,...,14.6,338.0,-338.0,58225.8,-0.290972,1.736859,153.12,338.0,-676.0,412.0
1,1,5,2,2,1.0,1,1,0.0,,-3.0,...,,,,,,,,,,
2,2,5,34,15,2.266667,14,1,11.352381,3.444457,8.277387,...,-5.0,1.0,-25.0,125.5,-2.213827,4.920589,8.0,1.0,0.0,24.0
3,3,5,2800,65,43.076923,1128,1,37224.415865,5.516723,27.213945,...,-14.531915,15.0,-1162.0,14604.961336,-9.421704,90.143352,28.08239,15.0,-1002.0,1122.0
4,4,5,6832,78,87.589744,2970,1,157324.06327,6.093128,37.151675,...,-13.782895,2000.0,-4086.0,136949.60156,-7.860359,104.019402,54.901575,2000.0,-6086.0,4064.0


In [11]:
def processing_return_value(df, data):
    # 先针对0跟1
    df = pd.merge(df, train_data[
        train_data.return_value == 0][['file_id', 'return_value']
                                     ].groupby('file_id').size().reset_index().rename(columns={0: 'num_0_ret_val'}), 
                  on='file_id', how='left')
    df = pd.merge(df, train_data[
        train_data.return_value == 1][['file_id', 'return_value']
                                     ].groupby('file_id').size().reset_index().rename(columns={0: 'num_1_ret_val'}), 
                  on='file_id', how='left')    
    # 正负值
    df = pd.merge(df, train_data[
        train_data.return_value > 1][['file_id', 'return_value']
                                 ].groupby('file_id').size().reset_index().rename(columns={0: 'num_pos_ret_val'}), 
              on='file_id', how='left')   
    df = pd.merge(df, train_data[
        train_data.return_value < 0][['file_id', 'return_value']
                                 ].groupby('file_id').size().reset_index().rename(columns={0: 'num_neg_ret_val'}), 
              on='file_id', how='left')   
    
    df['ret_val_0_ratio'] = df['num_0_ret_val'] / df['num_invoke']
    df['ret_val_1_ratio'] = df['num_1_ret_val'] / df['num_invoke']
    df['ret_val_pos_ratio'] = df['num_pos_ret_val'] / df['num_invoke']
    df['ret_val_neg_ratio'] = df['num_neg_ret_val'] / df['num_invoke']
    return df

In [12]:
df_train = processing_return_value(df_train, train_data)

In [13]:
df_train.head()

Unnamed: 0,file_id,label,num_invoke,nunique_api,mean_num_api,max_num_api,min_num_api,var_num_api,skew_num_api,kurt_num_api,...,diff2_min_gap_num_return_value,diff2_max_gap_num_return_value,num_0_ret_val,num_1_ret_val,num_pos_ret_val,num_neg_ret_val,ret_val_0_ratio,ret_val_1_ratio,ret_val_pos_ratio,ret_val_neg_ratio
0,0,0,424,19,22.315789,99,1,708.672515,1.346982,1.368728,...,-676.0,412.0,340.0,,78.0,6.0,0.801887,,0.183962,0.014151
1,1,5,2,2,1.0,1,1,0.0,,-3.0,...,,,2.0,,,,1.0,,,
2,2,5,34,15,2.266667,14,1,11.352381,3.444457,8.277387,...,0.0,24.0,27.0,,7.0,,0.794118,,0.205882,
3,3,5,2800,65,43.076923,1128,1,37224.415865,5.516723,27.213945,...,-1002.0,1122.0,1368.0,1208.0,224.0,,0.488571,0.431429,0.08,
4,4,5,6832,78,87.589744,2970,1,157324.06327,6.093128,37.151675,...,-6086.0,4064.0,2120.0,4120.0,592.0,,0.310304,0.603044,0.086651,


In [15]:
def get_lx(df, data, col):
        # 连续登陆 最大 最小 平均 方差
    def checknum(v):
        #计算列表中连续=n的数目，返回最大连续数
        val = v.values
        res=[]
        count=1
        for i in range(len(val) - 1):
            if val[i+1] == val[i]:
                count += 1
            else:
                res.append(count)
                count = 1
        res.append(count)
        ret = pd.Series()
        if len(res) == 0:
            ret['mean_lx_'+col] = np.NaN
            ret['var_lx_'+col] = np.NaN
            ret['max_lx_'+col] = np.NaN
            ret['min_lx_'+col] = np.NaN
            ret['skew_lx_'+col] = np.NaN
            ret['kurt_lx_'+col] = np.NaN
            ret['mean_diff_lx_'+col] = np.NaN
            ret['var_diff_lx_'+col] = np.NaN
            ret['max_diff_lx_'+col] = np.NaN
            ret['min_diff_lx_'+col] = np.NaN
            ret['skew_diff_lx_'+col] = np.NaN
            ret['kurt_diff_lx_'+col] = np.NaN
            return ret
        ret['mean_lx_'+col] = np.mean(res)
        ret['var_lx_'+col] = np.var(res)
        ret['max_lx_'+col] = np.max(res)
        ret['min_lx_'+col] = np.min(res)
        ret['skew_lx_'+col] =  stats.skew(res)
        ret['kurt_lx_'+col] = stats.kurtosis(res)    
        diff_res = np.diff(res)            
        if len(diff_res) == 0:
            ret['mean_diff_lx_'+col] = np.NaN
            ret['var_diff_lx_'+col] = np.NaN
            ret['max_diff_lx_'+col] = np.NaN
            ret['min_diff_lx_'+col] = np.NaN
            ret['skew_diff_lx_'+col] = np.NaN
            ret['kurt_diff_lx_'+col] = np.NaN
            return ret
        ret['mean_diff_lx_'+col] = np.mean(diff_res)
        ret['var_diff_lx_'+col] = np.var(diff_res)
        ret['max_diff_lx_'+col] = np.max(diff_res)
        ret['min_diff_lx_'+col] = np.min(diff_res)
        ret['skew_diff_lx_'+col] = stats.skew(diff_res)
        ret['kurt_diff_lx_'+col] = stats.kurtosis(diff_res)
        return ret
    
    t = train_data[['file_id', col]]
    df = pd.merge(df, t.groupby('file_id')[col].apply(checknum).unstack().reset_index(), on=['file_id'], how='left')
    return df
    

In [16]:
df_train = get_lx(df_train, train_data, 'api')

df_train = get_lx(df_train, train_data, 'return_value')

df_train['nb_nan'] = df_train.isnull().sum(1)

names = ['0', '1', 'pos', 'neg']
for i in range(4):
    for j in range(i+1, 4):
        df_train['num_{}_{}_ret_val_ratio'.format(names[i], names[j])] = df_train['num_{}_ret_val'.format(names[i])] / df_train['num_{}_ret_val'.format(names[j])]
        df_train['num_{}_{}_ret_val_gap'.format(names[i], names[j])] = df_train['num_{}_ret_val'.format(names[i])] - df_train['num_{}_ret_val'.format(names[j])]

In [17]:
df_train.to_csv('./feature/df_train.csv', index=False)

In [18]:
train_data.api.nunique()

308

In [20]:
most_common_apis = Counter(train_data.api).most_common(10)

In [23]:
np.save('./input/most_common_apis.npy', most_common_apis)

In [21]:
def get_common_apis_ratio_num(df, data):
    t = data[['file_id', 'api']]
    t['num'] = 1
    t = t.groupby(['file_id', 'api']).agg('sum').reset_index()

    def cal_api_ratio(x):
        ret = pd.Series()
        for api in most_common_apis:
            api_name = api[0]
            res = x[x['api'] == api_name]['num'].values
            if len(res) > 0:
                ret['{}_num'.format(api_name)] = res[0]
            else:
                ret['{}_num'.format(api_name)] = 0
            ret['{}_ratio'.format(api_name)] = ret['{}_num'.format(api_name)] / len(x)
        return ret

    t = t.groupby('file_id').apply(cal_api_ratio).reset_index()
    df = pd.merge(df, t, on='file_id', how='left')
    return df

In [24]:
df_train = get_common_apis_ratio_num(df_train, train_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [25]:
df_train.to_csv('./feature/df_train.csv', index=False)

## TFIDF-特征

In [None]:
def get_tf_idf_v1(df_tr, df_te, tr_data, te_data):
    # 只考虑调用顺序 不考虑返回值
    tr = tr_data[['file_id', 'api']]
    te = te_data[['file_id', 'api']]

    def get_api_seq(x):
        apis = x['api'].values
        api_seq = ' '.join(apis)
        api_seq += '.'
        return api_seq

    t1 = tr.groupby('file_id').apply(get_api_seq).reset_index().rename(columns={0:'api_text'})
    t2 = te.groupby('file_id').apply(get_api_seq).reset_index().rename(columns={0:'api_text'})

    feature_extraction = TfidfVectorizer().fit(np.concatenate([t1["api_text"].values, t2["api_text"].values]))
    tr_tfidf_fea = feature_extraction.transform(t1["api_text"].values)
    te_tfidf_fea = feature_extraction.transform(t2["api_text"].values)

    df_tr = pd.concat([df_tr, pd.DataFrame(tr_tfidf_fea.toarray())], axis=1)
    df_te = pd.concat([df_te, pd.DataFrame(te_tfidf_fea.toarray())], axis=1)
    
    rename = {}

    for i in range(0, te_tfidf_fea.shape[1]):
        rename[i] = 'v1_{}'.format(i)

    df_tr.rename(columns=rename, inplace=True)
    df_te.rename(columns=rename, inplace=True)

    return df_tr, df_te

df_train, df_test = get_tf_idf_v1(df_train, df_test, train_data, test_data)

def get_tf_idf_v2(df_tr, df_te, tr_data, te_data):
    tr = tr_data[['file_id', 'api', 'return_value']]
    te = te_data[['file_id', 'api', 'return_value']]

    tr['ret_val'] = ''
    tr.loc[tr['return_value'] > 0, 'ret_val'] = 'pos'
    tr.loc[tr['return_value'] < 0, 'ret_val'] = 'neg'
    tr.loc[tr['return_value'] == 0, 'ret_val'] = 'zero'
    
    te['ret_val'] = ''
    te.loc[te['return_value'] > 0, 'ret_val'] = 'pos'
    te.loc[te['return_value'] < 0, 'ret_val'] = 'neg'
    te.loc[te['return_value'] == 0, 'ret_val'] = 'zero'
    
    tr['api_ret_val'] = tr['api'] + tr['ret_val']
    te['api_ret_val'] = te['api'] + te['ret_val']

    t1 = tr[['file_id', 'api_ret_val']]
    t2 = te[['file_id', 'api_ret_val']]

    def get_api_ret_val_seq(x):
        apis = x['api_ret_val'].values
        api_seq = ' '.join(apis)
        api_seq += '.'
        return api_seq

    t1 = t1.groupby('file_id').apply(get_api_ret_val_seq).reset_index().rename(columns={0:'api_ret_val_text'})
    t2 = t2.groupby('file_id').apply(get_api_ret_val_seq).reset_index().rename(columns={0:'api_ret_val_text'})

    feature_extraction = TfidfVectorizer().fit(np.concatenate([t1["api_ret_val_text"].values, t2["api_ret_val_text"].values]))
    
    tr_tfidf_fea = feature_extraction.transform(t1["api_ret_val_text"].values)
    te_tfidf_fea = feature_extraction.transform(t2["api_ret_val_text"].values)

    df_tr = pd.concat([df_tr, pd.DataFrame(tr_tfidf_fea.toarray())], axis=1)
    df_te = pd.concat([df_te, pd.DataFrame(te_tfidf_fea.toarray())], axis=1)

    rename = {}
    for i in range(0, tr_tfidf_fea.shape[1]):
        rename[i] = 'v2_{}'.format(i)

    df_tr.rename(columns=rename, inplace=True)
    df_te.rename(columns=rename, inplace=True)
    
    return df_tr, df_te

df_train, df_test = get_tf_idf_v2(df_train, df_test, train_data, test_data)

In [9]:
df_train.head()

Unnamed: 0,file_id,label,num_invoke,nunique_api,mean_num_api,max_num_api,min_num_api,var_num_api,skew_num_api,kurt_num_api,...,v2_613,v2_614,v2_615,v2_616,v2_617,v2_618,v2_619,v2_620,v2_621,v2_622
0,0,0,424,19,22.315789,99,1,708.672515,1.346982,1.368728,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,5,2,2,1.0,1,1,0.0,,-3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,5,34,15,2.266667,14,1,11.352381,3.444457,8.277387,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,5,2800,65,43.076923,1128,1,37224.415865,5.516723,27.213945,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001445
4,4,5,6832,78,87.589744,2970,1,157324.06327,6.093128,37.151675,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001176


In [5]:
from sklearn.preprocessing import LabelEncoder

apis = list(set(list(train_data.api.unique())+list(test_data.api.unique())))

enc = LabelEncoder().fit(apis)

train_data['enc'] = enc.transform(train_data.api)

test_data['enc'] = enc.transform(test_data.api)

## 下面这部分是DOC2VEC向量进行聚类

In [17]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from gensim.models.doc2vec import TaggedDocument


In [24]:
model = Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample=0, 
            epochs=20, workers=4)

In [28]:
data = t.enc.values

In [29]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(data)]
# model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

Exception ignored in: 'zmq.backend.cython.message.Frame.__dealloc__'
Traceback (most recent call last):
  File "zmq/backend/cython/checkrc.pxd", line 12, in zmq.backend.cython.checkrc._check_rc (zmq/backend/cython/message.c:4294)
KeyboardInterrupt


In [68]:
tr = train_data[['file_id', 'api', 'tid']]

def get_api_seq(x):
    apis = x['api'].values
    api_seq = ' '.join(apis)
    api_seq += '.'
    return api_seq

tr = tr.groupby(['file_id', 'tid']).apply(get_api_seq).reset_index().rename(columns={0:'api_text'})


In [69]:
te = test_data[['file_id', 'api', 'tid']]
te = te.groupby(['file_id', 'tid']).apply(get_api_seq).reset_index().rename(columns={0:'api_text'})

api_text = np.concatenate([tr.api_text.values, te.api_text.values])

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(api_text)]

In [38]:
import multiprocessing

cores = multiprocessing.cpu_count()

In [72]:
model = Doc2Vec(documents, dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample=0, 
            epochs=5, workers=cores)

In [73]:
doc_vec = model.docvecs.vectors_docs

In [74]:
doc_tr = pd.DataFrame(doc_vec[:tr.shape[0]])

In [75]:
doc_te = pd.DataFrame(doc_vec[tr.shape[0]:])

In [76]:
doc_tr.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.286825,0.048865,0.061761,0.240027,-0.190398,0.12127,-0.366017,-0.335975,-0.037499,0.15538,...,-0.21975,0.148943,0.166747,0.239572,0.204719,-0.109604,0.102926,-0.176948,0.042451,0.066088
1,0.06843,0.02229,0.037476,0.080728,-0.071591,0.042092,-0.098902,0.096299,0.109431,0.047816,...,0.069886,-0.206756,-0.064712,0.174847,0.037625,0.106167,-0.44929,-0.138474,0.140562,-0.480538
2,0.060655,0.08661,0.24033,0.025454,-0.204714,-0.0099,-0.128736,-0.023981,-0.046222,0.119856,...,0.039363,-0.170435,-0.151556,0.054949,-0.068943,-0.047964,-0.165054,-0.123121,-0.012198,-0.181039
3,0.142252,-0.216748,-0.273913,-0.015159,0.001272,-0.16356,-0.111762,0.292721,-0.10813,0.326766,...,0.271479,-0.114769,0.382101,0.058515,-0.121215,-0.005351,-0.618075,0.234555,0.010397,-0.334225
4,0.302964,0.050127,-0.056714,0.102139,-0.173627,0.046486,-0.039381,-0.070086,-0.07659,0.035541,...,-0.061222,0.032373,-0.01007,-0.014053,0.064482,0.093705,-0.312032,-0.127941,0.036302,-0.025358


In [77]:
rename_dict = {}
for i in range(100):
    rename_dict[i] = 'doc_{}'.format(i)

In [78]:
doc_tr.rename(columns=rename_dict, inplace=True)
doc_te.rename(columns=rename_dict, inplace=True)

In [84]:
from sklearn.cluster import KMeans

In [85]:
kmeans = KMeans(n_clusters=6, n_jobs=4)

In [86]:
classes = kmeans.fit_predict(doc_vec)

### Kmens聚类 类别为6 稍微提升

In [87]:
tr['classes'] = classes[:tr.shape[0]]

te['classes'] = classes[tr.shape[0]: ]

In [107]:
def get_kmeans_fea(df, data):
    # 获取聚类后的统计特征
    t = data[['file_id', 'classes']]

    t = t.groupby(['file_id', 'classes']).size().reset_index()

    t.rename(columns={0: 'num'}, inplace=True)

    # 出现次数最多
    t1 = t.groupby('file_id').apply(lambda x: x['classes'].values[np.argsort(x['num'].values)[-1]]).reset_index().rename(columns={0: 'most_common_classes'})
    df = pd.merge(df, t1, on=['file_id'], how='left')
    
    for i in range(6):
        t = data[data.classes == i][['file_id', 'classes']]
        
        t = t.groupby(['file_id']).size().reset_index().rename(columns={0:'classes_{}_num'.format(i)})
        
        df = pd.merge(df, t, on='file_id', how='left')
        
    return df

In [108]:
df_train = get_kmeans_fea(df_train, tr)

In [112]:
def get_kmeans_ratio(df):
    #获取比例
    for i in range(6):
        df['classes_{}_ratio'.format(i)] = df['classes_{}_num'.format(i)] / df['nunique_tid']
    return df

In [113]:
df_train = get_kmeans_ratio(df_train)

In [120]:
df_test = get_kmeans_fea(df_test, te)
df_test = get_kmeans_ratio(df_test)

## 持久化

In [115]:
df_train.to_csv('feature/df_train.csv', index=False)

In [124]:
df_test.to_csv('feature/df_test.csv', index=False)

## LGB 预测

In [None]:
X_train, X_val, y_train, y_val = train_test_split(df_train[cols], label, test_size=0.2, random_state=42)

In [None]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'metric': {'multi_logloss'},
    'num_class': 6,
  'bagging_fraction': 0.8,
  'feature_fraction': 0.6,
  'nthread': 4,
  'lambda_l1': 1,
  'lambda_l2': 1
}

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=40000,
                valid_sets=lgb_eval,
                early_stopping_rounds=50,
                verbose_eval=10,
                )

# Training until validation scores don't improve for 50 rounds.
# [10]	valid_0's multi_logloss: 0.518999
# [20]	valid_0's multi_logloss: 0.203948
# [30]	valid_0's multi_logloss: 0.0937345
# [40]	valid_0's multi_logloss: 0.0523663
# [50]	valid_0's multi_logloss: 0.0360914
# [60]	valid_0's multi_logloss: 0.0291886
# [70]	valid_0's multi_logloss: 0.0255474
# [80]	valid_0's multi_logloss: 0.0239712
# [90]	valid_0's multi_logloss: 0.023102
# [100]	valid_0's multi_logloss: 0.0226112
# [110]	valid_0's multi_logloss: 0.0224444
# [120]	valid_0's multi_logloss: 0.0223155
# [130]	valid_0's multi_logloss: 0.0223462
# [140]	valid_0's multi_logloss: 0.0225303
# [150]	valid_0's multi_logloss: 0.0226152
# [160]	valid_0's multi_logloss: 0.0226824
# [170]	valid_0's multi_logloss: 0.0227195
# Early stopping, best iteration is:
# [122]	valid_0's multi_logloss: 0.0222961

In [None]:
y_pred = gbm.predict(X_val[cols])

np.save('y_pred_lgb_0222961.npy', y_pred)

y_sub = gbm.predict(df_test[cols])

In [None]:
sub = pd.read_csv('input/3rd_security_submit_sample.csv')

sub.loc[:, ['prob0','prob1','prob2','prob3','prob4','prob5']] = y_sub

sub = sub.round(7)

sub['sum'] = sub[['prob0','prob1','prob2','prob3','prob4','prob5']].sum(1)

sub['prob0'] = sub['prob0'] / sub['sum']
sub['prob1'] = sub['prob1'] / sub['sum']
sub['prob2'] = sub['prob2'] / sub['sum']
sub['prob3'] = sub['prob3'] / sub['sum']
sub['prob4'] = sub['prob4'] / sub['sum']
sub['prob5'] = sub['prob5'] / sub['sum']

sub.head()

sub[(sub[['prob0','prob1','prob2','prob3','prob4','prob5']].sum(1) - 1) >= 1e-6]

sub[['file_id', 'prob0','prob1','prob2','prob3','prob4','prob5']].to_csv('lgb_0222961.csv', index=False)