In [36]:
import time

import Levenshtein
from gensim import matutils
import pandas as pd
import numpy as np
import json
import re
import warnings
import os
import jieba
from sklearn.preprocessing import LabelEncoder
from operator import itemgetter
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold as skf
from sklearn.model_selection import train_test_split
from gensim.models import word2vec
from gensim.models.keyedvectors import KeyedVectors
from sklearn.metrics import f1_score
import gc

def get_data():
    if mode == "test":
        train = pd.read_table('/home/ccit/tkhoon/data/sdata_train.csv', header=None,
                              names=['prefix', 'query_prediction', 'title', 'tag', 'label', '1'], quoting=3)
        vali = pd.read_table('/home/ccit/tkhoon/data/sdata_vali.csv', header=None, names=['prefix', 'query_prediction', 'title', 'tag', 'label'],
                             quoting=3)
        test = pd.read_table('/home/ccit/tkhoon/data/sdata_test.csv', header=None, names=['prefix', 'query_prediction', 'title', 'tag', '1'],
                             quoting=3)
    else:
        train = pd.read_table('/home/ccit/tkhoon/data/data_train.csv', header=None,
                              names=['prefix', 'query_prediction', 'title', 'tag', 'label', '1'], quoting=3)
        vali = pd.read_table('/home/ccit/tkhoon/data/data_vali.csv', header=None, names=['prefix', 'query_prediction', 'title', 'tag', 'label'],
                             quoting=3)
        test = pd.read_table('/home/ccit/tkhoon/data/data_test.csv', header=None, names=['prefix', 'query_prediction', 'title', 'tag', '1'],
                             quoting=3)


    train_temp = train[train['1'].notnull()]
    test_temp = test[test['1'].notnull()]

    train_index = list(train_temp.index)
    test_index = list(test_temp.index)

    train.loc[train_index, 'tag'] = train.loc[train_index, 'label']
    train.loc[train_index, 'label'] = train.loc[train_index, '1']

    test.loc[test_index, 'tag'] = test.loc[test_index, '1']

    train.drop('1', axis=1, inplace=True)
    test.drop('1', axis=1, inplace=True)


    test['label'] = -1
    train['flag'] = 1
    vali['flag'] = 2
    test['flag'] = 3
    data = pd.concat([train, vali, test])
    data = data.reset_index()
    data.drop('index', axis=1, inplace=True)

    columns = ['prefix', 'query_prediction', 'title', 'tag']
    for column in columns:
        data[column] = data[column].astype(str)
    data.drop( data[ data["label"].isnull() ].index , inplace=True )
    data['label'] = data['label'].astype(int)
    return data

def char_process(char):
    # 提出无效字符
    try:
        char =  re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+??！，。？?、~@#￥%……&*（）:]+", "", char)
        return char
    except:
        return char

def is_prefix_contains_upper_english(data):
    judge = data['prefix'].apply(lambda x: len(re.findall("[A-Z]", x)) > 0 )
    # data[judge]
    data['is_prefix_contains_upper_english'] = 0
    data.loc[judge, 'is_prefix_contains_upper_english'] = 1
    return data

def char_lowwer_process(item):
        if (item['is_prefix_contains_upper_english'] == 1) & (item['query_prediction'] == 'nan'):
            return str.lower(item['prefix'])
        else :
            return item['prefix']

def title_char_lowwer_process(item):
        if (item['is_prefix_contains_upper_english'] == 1) & (item['query_prediction'] == 'nan'):
            return str.lower(item['title'])
        else :
            return item['title']

def char_cleaner(char):
    if not isinstance(char, str):
        char = "null"
    pattern = re.compile("[^0-9a-zA-Z\u4E00-\u9FA5 ]")
    char = re.sub(pattern, "", char)
    char = char.lower()
    return char

def is_prefix_contains_upper_english(data):
    judge = data['prefix'].apply(lambda x: len(re.findall("[A-Z]", x)) > 0 )
    # data[judge]
    data['is_prefix_contains_upper_english'] = 0
    data.loc[judge, 'is_prefix_contains_upper_english'] = 1
    return data

def query_process(item):
    try:
        item['query_prediction'] = json.loads(item['query_prediction'])
        return item['query_prediction']
    except:
        return '{}'

def combine_tag(item):
    if item['tag'] == '网页':
        return '网站'
    else:
        return item['tag']
complete_prefix_map={}
def get_complete_prefix(item):
        prefix = item['prefix']
        complete_prefix = complete_prefix_map.get(prefix  )
        if complete_prefix is not None:
            return  complete_prefix
        query_prediction = item['query_prediction']

        if query_prediction == '{}':
            return prefix

        predict_word_dict = dict()
        prefix = str(prefix)

        for query_item, query_ratio in query_prediction.items():
            query_item_cut = jieba.lcut(query_item)
            item_word = ""
            for item in query_item_cut:
                if prefix not in item_word:
                    item_word += item
                else:
                    if item_word not in predict_word_dict.keys():
                        predict_word_dict[item_word] = 0.0
                    predict_word_dict[item_word] += float(query_ratio)

        if not predict_word_dict:
            return prefix

        predict_word_dict = sorted(predict_word_dict.items(), key=itemgetter(1), reverse=True)
        complete_prefix = predict_word_dict[0][0]
        complete_prefix_map[ prefix ] = complete_prefix
        return complete_prefix

def run_process(data):

    data['prefix'] = data['prefix'].apply(char_process) #提出无效字符j
    data['title'] = data['title'].apply(char_process) #title也去掉，之后涉及到计算相似度问题
    data  = is_prefix_contains_upper_english(data) #判断prefix是否含有大写
    data['prefix'] = data.apply(char_lowwer_process,axis=1) #将含有大写的prefix转为小写    有转化成小写的 ,query一定为空
    data['title'] = data.apply(title_char_lowwer_process,axis=1)#把title也转换成小写
    data['tag'] = data.apply(combine_tag,axis=1) #合并tag
    return data

def get_prefix_query_dic(data):
    prefix_dic = {}
    for index,row in data.iterrows():
        if row['query_prediction'] != 'nan' and row['prefix'] not in prefix_dic:
            prefix_dic[row['prefix']] = row['query_prediction']
    return prefix_dic

def null_query_prediction_process(item):
    if item['query_prediction'] == 'nan' and item['prefix'] in prefix_dic:
        return prefix_dic[item['prefix']]
    else:
        return item['query_prediction']

def move_useless_char(s):
    # 提出无效字符
    return re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+??！，。？?、~@#￥%……&*（）:]+", "", s)

def query_prediction_text(query_prediction):
    if query_prediction == '{}':
        return ['PAD'];
    else:
        query_word = []
        for i in query_prediction.keys():
            query_word.append(i)
    #移除query_word无效字符
    for i in range(len(query_word)):
        query_word[i] = move_useless_char(query_word[i])
    return query_word

def query_prediction_score(query_prediction):
    if query_prediction == '{}':
        return np.nan
    else:
        query_score = []
        for i in query_prediction.values():
            query_score.append(float(i))
    return query_score

def get_query_list_feature(data):
    data['query_word'] = data['query_prediction'].apply(lambda x : query_prediction_text(x))
    data['query_score'] = data['query_prediction'].apply(lambda x: query_prediction_score(x))
    return data

def get_word_length(item):
    word_cut = jieba.lcut(item)
    return len(word_cut)

def title_is_in_query(item):
    if item['query_prediction'] == '{}' or item['title'] not in item['query_word']:
        return 0
    else:
        return 1

def prefix_is_in_title(item):
    if item['prefix'] == 'nan' or item['title'] == 'nan' or item['prefix'] not in item['title']:
        return 0
    else:
        return 1

def prefix_is_network(item):
    if 'www' in item or 'com' in item or 'http' in item:
        return 1

    else:
        return 0

def title_is_network(item):
    if 'www' in item or 'com' in item or 'http' in item:
        return 1
    else:
        return 0

def prefix_is_question(item):
    if '怎么' in item or '什么' in item or '哪' in item or '多少' in item or '谁' in item or '如何' in item:
        return 1
    else:
        return 0

def title_is_question(item):
    if '怎么' in item or '什么' in item or '哪' in item or '多少' in item or '谁' in item or '如何' in item:
        return 1
    else:
        return 0

def prefix_title_leve_dist(item):
    try:
        return Levenshtein.distance(item['prefix'], item['title'])
    except:
        return 0

def prefix_title_leve_rate(item):
    try:
        return Levenshtein.ratio(item['prefix'], item['title'])
    except:
        return 0
def get_word_w2v_model():
    w2v_model_name = "baike_26g_news_13g_novel_229g.bin"
    w2v_model_path = os.path.join("resources", w2v_model_name)
    w2v_model = KeyedVectors.load_word2vec_format(w2v_model_path, binary=True, unicode_errors="ignore")
    return w2v_model

size = 100

def char_cleaner(char):
    if not isinstance(char, str):
        char = "null"
    pattern = re.compile("[^0-9a-zA-Z\u4E00-\u9FA5 ]")
    char = re.sub(pattern, "", char)
    char = char.lower()
    return char

def _get_jieba_array(words):
    words = char_cleaner(words)
    seg_cut = jieba.lcut(words)

    w2v_array = list()
    for word in seg_cut:
        try:
            similar_list = word_w2v_model[word]
            w2v_array.append(similar_list)
        except KeyError:
            continue

    if not w2v_array:
        w2v_array = [None] * size
    else:
        w2v_array = matutils.unitvec(np.array(w2v_array).mean(axis=0))
        return w2v_array

def get_query_w2v_similar(item):
    item_dict = {}
    query_prediction = item['query_prediction']
    title = item['title']  # 等下再求下prefix得呢
    prefix = item['prefix']
    if query_prediction == '{}':
        item_dict['prefix_max_similar'] = None
        item_dict['prefix_mean_similar'] = None
        item_dict['prefix_weight_similar'] = None
        item_dict['title_max_similar'] = None
        item_dict['title_mean_similar'] = None
        item_dict['title_weight_similar'] = None
        return item_dict

    query_prediction = sorted(query_prediction.items(), key=itemgetter(1), reverse=True)
    query_prediction = query_prediction[:3]
    similar_list = []
    weight_similar_list = []
    title_array = _get_jieba_array(item['title'])
    prefix_array = _get_jieba_array(item['prefix'])

    for key, value in query_prediction:

        query_cut_array = _get_jieba_array(key)
        try:
            w2v_similar = np.dot(query_cut_array, title_array)
        except (KeyError, ZeroDivisionError, TypeError):
            w2v_similar = np.nan

        similar_list.append(w2v_similar)
        weight_w2v_similar = w2v_similar * float(value)
        weight_similar_list.append(weight_w2v_similar)

        max_similar = np.nanmax(similar_list)
        mean_similar = np.nanmean(similar_list)
        weight_similar = np.nansum(weight_similar_list)

        item_dict["title_max_similar"] = max_similar
        item_dict["title_mean_similar"] = mean_similar
        item_dict["title_weight_similar"] = weight_similar

    for key, value in query_prediction:

        query_cut_array = _get_jieba_array(key)
        try:
            w2v_similar = np.dot(query_cut_array, prefix_array)
        except (KeyError, ZeroDivisionError, TypeError):
            w2v_similar = np.nan

        similar_list.append(w2v_similar)
        weight_w2v_similar = w2v_similar * float(value)
        weight_similar_list.append(weight_w2v_similar)

        max_similar = np.nanmax(similar_list)
        mean_similar = np.nanmean(similar_list)
        weight_similar = np.nansum(weight_similar_list)

        item_dict["prefiix_max_similar"] = max_similar
        item_dict["prefix_mean_similar"] = mean_similar
        item_dict["prefix_weight_similar"] = weight_similar

        return item_dict

def get_prefix_w2v_similar(item):
    title = item['title']
    prefix = item['prefix']
    title_array = _get_jieba_array(item['title'])
    prefix_array = _get_jieba_array(item['prefix'])
    try:
        w2v_similar = np.dot(prefix_array, title_array)
    except (KeyError, ZeroDivisionError, TypeError):
        w2v_similar = np.nan
    return w2v_similar

# prefix也要加上
def get_query_sim_feature(data):
    start = time.time()

    data["item_dict"] = data.apply(get_query_w2v_similar, axis=1)

    data['prefix_title_sim'] = data.apply(get_prefix_w2v_similar, axis=1)

    print(start - time.time())
    return data

def kmeans(data):
    from sklearn.cluster import KMeans
    features = ['prefix','title','tag']
    for feature in features:
        data[feature] = LabelEncoder().fit_transform(data[feature])
    columns = ['query_prediction', 'label', 'flag', 'is_prefix_contains_upper_english', 'item_dict','query_word','query_score']
    data_ = data.drop(columns,axis=1)
    kmeans = KMeans(n_clusters=25,init='k-means++',max_iter=300,verbose=1,n_jobs=-1)
    a = kmeans.fit_predict(data_)
    data['kmeans'] = a
    return data

def co_feature_del(data):
    threshold = 0.99
    # Absolute value correlation matrix
    corr_matrix = data.corr().abs()
    # corr_matrix.head()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    # upper.head()
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    print('There are %d columns to remove.' % (len(to_drop)))
    data.drop(to_drop, axis=1, inplace=True)

    return data

def pre_data(data):
    # 先看一下，没有加入作为特征得
    train = data[data['flag'] == 1]
    vali = data[data['flag'] == 2]
    test = data[data['flag'] == 3]

    train_X_data = train.drop(['query_prediction', 'label', 'flag', 'is_prefix_contains_upper_english', 'item_dict','query_word','query_score'],
                              axis=1)
    vali_X_data = vali.drop(['query_prediction', 'label', 'flag', 'is_prefix_contains_upper_english', 'item_dict','query_word','query_score'],
                            axis=1)
    test_X_data = test.drop(['query_prediction', 'label', 'flag', 'is_prefix_contains_upper_english', 'item_dict','query_word','query_score'],
                            axis=1)
    train_Y_data = train['label']
    vali_label = vali['label']

    return train_X_data,vali_X_data,test_X_data,train_Y_data,vali_label

def lgb_test(train_X_data,vali_X_data,train_Y_data,vali_label):

    predict = []
    clf = lgb.LGBMClassifier(
    boosting_type='gbdt', subsample=1, colsample_bytree=1,
    max_depth=-1, n_estimators=10000, objective='binary',min_child_weight = 10,
    subsample_freq=1, num_leaves=127, reg_alpha=0,reg_lambda = 1.3,
    random_state=2018, n_jobs=-1, learning_rate=0.1)

    clf.fit(train_X_data, train_Y_data,eval_set=[(train_X_data,train_Y_data),(vali_X_data,vali_label)], eval_metric='logloss',verbose = 50, early_stopping_rounds=100)
    predict = clf.predict_proba(vali_X_data,num_iteration=clf.best_iteration_)
    return predict,clf

def find_best_thr(predict,vali_label):
    max = 0.0
    max_i =0.0
    predict = pd.DataFrame(predict)
    predict = predict[1]
    predict = pd.DataFrame(predict)
    for i in np.arange(0.25, 0.4500, 0.001):
        f1 = f1_score(vali_label, predict[1].map(lambda x: 0 if x < i else 1))
        if (f1 > max):
            max = f1_score(vali_label, predict[1].map(lambda x: 0 if x <= i else 1))
            max_i = i
    print('最大f1为', max)
    print('此时阈值为:', max_i)

    return max,max_i

def get_feature0(item):
    try:
        return item['item_dict']['title_max_similar']
    except:
        return np.nan


def get_feature1(item):
    try:
        return item['item_dict']['title_mean_similar']
    except:
        return np.nan


def get_feature2(item):
    try:
        return item['item_dict']['title_weight_similar']
    except:
        return np.nan


def get_feature3(item):
    try:
        return item['item_dict']['prefiix_max_similar']
    except:
        return np.nan


def get_feature4(item):
    try:
        return item['item_dict']['prefix_mean_similar']
    except:
        return np.nan


def get_feature5(item):
    try:
        return item['item_dict']['prefix_weight_similar']
    except:
        return np.nan


def get_feature6(item):
    try:
        return max(item['query_score'])
    except:
        return np.nan


def get_feature7(item):
    try:
        return min(item['query_score'])
    except:
        return np.nan


def get_feature8(item):
    try:
        return np.mean(item['query_score'])
    except:
        return np.nan

mode = "train"
data = get_data()
data = run_process(data)
prefix_dic = get_prefix_query_dic(data)
data['query_prediction'] = data.apply(null_query_prediction_process,axis=1)
data['query_prediction'] = data.apply(query_process,axis=1)
data['prefix'] = data.apply(get_complete_prefix,axis=1)  #把complete_prefix也当作基础特征
data = get_query_list_feature(data)
print(1)
data['prefix_word_len'] = data['prefix'].apply(lambda x: get_word_length(x))
data['title_word_len'] = data['title'].apply(lambda x: get_word_length(x))
data['title-prefix_word_len'] = data['title_word_len'] - data['prefix_word_len']
data['prefix_is_question'] = data['prefix'].apply(lambda x: prefix_is_question(x))
data['title_is_question'] = data['title'].apply(lambda x: title_is_question(x))
data['title_is_network'] = data['title'].apply(lambda x: title_is_network(x))
data['prefix_is_network'] = data['prefix'].apply(lambda x: prefix_is_network(x))
data['prefix_title_leve_dist'] = data.apply(prefix_title_leve_dist, axis=1)
data['prefix_title_leve_rate'] = data.apply(prefix_title_leve_rate, axis=1)
print(2)
data['query_prediction_len'] = data['query_word'].apply(lambda x: len(x))
data['prefix_len'] = data['prefix'].apply(lambda x: len(x))
data['title_len'] = data['title'].apply(lambda x: len(x))
data['title-prefix_len'] = data['title_len'] - data['prefix_len']
data['title_is_in_query'] = data.apply(title_is_in_query, axis=1)
data['is_prefix_in_title'] = data.apply(prefix_is_in_title, axis=1)
print('start_w2v_feature')
word_w2v_model = get_word_w2v_model()
data = get_query_sim_feature(data)

data['title_max_similar'] = data.apply(get_feature0, axis=1)
data['title_mean_similar'] = data.apply(get_feature1, axis=1)
data['title_weight_similar'] = data.apply(get_feature2, axis=1)
data['prefix_max_similar'] = data.apply(get_feature3, axis=1)
data['prefix_mean_similar'] = data.apply(get_feature4, axis=1)
data['prefix_weight_similar'] = data.apply(get_feature5, axis=1)
data['query_score_max'] = data.apply(get_feature6, axis=1)
data['query_score_min'] = data.apply(get_feature7, axis=1)
data['query_score_mean'] = data.apply(get_feature8, axis=1)

print('basic_feature_finish')

columns = ['title-prefix_len',
       'query_score_max', 'query_score_mean', 'query_score_min',
       'title_is_in_query', 'is_prefix_in_title'
       , 'title_max_similar',
       'title_mean_similar', 'title_weight_similar','prefix_title_sim','prefix_weight_similar','prefix_mean_similar','prefix_max_similar']
for column in columns:
    data[column] = data[column].fillna(data[column].mean())


data.to_csv("/home/ccit/tkhoon/liuyunEtlData/feature.csv",index=False)

print('baisc_feature_saved')
data['prefix_title_sim_bin'] = pd.qcut(data['prefix_title_sim'],8)
data['prefix_title_sim_bin'] = pd.factorize(data['prefix_title_sim_bin'])[0]

data['title_weight_similar_bin'] = pd.qcut(data['title_weight_similar'],10)
data['title_weight_similar_bin'] = pd.factorize(data['title_weight_similar_bin'])[0]

data['title_mean_similar_bin'] = pd.qcut(data['title_mean_similar'],5)
data['title_mean_similar_bin'] = pd.factorize(data['title_mean_similar_bin'])[0]

data['title_max_similar_bin'] = pd.qcut(data['title_max_similar'],5)
data['title_max_similar_bin']  = pd.factorize(data['title_max_similar_bin'])[0]

data['prefix_weight_similar_bin'] = pd.qcut(data['prefix_weight_similar'],10)
data['prefix_weight_similar_bin'] = pd.factorize(data['prefix_weight_similar_bin'])[0]

data['prefix_mean_similar_bin'] = pd.qcut(data['prefix_mean_similar'],5)
data['prefix_mean_similar_bin'] = pd.factorize(data['prefix_mean_similar_bin'])[0]

data['prefix_max_similar_bin'] = pd.qcut(data['prefix_max_similar'],3)
data['prefix_max_similar_bin']  = pd.factorize(data['prefix_max_similar_bin'])[0]

print('start_kmeans')
data = kmeans(data)
print('start_run_feature')


features = ['prefix', 'title', 'tag','is_prefix_in_title','title_is_in_query'
            ,'prefix_word_len','title_word_len','prefix_is_question','prefix_is_network','query_score_mean','title_weight_similar'
           ,'query_prediction_len','prefix_len','title_len','prefix_weight_similar']
for feature in features:
    a = data[feature].value_counts().to_dict()
    data[feature+'_count'] = data[feature].apply(lambda x:a[x])
    del a
    gc.collect()
    print(feature)
print('part_1_finish')
gc.collect()

for  i in range(len(features)):
    for j in range(i+1,len(features)):
        new_feature = features[i]+'_'+features[j]
        data[new_feature] = data[features[i]].astype(str) + '_' + data[features[j]].astype(str)
        data[new_feature] = LabelEncoder().fit_transform(data[new_feature])
        new_feature_count = new_feature + '_count'
        a = data[new_feature].value_counts().to_dict()
        data[new_feature_count] = data[new_feature].apply(lambda x : a[x])
        gc.collect()
        del a
        print(i)
print('part2_finish')
gc.collect()

pos_features = ['title_weight_similar_bin', 'title_mean_similar_bin','title_max_similar_bin','prefix_title_sim_bin','query_prediction_len','prefix_len','title_len',
                'title-prefix_len','prefix_word_len','title_word_len','title-prefix_word_len','prefix_max_similar_bin','prefix_mean_similar_bin',
               'prefix_weight_similar_bin']
for feature in pos_features:
    train = data[(data['flag'] == 1)]
    temp = train.groupby(feature,as_index=False)['label'].agg({feature+'_click_':'sum',feature+'_count_':'count'})
    temp[feature+'_ctr_'] = temp[feature+'_click_'] / (temp[feature+'_count_']+5)
    data = pd.merge(data,temp,on=feature,how='left')
    del train
    del temp
    gc.collect()
    print(feature)
print('part3_finish')

features = ['prefix','title','tag']
for feature in features:
    data[feature] = LabelEncoder().fit_transform(data[feature])


print('feature_del')
data = co_feature_del(data)
print('lgb_test')
train_X_data,vali_X_data,test_X_data,train_Y_data,vali_label = pre_data(data)
predict,clf = lgb_test(train_X_data,vali_X_data,train_Y_data,vali_label)

best_f1,best_thr = find_best_thr(predict,vali_label)

print('best_f1:',best_f1)
print('best_thr:',best_thr)

submit = clf.predict_proba(test_X_data,num_iteration=clf.best_iteration_)
submit = pd.DataFrame(submit)
submit.to_csv("/home/ccit/tkhoon/liuyunEtlData/feature.csv",index=False)

start_run_feature
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
7
7
7
7
7
7
7
7
7
7
7
7
7
7
8
8
8
8
8
8
8
8
8
8
8
8
8
9
9
9
9
9
9
9
9
9
9
9
9
10
10
10
10
10
10
10
10
10
10
10
11
11
11
11
11
11
11
11
11
11
12
12
12
12
12
12
12
12
12
13
13
13
13
13
13
13
13
14
14
14
14
14
14
14
15
15
15
15
15
15
16
16
16
16
16
17
17
17
17
18
18
18
19
19
20
part2_finish


MemoryError: 

# duqu shuju banben

In [5]:
features = ['prefix','title','tag']
for feature in features:
    data[feature] = LabelEncoder().fit_transform(data[feature])
columns = ['query_prediction', 'label', 'flag', 'is_prefix_contains_upper_english', 'item_dict','query_word','query_score']
data_ = data.drop(columns,axis=1)

In [None]:
import time

import Levenshtein
from gensim import matutils
import pandas as pd
import numpy as np
import json
import re
import warnings
import os
import jieba
from sklearn.preprocessing import LabelEncoder
from operator import itemgetter
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold as skf
from sklearn.model_selection import train_test_split
from gensim.models import word2vec
from gensim.models.keyedvectors import KeyedVectors
from sklearn.metrics import f1_score
import gc

def kmeans(data):
    from sklearn.cluster import KMeans
    features = ['prefix','title','tag']
    for feature in features:
        data[feature] = LabelEncoder().fit_transform(data[feature])
    columns = ['query_prediction', 'label', 'flag', 'is_prefix_contains_upper_english', 'item_dict','query_word','query_score']
    data_ = data.drop(columns,axis=1)
    kmeans = KMeans(n_clusters=25,init='k-means++',max_iter=300,verbose=1,n_jobs=-1)
    a = kmeans.fit_predict(data_)
    data['kmeans'] = a
    del data_
    return data

def co_feature_del(data):
    threshold = 0.99
    # Absolute value correlation matrix
    corr_matrix = data.corr().abs()
    # corr_matrix.head()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    # upper.head()
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    print('There are %d columns to remove.' % (len(to_drop)))
    data.drop(to_drop, axis=1, inplace=True)
    
    del upper
    del to_drop
    del corr_matrix
    return data

def pre_data(data):
    # 先看一下，没有加入作为特征得
    train = data[data['flag'] == 1]
    vali = data[data['flag'] == 2]
    test = data[data['flag'] == 3]

    train_X_data = train.drop(['query_prediction', 'label', 'flag', 'is_prefix_contains_upper_english', 'item_dict','query_word','query_score'],
                              axis=1)
    vali_X_data = vali.drop(['query_prediction', 'label', 'flag', 'is_prefix_contains_upper_english', 'item_dict','query_word','query_score'],
                            axis=1)
    test_X_data = test.drop(['query_prediction', 'label', 'flag', 'is_prefix_contains_upper_english', 'item_dict','query_word','query_score'],
                            axis=1)
    train_Y_data = train['label']
    vali_label = vali['label']
    
    del train
    del vali
    del test
    
    return train_X_data,vali_X_data,test_X_data,train_Y_data,vali_label

def lgb_test(train_X_data,vali_X_data,train_Y_data,vali_label):

    predict = []
    clf = lgb.LGBMClassifier(
    boosting_type='gbdt', subsample=1, colsample_bytree=1,
    max_depth=-1, n_estimators=10000, objective='binary',min_child_weight = 10,
    subsample_freq=1, num_leaves=127, reg_alpha=0,reg_lambda = 1.3,
    random_state=2018, n_jobs=-1, learning_rate=0.1)

    clf.fit(train_X_data, train_Y_data,eval_set=[(train_X_data,train_Y_data),(vali_X_data,vali_label)], eval_metric='logloss',verbose = 50, early_stopping_rounds=100)
    predict = clf.predict_proba(vali_X_data,num_iteration=clf.best_iteration_)
    return predict,clf

def find_best_thr(predict,vali_label):
    max = 0.0
    max_i =0.0
    predict = pd.DataFrame(predict)
    predict = predict[1]
    predict = pd.DataFrame(predict)
    for i in np.arange(0.25, 0.4500, 0.001):
        f1 = f1_score(vali_label, predict[1].map(lambda x: 0 if x < i else 1))
        if (f1 > max):
            max = f1_score(vali_label, predict[1].map(lambda x: 0 if x <= i else 1))
            max_i = i
    print('最大f1为', max)
    print('此时阈值为:', max_i)

    return max,max_i

data = pd.read_csv("/home/ccit/tkhoon/liuyunEtlData/feature.csv")
data['prefix'] = data['prefix'].astype(str)
data['title'] = data['title'].astype(str)
data['prefix_max_similar'] = data['prefix_max_similar'].fillna(data['prefix_max_similar'].mean())
data['prefix_title_sim_bin'] = pd.qcut(data['prefix_title_sim'],8)
data['prefix_title_sim_bin'] = pd.factorize(data['prefix_title_sim_bin'])[0]

data['title_weight_similar_bin'] = pd.qcut(data['title_weight_similar'],10)
data['title_weight_similar_bin'] = pd.factorize(data['title_weight_similar_bin'])[0]

data['title_mean_similar_bin'] = pd.qcut(data['title_mean_similar'],5)
data['title_mean_similar_bin'] = pd.factorize(data['title_mean_similar_bin'])[0]

data['title_max_similar_bin'] = pd.qcut(data['title_max_similar'],5)
data['title_max_similar_bin']  = pd.factorize(data['title_max_similar_bin'])[0]

data['prefix_weight_similar_bin'] = pd.qcut(data['prefix_weight_similar'],10)
data['prefix_weight_similar_bin'] = pd.factorize(data['prefix_weight_similar_bin'])[0]

data['prefix_mean_similar_bin'] = pd.qcut(data['prefix_mean_similar'],5)
data['prefix_mean_similar_bin'] = pd.factorize(data['prefix_mean_similar_bin'])[0]

data['prefix_max_similar_bin'] = pd.qcut(data['prefix_max_similar'],3)
data['prefix_max_similar_bin']  = pd.factorize(data['prefix_max_similar_bin'])[0]
print('run_kmeans')
data = kmeans(data)
print('run_feature')
gc.collect()

features = ['kmeans','prefix', 'title', 'tag','is_prefix_in_title','title_is_in_query'
            ,'prefix_word_len','title_word_len','prefix_is_question','prefix_is_network','query_score_mean','title_weight_similar'
           ,'query_prediction_len','prefix_len','title_len','prefix_weight_similar']
for feature in features:
    a = data[feature].value_counts().to_dict()
    data[feature+'_count'] = data[feature].apply(lambda x:a[x])
    del a
    gc.collect()
    print(feature)
print('part_1_finish')
gc.collect()

for  i in range(len(features)):
    for j in range(i+1,len(features)):
        new_feature = features[i]+'_'+features[j]
        data[new_feature] = data[features[i]].astype(str) + '_' + data[features[j]].astype(str)
        data[new_feature] = LabelEncoder().fit_transform(data[new_feature])
        new_feature_count = new_feature + '_count'
        a = data[new_feature].value_counts().to_dict()
        data[new_feature_count] = data[new_feature].apply(lambda x : a[x])
        gc.collect()
        del a
        print(i)
print('part2_finish')
gc.collect()

pos_features = ['kmeans','title_weight_similar_bin', 'title_mean_similar_bin','title_max_similar_bin','prefix_title_sim_bin','query_prediction_len','prefix_len','title_len',
                'title-prefix_len','prefix_word_len','title_word_len','title-prefix_word_len','prefix_max_similar_bin','prefix_mean_similar_bin',
               'prefix_weight_similar_bin']
for feature in pos_features:
    train = data[(data['flag'] == 1)]
    temp = train.groupby(feature,as_index=False)['label'].agg({feature+'_click_':'sum',feature+'_count':'count'})
    data = pd.merge(data,temp,on=feature,how='left')
    del train
    del temp
    gc.collect()
    print(feature)
print('part3_finish')

features = ['prefix','title','tag']
for feature in features:
    data[feature] = LabelEncoder().fit_transform(data[feature])

# data.to_csv("/home/ccit/tkhoon/liuyunEtlData/total_feature.csv",index=False)
print('run_co_feature_del')
data = co_feature_del(data)
gc.collect()
train_X_data,vali_X_data,test_X_data,train_Y_data,vali_label = pre_data(data)
print('run_lgb_test')
predict,clf = lgb_test(train_X_data,vali_X_data,test_X_data,train_Y_data,vali_label)

run_kmeans
Initialization complete
start iteration
done sorting
end inner loop
Initialization complete
Iteration 0, inertia 3515112098289364.5
start iteration
done sorting
start iteration
done sorting
Initialization complete
end inner loop
end inner loop
start iteration
done sorting
end inner loop
Initialization complete
Iteration 0, inertia 3700553427652613.0
start iteration
done sorting
end inner loop
Iteration 1, inertia 3281626668368010.0
start iteration
done sorting
start iteration
done sorting
Initialization complete
end inner loop
end inner loop
Iteration 0, inertia 3598559032286564.0
start iteration
done sorting
end inner loop
start iteration
done sorting
end inner loop
Iteration 1, inertia 3399592215004150.0
start iteration
done sorting
end inner loop
Iteration 0, inertia 3483082362822402.0
start iteration
done sorting
Initialization complete
Iteration 2, inertia 3217975337403734.5
start iteration
done sorting
end inner loop
end inner loop
Iteration 1, inertia 3313006297703744

start iteration
done sorting
end inner loop
Iteration 15, inertia 3113279583268071.5
start iteration
done sorting
end inner loop
Iteration 16, inertia 3211207099442874.0
start iteration
done sorting
Iteration 14, inertia 3180683813337076.0
start iteration
done sorting
Iteration 3, inertia 3245652252888933.5
start iteration
end inner loop
done sorting
Iteration 7, inertia 3159389692451440.0
start iteration
done sorting
end inner loop
Iteration 2, inertia 3291470289456611.0
start iteration
done sorting
end inner loop
end inner loop
Iteration 15, inertia 3184617044078598.0
start iteration
done sorting
end inner loop
end inner loop
Iteration 11, inertia 3201054136117692.5
start iteration
done sorting
end inner loop
Iteration 16, inertia 3108239873553803.0
start iteration
done sorting
end inner loop
Iteration 17, inertia 3205441552747970.5
start iteration
done sorting
Iteration 15, inertia 3178995437371760.5
start iteration
done sorting
end inner loop
end inner loop
Iteration 16, inertia 31

start iteration
done sorting
end inner loop
Iteration 32, inertia 3163763668851623.0
start iteration
done sorting
end inner loop
end inner loop
Iteration 30, inertia 3171974455800034.0
start iteration
done sorting
end inner loop
Iteration 31, inertia 3176461057126718.5
start iteration
done sorting
end inner loop
Iteration 32, inertia 3088847375929864.0
start iteration
done sorting
end inner loop
Iteration 33, inertia 3163300140482900.5
start iteration
done sorting
Iteration 7, inertia 3198444774840773.5
start iteration
done sorting
end inner loop
Iteration 31, inertia 3171626519784121.0
start iteration
done sorting
end inner loop
end inner loop
Iteration 32, inertia 3175865593188822.5
start iteration
done sorting
end inner loop
Iteration 8, inertia 3219347973724403.0
start iteration
done sorting
end inner loop
Iteration 12, inertia 3147809529554449.0
start iteration
done sorting
end inner loop
Iteration 33, inertia 3088738633009872.0
start iteration
done sorting
end inner loop
Iteratio

end inner loop
end inner loop
Iteration 15, inertia 3179921422605791.5
start iteration
done sorting
end inner loop
Iteration 49, inertia 3146442481724631.0
start iteration
done sorting
end inner loop
Iteration 13, inertia 3225879442874979.0
start iteration
done sorting
Iteration 14, inertia 3068206456485170.5
start iteration
done sorting
end inner loop
Iteration 51, inertia 3160018169762413.0
start iteration
done sorting
end inner loop
end inner loop
Iteration 49, inertia 3167311651305387.0
start iteration
done sorting
end inner loop
Iteration 16, inertia 3179135781481431.0
start iteration
done sorting
end inner loop
Iteration 50, inertia 3143656811537468.0
start iteration
done sorting
end inner loop
Iteration 52, inertia 3159895531477936.0
start iteration
done sorting
end inner loop
Iteration 50, inertia 3167035733750926.0
start iteration
done sorting
end inner loop
Iteration 17, inertia 3178298242672606.0
start iteration
done sorting
Iteration 18, inertia 3143186189113356.5
start ite

end inner loop
Iteration 68, inertia 3136537549153962.0
center shift 2.394081e+02 within tolerance 9.643441e+04
Iteration 22, inertia 3065798270152123.5
start iteration
done sorting
Iteration 21, inertia 3208056259978383.0
start iteration
done sorting
end inner loop
Iteration 33, inertia 3164524838051173.0
start iteration
done sorting
end inner loop
end inner loop
Iteration 66, inertia 3132894982512498.0
start iteration
done sorting
end inner loop
Iteration 22, inertia 3174856276558729.0
start iteration
done sorting
end inner loop
Iteration 34, inertia 3164494294986230.5
start iteration
done sorting
end inner loop
Iteration 67, inertia 3132823910943393.0
start iteration
done sorting
end inner loop
Iteration 23, inertia 3065785456780789.5
start iteration
done sorting
Iteration 22, inertia 3204569779336109.5
start iteration
done sorting
end inner loop
end inner loop
Iteration 35, inertia 3164379885412645.0
start iteration
done sorting
end inner loop
Iteration 23, inertia 3171057413556334

In [2]:
import time

import Levenshtein
from gensim import matutils
import pandas as pd
import numpy as np
import json
import re
import warnings
import os
import jieba
from sklearn.preprocessing import LabelEncoder
from operator import itemgetter
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold as skf
from sklearn.model_selection import train_test_split
from gensim.models import word2vec
from gensim.models.keyedvectors import KeyedVectors
from sklearn.metrics import f1_score
import gc

def kmeans(data):
    from sklearn.cluster import KMeans
    features = ['prefix','title','tag']
    for feature in features:
        data[feature] = LabelEncoder().fit_transform(data[feature])
    columns = ['query_prediction', 'label', 'flag', 'is_prefix_contains_upper_english', 'item_dict','query_word','query_score']
    data_ = data.drop(columns,axis=1)
    kmeans = KMeans(n_clusters=25,init='k-means++',max_iter=300,verbose=1,n_jobs=-1)
    a = kmeans.fit_predict(data_)
    data['kmeans'] = a
    del data_
    return data

def co_feature_del(data):
    threshold = 0.99
    # Absolute value correlation matrix
    corr_matrix = data.corr().abs()
    # corr_matrix.head()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    # upper.head()
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    print('There are %d columns to remove.' % (len(to_drop)))
    data.drop(to_drop, axis=1, inplace=True)
    
    del upper
    del to_drop
    del corr_matrix
    return data

def pre_data(data):
    # 先看一下，没有加入作为特征得
    train = data[data['flag'] == 1]
    vali = data[data['flag'] == 2]
    test = data[data['flag'] == 3]

    train_X_data = train.drop(['query_prediction', 'label', 'flag', 'is_prefix_contains_upper_english', 'item_dict','query_word','query_score'],
                              axis=1)
    vali_X_data = vali.drop(['query_prediction', 'label', 'flag', 'is_prefix_contains_upper_english', 'item_dict','query_word','query_score'],
                            axis=1)
    test_X_data = test.drop(['query_prediction', 'label', 'flag', 'is_prefix_contains_upper_english', 'item_dict','query_word','query_score'],
                            axis=1)
    train_Y_data = train['label']
    vali_label = vali['label']
    
    del train
    del vali
    del test
    del data
    return train_X_data,vali_X_data,test_X_data,train_Y_data,vali_label

def lgb_test(train_X_data,vali_X_data,train_Y_data,vali_label):

    predict = []
    clf = lgb.LGBMClassifier(
    boosting_type='gbdt', subsample=1, colsample_bytree=1,
    max_depth=-1, n_estimators=10000, objective='binary',min_child_weight = 10,
    subsample_freq=1, num_leaves=127, reg_alpha=0,reg_lambda = 1.3,
    random_state=2018, n_jobs=-1, learning_rate=0.1)

    clf.fit(train_X_data, train_Y_data,eval_set=[(train_X_data,train_Y_data),(vali_X_data,vali_label)], eval_metric='logloss',verbose = 50, early_stopping_rounds=100)
    predict = clf.predict_proba(vali_X_data,num_iteration=clf.best_iteration_)
    return predict,clf

def find_best_thr(predict,vali_label):
    max = 0.0
    max_i =0.0
    predict = pd.DataFrame(predict)
    predict = predict[1]
    predict = pd.DataFrame(predict)
    for i in np.arange(0.25, 0.4500, 0.001):
        f1 = f1_score(vali_label, predict[1].map(lambda x: 0 if x < i else 1))
        if (f1 > max):
            max = f1_score(vali_label, predict[1].map(lambda x: 0 if x <= i else 1))
            max_i = i
    print('最大f1为', max)
    print('此时阈值为:', max_i)

    return max,max_i

data = pd.read_csv("/home/ccit/tkhoon/liuyunEtlData/feature.csv")
data['prefix'] = data['prefix'].astype(str)
data['title'] = data['title'].astype(str)
data['prefix_max_similar'] = data['prefix_max_similar'].fillna(data['prefix_max_similar'].mean())
data['prefix_title_sim_bin'] = pd.qcut(data['prefix_title_sim'],8)
data['prefix_title_sim_bin'] = pd.factorize(data['prefix_title_sim_bin'])[0]

data['title_weight_similar_bin'] = pd.qcut(data['title_weight_similar'],10)
data['title_weight_similar_bin'] = pd.factorize(data['title_weight_similar_bin'])[0]

data['title_mean_similar_bin'] = pd.qcut(data['title_mean_similar'],5)
data['title_mean_similar_bin'] = pd.factorize(data['title_mean_similar_bin'])[0]

data['title_max_similar_bin'] = pd.qcut(data['title_max_similar'],5)
data['title_max_similar_bin']  = pd.factorize(data['title_max_similar_bin'])[0]

data['prefix_weight_similar_bin'] = pd.qcut(data['prefix_weight_similar'],10)
data['prefix_weight_similar_bin'] = pd.factorize(data['prefix_weight_similar_bin'])[0]

data['prefix_mean_similar_bin'] = pd.qcut(data['prefix_mean_similar'],5)
data['prefix_mean_similar_bin'] = pd.factorize(data['prefix_mean_similar_bin'])[0]

data['prefix_max_similar_bin'] = pd.qcut(data['prefix_max_similar'],3)
data['prefix_max_similar_bin']  = pd.factorize(data['prefix_max_similar_bin'])[0]

print('run_kmeans')
data = kmeans(data)
print('run_feature')
gc.collect()

features = ['kmeans','prefix', 'title', 'tag','is_prefix_in_title','title_is_in_query'
            ,'prefix_word_len','title_word_len','prefix_is_question',
            'prefix_is_network','query_score_mean','title_weight_similar']

for feature in features:
    a = data[feature].value_counts().to_dict()
    data[feature+'_count'] = data[feature].apply(lambda x:a[x])
    del a
    gc.collect()
    print(feature)
print('part_1_finish')
gc.collect()

for  i in range(len(features)):
    for j in range(i+1,len(features)):
        new_feature = features[i]+'_'+features[j]
        data[new_feature] = data[features[i]].astype(str) + '_' + data[features[j]].astype(str)
        data[new_feature] = LabelEncoder().fit_transform(data[new_feature])
        new_feature_count = new_feature + '_count'
        a = data[new_feature].value_counts().to_dict()
        data[new_feature_count] = data[new_feature].apply(lambda x : a[x])
        gc.collect()
        del a
        print(i)
print('part2_finish')
gc.collect()

pos_features = ['kmeans','title_weight_similar_bin', 'title_mean_similar_bin','title_max_similar_bin','prefix_title_sim_bin','query_prediction_len','prefix_len','title_len',
                'title-prefix_len','prefix_word_len','title_word_len','title-prefix_word_len','prefix_max_similar_bin','prefix_mean_similar_bin',
               'prefix_weight_similar_bin']
for feature in pos_features:
    train = data[(data['flag'] == 1)]
    temp = train.groupby(feature,as_index=False)['label'].agg({feature+'_click_':'sum'})
#     temp[feature+'_ctr_'] = temp[feature+'_click_'] / (temp[feature+'_count'] + 5)
    data = pd.merge(data,temp,on=feature,how='left')
    del train
    del temp
    gc.collect()
    print(feature)
print('part3_finish')

features = ['prefix','title','tag']
for feature in features:
    data[feature] = LabelEncoder().fit_transform(data[feature])

# data.to_csv("/home/ccit/tkhoon/liuyunEtlData/total_feature.csv",index=False)
print('run_co_feature_del')
data = co_feature_del(data)
gc.collect()
train_X_data,vali_X_data,test_X_data,train_Y_data,vali_label = pre_data(data)
print('run_lgb_test')
predict,clf = lgb_test(train_X_data,vali_X_data,train_Y_data,vali_label)

best_f1,best_thr = find_best_thr(predict,vali_label)

print('best_f1:',best_f1)
print('best_thr:',best_thr)

run_lgb_test
Training until validation scores don't improve for 100 rounds.
[50]	training's binary_logloss: 0.504337	valid_1's binary_logloss: 0.516818
[100]	training's binary_logloss: 0.485752	valid_1's binary_logloss: 0.501558
[150]	training's binary_logloss: 0.476202	valid_1's binary_logloss: 0.494058
[200]	training's binary_logloss: 0.469395	valid_1's binary_logloss: 0.489064
[250]	training's binary_logloss: 0.464152	valid_1's binary_logloss: 0.485088
[300]	training's binary_logloss: 0.459692	valid_1's binary_logloss: 0.481931
[350]	training's binary_logloss: 0.455959	valid_1's binary_logloss: 0.479275
[400]	training's binary_logloss: 0.452784	valid_1's binary_logloss: 0.477211
[450]	training's binary_logloss: 0.450178	valid_1's binary_logloss: 0.475447
[500]	training's binary_logloss: 0.447757	valid_1's binary_logloss: 0.473622
[550]	training's binary_logloss: 0.44562	valid_1's binary_logloss: 0.472192
[600]	training's binary_logloss: 0.44353	valid_1's binary_logloss: 0.470901
[65

[5300]	training's binary_logloss: 0.375745	valid_1's binary_logloss: 0.440506
[5350]	training's binary_logloss: 0.375365	valid_1's binary_logloss: 0.440392
[5400]	training's binary_logloss: 0.374982	valid_1's binary_logloss: 0.440281
[5450]	training's binary_logloss: 0.374641	valid_1's binary_logloss: 0.440218
[5500]	training's binary_logloss: 0.374255	valid_1's binary_logloss: 0.440113
[5550]	training's binary_logloss: 0.373892	valid_1's binary_logloss: 0.439975
[5600]	training's binary_logloss: 0.373529	valid_1's binary_logloss: 0.439927
[5650]	training's binary_logloss: 0.373159	valid_1's binary_logloss: 0.439879
[5700]	training's binary_logloss: 0.372826	valid_1's binary_logloss: 0.439754
[5750]	training's binary_logloss: 0.372457	valid_1's binary_logloss: 0.439692
[5800]	training's binary_logloss: 0.372136	valid_1's binary_logloss: 0.439686
[5850]	training's binary_logloss: 0.371793	valid_1's binary_logloss: 0.439575
[5900]	training's binary_logloss: 0.371488	valid_1's binary_logl

In [23]:
submit = pd.DataFrame(clf.predict_proba(test_X_data,num_iteration=clf.best_iteration_))[1]
submit = submit.map(lambda x: 0 if x <=  0.39900000000000013 else 1)
submit.to_csv('/home/ccit/tkhoon/liuyunEtlData/submit7387.csv',index=False,encoding='utf-8',header=None)

# 5kflod

In [34]:
def find_best_thr(predict,vali_label):
    max = 0.0
    max_i =0.0
    predict = pd.DataFrame(predict)
    predict = predict[1]
    predict = pd.DataFrame(predict)
    for i in np.arange(0.25, 0.4500, 0.001):
        f1 = f1_score(vali_label, predict[1].map(lambda x: 0 if x < i else 1))
        if (f1 > max):
            max = f1_score(vali_label, predict[1].map(lambda x: 0 if x <= i else 1))
            max_i = i
    print('最大f1为', max)
    print('此时阈值为:', max_i)

    return max,max_i
from  sklearn.model_selection import StratifiedKFold
# train_X_data = pd.concat([train_X_data,vali_X_data])
# train_Y_data = pd.concat([train_Y_data,vali_label])
# train_X_data.reset_index()
# train_Y_data.reset_index()
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=2018)
submit = []
vali_f1 = []
for k,(train_in,test_in) in enumerate(skf.split(train_X_data,train_Y_data)):
    print('kflod:',k)
    train_X,test_X,train_y,test_y  = train_X_data.loc[train_in],train_X_data.loc[test_in],train_Y_data.loc[train_in],train_Y_data.loc[test_in]
    
    clf = lgb.LGBMClassifier(
    boosting_type='gbdt', subsample=1, colsample_bytree=1,
    max_depth=-1, n_estimators=10000, objective='binary',min_child_weight = 10,
    subsample_freq=1, num_leaves=127, reg_alpha=0,reg_lambda = 1.3,
    random_state=2018, n_jobs=-1, learning_rate=0.1)

    clf.fit(train_X, train_y,eval_set=[(train_X,train_y),(test_X,test_y)], eval_metric='logloss',verbose = 50, early_stopping_rounds=150)
    
    predict = clf.predict_proba(vali_X_data,num_iteration=clf.best_iteration_)
    best_f1,best_thr = find_best_thr(predict,vali_label)
    vali_f1.append(best_f1)
    
    test_pred = pd.DataFrame(clf.predict_proba(test_X_data,num_iteration=clf.best_iteration_))[1]
    submit.append(test_pred)

for i in range(0,4):
    submit[0] += submit[i]
    vali_f1[0] += vali_f1[i]
avg_vali_f1 = vali_f1[0]/5
avg_test_pred = submit[0]/5
print('avg_test_f1:',avg_vali_f1)
avg_test_pred.to_csv('avg_test_pred.csv',index=False,encoding='utf-8',header=None)

kflod: 0
Training until validation scores don't improve for 150 rounds.
[50]	training's binary_logloss: 0.504774	valid_1's binary_logloss: 0.504718
[100]	training's binary_logloss: 0.486408	valid_1's binary_logloss: 0.486919
[150]	training's binary_logloss: 0.476188	valid_1's binary_logloss: 0.477379
[200]	training's binary_logloss: 0.469549	valid_1's binary_logloss: 0.471411
[250]	training's binary_logloss: 0.464549	valid_1's binary_logloss: 0.467065
[300]	training's binary_logloss: 0.460237	valid_1's binary_logloss: 0.463463
[350]	training's binary_logloss: 0.456288	valid_1's binary_logloss: 0.46019
[400]	training's binary_logloss: 0.452888	valid_1's binary_logloss: 0.457483
[450]	training's binary_logloss: 0.45012	valid_1's binary_logloss: 0.45536
[500]	training's binary_logloss: 0.447502	valid_1's binary_logloss: 0.453377
[550]	training's binary_logloss: 0.445085	valid_1's binary_logloss: 0.451574
[600]	training's binary_logloss: 0.442833	valid_1's binary_logloss: 0.450014
[650]	tr

[5300]	training's binary_logloss: 0.369608	valid_1's binary_logloss: 0.4231
[5350]	training's binary_logloss: 0.369219	valid_1's binary_logloss: 0.423077
[5400]	training's binary_logloss: 0.368802	valid_1's binary_logloss: 0.423034
[5450]	training's binary_logloss: 0.36838	valid_1's binary_logloss: 0.422981
[5500]	training's binary_logloss: 0.367969	valid_1's binary_logloss: 0.422932
[5550]	training's binary_logloss: 0.36758	valid_1's binary_logloss: 0.422916
[5600]	training's binary_logloss: 0.367155	valid_1's binary_logloss: 0.422875
[5650]	training's binary_logloss: 0.366766	valid_1's binary_logloss: 0.422835
[5700]	training's binary_logloss: 0.366379	valid_1's binary_logloss: 0.422822
[5750]	training's binary_logloss: 0.365978	valid_1's binary_logloss: 0.422781
[5800]	training's binary_logloss: 0.365566	valid_1's binary_logloss: 0.42271
[5850]	training's binary_logloss: 0.365232	valid_1's binary_logloss: 0.422682
[5900]	training's binary_logloss: 0.36491	valid_1's binary_logloss: 0

[3350]	training's binary_logloss: 0.388323	valid_1's binary_logloss: 0.427141
[3400]	training's binary_logloss: 0.387703	valid_1's binary_logloss: 0.426967
[3450]	training's binary_logloss: 0.387104	valid_1's binary_logloss: 0.426818
[3500]	training's binary_logloss: 0.386537	valid_1's binary_logloss: 0.426702
[3550]	training's binary_logloss: 0.386007	valid_1's binary_logloss: 0.426597
[3600]	training's binary_logloss: 0.385503	valid_1's binary_logloss: 0.426535
[3650]	training's binary_logloss: 0.384946	valid_1's binary_logloss: 0.426459
[3700]	training's binary_logloss: 0.384372	valid_1's binary_logloss: 0.426322
[3750]	training's binary_logloss: 0.383767	valid_1's binary_logloss: 0.426191
[3800]	training's binary_logloss: 0.383218	valid_1's binary_logloss: 0.426076
[3850]	training's binary_logloss: 0.382616	valid_1's binary_logloss: 0.425931
[3900]	training's binary_logloss: 0.382056	valid_1's binary_logloss: 0.425802
[3950]	training's binary_logloss: 0.381541	valid_1's binary_logl

[1300]	training's binary_logloss: 0.421799	valid_1's binary_logloss: 0.439246
[1350]	training's binary_logloss: 0.420647	valid_1's binary_logloss: 0.43873
[1400]	training's binary_logloss: 0.419534	valid_1's binary_logloss: 0.438206
[1450]	training's binary_logloss: 0.418432	valid_1's binary_logloss: 0.437743
[1500]	training's binary_logloss: 0.4173	valid_1's binary_logloss: 0.437255
[1550]	training's binary_logloss: 0.41621	valid_1's binary_logloss: 0.436753
[1600]	training's binary_logloss: 0.415181	valid_1's binary_logloss: 0.436336
[1650]	training's binary_logloss: 0.414219	valid_1's binary_logloss: 0.435936
[1700]	training's binary_logloss: 0.413284	valid_1's binary_logloss: 0.435585
[1750]	training's binary_logloss: 0.412381	valid_1's binary_logloss: 0.435217
[1800]	training's binary_logloss: 0.411502	valid_1's binary_logloss: 0.434866
[1850]	training's binary_logloss: 0.410567	valid_1's binary_logloss: 0.434507
[1900]	training's binary_logloss: 0.409614	valid_1's binary_logloss:

[6600]	training's binary_logloss: 0.360039	valid_1's binary_logloss: 0.423757
[6650]	training's binary_logloss: 0.359714	valid_1's binary_logloss: 0.42374
[6700]	training's binary_logloss: 0.359399	valid_1's binary_logloss: 0.423725
[6750]	training's binary_logloss: 0.359088	valid_1's binary_logloss: 0.423716
[6800]	training's binary_logloss: 0.358768	valid_1's binary_logloss: 0.423703
[6850]	training's binary_logloss: 0.358442	valid_1's binary_logloss: 0.423685
[6900]	training's binary_logloss: 0.358108	valid_1's binary_logloss: 0.423675
[6950]	training's binary_logloss: 0.357806	valid_1's binary_logloss: 0.423671
[7000]	training's binary_logloss: 0.357499	valid_1's binary_logloss: 0.423659
[7050]	training's binary_logloss: 0.357199	valid_1's binary_logloss: 0.423672
[7100]	training's binary_logloss: 0.356889	valid_1's binary_logloss: 0.423661
[7150]	training's binary_logloss: 0.35658	valid_1's binary_logloss: 0.423663
[7200]	training's binary_logloss: 0.356301	valid_1's binary_loglos

[4500]	training's binary_logloss: 0.376578	valid_1's binary_logloss: 0.425384
[4550]	training's binary_logloss: 0.376118	valid_1's binary_logloss: 0.42532
[4600]	training's binary_logloss: 0.375648	valid_1's binary_logloss: 0.42525
[4650]	training's binary_logloss: 0.37517	valid_1's binary_logloss: 0.42519
[4700]	training's binary_logloss: 0.374712	valid_1's binary_logloss: 0.425115
[4750]	training's binary_logloss: 0.374279	valid_1's binary_logloss: 0.425065
[4800]	training's binary_logloss: 0.373844	valid_1's binary_logloss: 0.425007
[4850]	training's binary_logloss: 0.373394	valid_1's binary_logloss: 0.424957
[4900]	training's binary_logloss: 0.372972	valid_1's binary_logloss: 0.424913
[4950]	training's binary_logloss: 0.372514	valid_1's binary_logloss: 0.424835
[5000]	training's binary_logloss: 0.372114	valid_1's binary_logloss: 0.424783
[5050]	training's binary_logloss: 0.371753	valid_1's binary_logloss: 0.424753
[5100]	training's binary_logloss: 0.371304	valid_1's binary_logloss:

[2850]	training's binary_logloss: 0.394704	valid_1's binary_logloss: 0.428912
[2900]	training's binary_logloss: 0.394007	valid_1's binary_logloss: 0.428724
[2950]	training's binary_logloss: 0.393354	valid_1's binary_logloss: 0.428532
[3000]	training's binary_logloss: 0.392732	valid_1's binary_logloss: 0.428369
[3050]	training's binary_logloss: 0.392103	valid_1's binary_logloss: 0.428215
[3100]	training's binary_logloss: 0.391476	valid_1's binary_logloss: 0.428072
[3150]	training's binary_logloss: 0.390826	valid_1's binary_logloss: 0.427904
[3200]	training's binary_logloss: 0.390187	valid_1's binary_logloss: 0.427697
[3250]	training's binary_logloss: 0.389573	valid_1's binary_logloss: 0.42756
[3300]	training's binary_logloss: 0.389034	valid_1's binary_logloss: 0.427438
[3350]	training's binary_logloss: 0.388515	valid_1's binary_logloss: 0.427358
[3400]	training's binary_logloss: 0.387929	valid_1's binary_logloss: 0.427205
[3450]	training's binary_logloss: 0.387373	valid_1's binary_loglo