In [None]:
import time

import Levenshtein
from gensim import matutils
import pandas as pd
import numpy as np
import json
import re
import warnings
import os
import jieba
from sklearn.preprocessing import LabelEncoder
from operator import itemgetter
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold as skf
from sklearn.model_selection import train_test_split
from gensim.models import word2vec
from gensim.models.keyedvectors import KeyedVectors
from sklearn.metrics import f1_score
import gc



def get_data():
    """Load the train / vali / test files and stack them into one DataFrame.

    Reads the module-level ``mode``: ``"test"`` selects the ``sdata_*`` files,
    any other value selects the ``data_*`` files.

    Returns:
        A DataFrame with the columns prefix, query_prediction, title, tag,
        label and flag. ``flag`` is 1 for train rows, 2 for vali rows and 3 for
        test rows; test rows carry the label -1. Rows whose label is null are
        dropped and the four text columns are cast to ``str``.
    """
    if mode == "test":
        train_path = '/home/ccit/tkhoon/data/sdata_train.csv'
        vali_path = '/home/ccit/tkhoon/data/sdata_vali.csv'
        test_path = '/home/ccit/tkhoon/data/sdata_test.csv'
    else:
        train_path = '/home/ccit/tkhoon/data/data_train.csv'
        vali_path = '/home/ccit/tkhoon/data/data_vali.csv'
        test_path = '/home/ccit/tkhoon/data/data_test.csv'

    text_columns = ['prefix', 'query_prediction', 'title', 'tag']

    # Train and test are read with one extra trailing column named '1'.
    train = pd.read_table(train_path, header=None,
                          names=text_columns + ['label', '1'], quoting=3)
    vali = pd.read_table(vali_path, header=None,
                         names=text_columns + ['label'], quoting=3)
    test = pd.read_table(test_path, header=None,
                         names=text_columns + ['1'], quoting=3)

    # Where the extra column is populated the trailing fields are moved one
    # position to the left (tag <- label, label <- '1').
    # NOTE(review): presumably these rows contain an extra separator - confirm.
    train_shifted = list(train[train['1'].notnull()].index)
    test_shifted = list(test[test['1'].notnull()].index)

    train.loc[train_shifted, 'tag'] = train.loc[train_shifted, 'label']
    train.loc[train_shifted, 'label'] = train.loc[train_shifted, '1']
    test.loc[test_shifted, 'tag'] = test.loc[test_shifted, '1']

    for frame in (train, test):
        frame.drop('1', axis=1, inplace=True)

    test['label'] = -1
    for frame, flag_value in ((train, 1), (vali, 2), (test, 3)):
        frame['flag'] = flag_value

    data = pd.concat([train, vali, test]).reset_index(drop=True)

    for column in text_columns:
        data[column] = data[column].astype(str)

    missing_label_rows = data[data["label"].isnull()].index
    data.drop(missing_label_rows, inplace=True)
    data['label'] = data['label'].astype(int)
    return data

def char_process(char):
    """Strip whitespace and a fixed set of punctuation characters from ``char``.

    Args:
        char: The text to clean. Non-string values are returned unchanged.

    Returns:
        The cleaned string, or ``char`` itself when it is not a string.
    """
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences (\s, \., \!, \/), which Python reports as a warning.
    pattern = r"[\s+\.\!\/_,$%^*(+\"\']+|[+??！，。？?、~@#￥%……&*（）:]+"
    try:
        return re.sub(pattern, "", char)
    except TypeError:
        # re.sub rejects non-string input; keep the value as it is.
        return char

def is_prefix_contains_upper_english(data):
    """Add a 0/1 column telling whether ``prefix`` has an ASCII upper-case letter.

    The column ``is_prefix_contains_upper_english`` is written onto ``data``
    in place, and the same DataFrame is returned.
    """
    has_upper = data['prefix'].apply(
        lambda text: re.search("[A-Z]", text) is not None)
    data['is_prefix_contains_upper_english'] = 0
    data.loc[has_upper, 'is_prefix_contains_upper_english'] = 1
    return data

def char_lowwer_process(item):
    """Return the row's prefix, lower-cased when two conditions hold.

    The prefix is lower-cased only if the row is flagged as containing an
    upper-case letter and its ``query_prediction`` is the string 'nan'.
    """
    flagged_upper = item['is_prefix_contains_upper_english'] == 1
    query_missing = item['query_prediction'] == 'nan'
    if flagged_upper and query_missing:
        return str.lower(item['prefix'])
    return item['prefix']

def title_char_lowwer_process(item):
    """Return the row's title, lower-cased under the same rule as the prefix.

    The title is lower-cased only if the row's prefix is flagged as containing
    an upper-case letter and ``query_prediction`` is the string 'nan'.
    """
    flagged_upper = item['is_prefix_contains_upper_english'] == 1
    query_missing = item['query_prediction'] == 'nan'
    should_lower = flagged_upper and query_missing
    return str.lower(item['title']) if should_lower else item['title']

def char_cleaner(char):
    """Keep only digits, ASCII letters, CJK characters and spaces, lower-cased.

    Non-string input is treated as the literal text "null".
    """
    text = char if isinstance(char, str) else "null"
    return re.sub("[^0-9a-zA-Z\u4E00-\u9FA5 ]", "", text).lower()

def is_prefix_contains_upper_english(data):
    """Flag rows whose ``prefix`` contains at least one ASCII upper-case letter.

    Writes the 0/1 column ``is_prefix_contains_upper_english`` onto ``data``
    in place and returns ``data``.
    """
    flag_column = 'is_prefix_contains_upper_english'
    uppercase = re.compile("[A-Z]")
    mask = data['prefix'].apply(lambda text: len(uppercase.findall(text)) > 0)
    data[flag_column] = 0
    data.loc[mask, flag_column] = 1
    return data

def query_process(item):
    """Parse the row's JSON ``query_prediction`` into a dict.

    Args:
        item: A row-like mapping with a ``query_prediction`` entry holding
            JSON text.

    Returns:
        The decoded dict. The sentinel string '{}' is returned when the entry
        is missing, is not valid JSON text, or decodes to something other than
        a dict; the other functions in this file test for that sentinel.

    Side effects:
        On success ``item['query_prediction']`` is replaced by the decoded
        dict, as before.
    """
    try:
        parsed = json.loads(item['query_prediction'])
    except (KeyError, TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
        return '{}'
    if not isinstance(parsed, dict):
        # Consumers call .items() / .keys() on the result.
        return '{}'
    item['query_prediction'] = parsed
    return parsed

def combine_tag(item):
    """Return the row's tag, mapping the tag '网页' onto '网站'."""
    tag = item['tag']
    return '网站' if tag == '网页' else tag
# Cache of prefix -> completed prefix, filled by get_complete_prefix.
complete_prefix_map = {}


def get_complete_prefix(item):
    """Expand the row's prefix to the best-scoring completion from its queries.

    Each predicted query is tokenised with jieba and its tokens are appended
    one by one until the accumulated text contains the prefix. Every token
    that follows adds the query's ratio to the score of that accumulated text.
    The highest-scoring text is returned and cached in ``complete_prefix_map``.

    The plain prefix is returned (and not cached) when ``query_prediction`` is
    the sentinel '{}' or when no candidate text received a score.
    """
    prefix = item['prefix']
    cached = complete_prefix_map.get(prefix)
    if cached is not None:
        return cached

    query_prediction = item['query_prediction']
    if query_prediction == '{}':
        return prefix

    prefix = str(prefix)
    candidate_scores = {}
    for query_text, query_ratio in query_prediction.items():
        accumulated = ""
        for token in jieba.lcut(query_text):
            if prefix in accumulated:
                # The prefix is already covered: score the accumulated text
                # once for each remaining token.
                if accumulated not in candidate_scores:
                    candidate_scores[accumulated] = 0.0
                candidate_scores[accumulated] += float(query_ratio)
            else:
                accumulated += token

    if not candidate_scores:
        return prefix

    ranked = sorted(candidate_scores.items(), key=itemgetter(1), reverse=True)
    best_candidate = ranked[0][0]
    complete_prefix_map[prefix] = best_candidate
    return best_candidate

def run_process(data):
    """Clean the text columns of ``data`` and merge equivalent tags.

    Steps, in order: strip invalid characters from prefix and title, flag
    prefixes containing upper-case letters, lower-case prefix and title for
    flagged rows whose query_prediction is 'nan', then merge tags.
    """
    # The title is cleaned as well because it is later compared to the prefix.
    for column in ('prefix', 'title'):
        data[column] = data[column].apply(char_process)

    data = is_prefix_contains_upper_english(data)

    # Only rows without a query_prediction are lower-cased.
    data['prefix'] = data.apply(char_lowwer_process, axis=1)
    data['title'] = data.apply(title_char_lowwer_process, axis=1)

    data['tag'] = data.apply(combine_tag, axis=1)
    return data

def get_prefix_query_dic(data):
    """Map each prefix to the first non-'nan' query_prediction seen for it.

    Args:
        data: DataFrame with ``prefix`` and ``query_prediction`` columns.

    Returns:
        A dict from prefix to query_prediction. Prefixes whose rows all have
        the string 'nan' as query_prediction are absent.
    """
    prefix_dic = {}
    # Zipping the two columns avoids building a Series per row, which is what
    # DataFrame.iterrows() does.
    for prefix, query_prediction in zip(data['prefix'], data['query_prediction']):
        if query_prediction != 'nan':
            # setdefault keeps the first value seen for a prefix.
            prefix_dic.setdefault(prefix, query_prediction)
    return prefix_dic

def null_query_prediction_process(item):
    """Fill a 'nan' query_prediction from the module-level ``prefix_dic``.

    Returns the entry stored for the row's prefix when the row's
    query_prediction is the string 'nan' and the prefix is a key of
    ``prefix_dic``; otherwise returns the row's own query_prediction.
    """
    current = item['query_prediction']
    if current == 'nan':
        prefix = item['prefix']
        if prefix in prefix_dic:
            return prefix_dic[prefix]
    return current

def move_useless_char(s):
    """Return ``s`` with whitespace and a fixed set of punctuation removed.

    Args:
        s: The string to clean.

    Raises:
        TypeError: If ``s`` is not a string (raised by ``re.sub``).
    """
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences (\s, \., \!, \/), which Python reports as a warning.
    return re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+??！，。？?、~@#￥%……&*（）:]+", "", s)

def query_prediction_text(query_prediction):
    """Return the cleaned query strings of a parsed query_prediction.

    The sentinel '{}' yields ``['PAD']``; otherwise every key of the dict is
    passed through ``move_useless_char``.
    """
    if query_prediction == '{}':
        return ['PAD']
    return [move_useless_char(query) for query in query_prediction.keys()]

def query_prediction_score(query_prediction):
    """Return the values of a parsed query_prediction as a list of floats.

    The sentinel '{}' yields ``np.nan`` instead of a list.
    """
    if query_prediction == '{}':
        return np.nan
    return [float(score) for score in query_prediction.values()]

def get_query_list_feature(data):
    """Add ``query_word`` and ``query_score`` columns derived from query_prediction.

    Both columns are written onto ``data`` in place; ``data`` is returned.
    """
    predictions = data['query_prediction']
    data['query_word'] = predictions.apply(query_prediction_text)
    data['query_score'] = data['query_prediction'].apply(query_prediction_score)
    return data

def get_word_length(item):
    """Return the number of tokens jieba produces for ``item``."""
    return len(jieba.lcut(item))

def title_is_in_query(item):
    """Return 1 when the row's title is one of its query words, else 0.

    Rows whose query_prediction is the sentinel '{}' always yield 0.
    """
    if item['query_prediction'] == '{}':
        return 0
    return 1 if item['title'] in item['query_word'] else 0

def prefix_is_in_title(item):
    """Return 1 when the prefix is a substring of the title, else 0.

    A prefix or title equal to the string 'nan' always yields 0.
    """
    if item['prefix'] == 'nan':
        return 0
    if item['title'] == 'nan':
        return 0
    return 1 if item['prefix'] in item['title'] else 0

def prefix_is_network(item):
    """Return 1 when ``item`` contains 'www', 'com' or 'http', else 0."""
    markers = ('www', 'com', 'http')
    return 1 if any(marker in item for marker in markers) else 0

def title_is_network(item):
    """Return 1 when ``item`` contains 'www', 'com' or 'http', else 0."""
    for marker in ('www', 'com', 'http'):
        if marker in item:
            return 1
    return 0

def prefix_is_question(item):
    """Return 1 when ``item`` contains one of six Chinese question words, else 0."""
    question_words = ('怎么', '什么', '哪', '多少', '谁', '如何')
    return 1 if any(word in item for word in question_words) else 0

def title_is_question(item):
    """Return 1 when ``item`` contains one of six Chinese question words, else 0."""
    for word in ('怎么', '什么', '哪', '多少', '谁', '如何'):
        if word in item:
            return 1
    return 0

def prefix_title_leve_dist(item):
    """Return the Levenshtein distance between the row's prefix and title.

    Falls back to 0 when the distance cannot be computed (for example when a
    field is missing or is not text).
    """
    try:
        return Levenshtein.distance(item['prefix'], item['title'])
    except Exception:
        # Best effort per row; unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit through.
        return 0

def prefix_title_leve_rate(item):
    """Return ``Levenshtein.ratio`` of the row's prefix and title.

    Falls back to 0 when the ratio cannot be computed (for example when a
    field is missing or is not text).
    """
    try:
        return Levenshtein.ratio(item['prefix'], item['title'])
    except Exception:
        # Best effort per row; unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit through.
        return 0
def get_word_w2v_model():
    """Load the binary word2vec vectors stored under ``resources/``.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    model_path = os.path.join("resources", "baike_26g_news_13g_novel_229g.bin")
    return KeyedVectors.load_word2vec_format(
        model_path, binary=True, unicode_errors="ignore")

# Length of the all-None placeholder list that _get_jieba_array builds when
# none of the tokens has an embedding.
# NOTE(review): presumably meant to match the dimensionality of the loaded
# word2vec model - confirm.
size = 100

def char_cleaner(char):
    """Lower-case ``char`` and drop everything but digits, letters, CJK and spaces.

    Non-string input yields the literal "null".
    """
    if not isinstance(char, str):
        # "null" passes through the filter and lower-casing unchanged.
        return "null"
    allowed_only = re.compile("[^0-9a-zA-Z\u4E00-\u9FA5 ]").sub("", char)
    return allowed_only.lower()

def _get_jieba_array(words):
    """Return the unit-length mean word2vec vector of the tokens in ``words``.

    ``words`` is cleaned with ``char_cleaner`` and tokenised with jieba.
    Tokens missing from the module-level ``word_w2v_model`` are skipped.

    Returns:
        The normalised mean vector, or a list of ``size`` None values when no
        token has an embedding. The callers in this file wrap ``np.dot`` in a
        ``try`` that catches TypeError, which is how the placeholder becomes
        NaN downstream.
    """
    cleaned = char_cleaner(words)

    vectors = []
    for word in jieba.lcut(cleaned):
        try:
            vectors.append(word_w2v_model[word])
        except KeyError:
            # Out-of-vocabulary token.
            continue

    if not vectors:
        # Previously this placeholder was built but never returned: the only
        # ``return`` sat inside the ``else`` branch, so None came back.
        return [None] * size
    return matutils.unitvec(np.array(vectors).mean(axis=0))

def _similarity_stats(query_vectors, target_vector):
    """Return (max, mean, weighted sum) of dot products against ``target_vector``.

    ``query_vectors`` is a list of ``(vector, ratio)`` pairs. A dot product
    that raises (for example because one side is a None placeholder) is
    recorded as NaN; the nan-aware reductions then ignore it.
    """
    similarities = []
    weighted = []
    for query_vector, ratio in query_vectors:
        try:
            similarity = np.dot(query_vector, target_vector)
        except (KeyError, ZeroDivisionError, TypeError):
            similarity = np.nan
        similarities.append(similarity)
        weighted.append(similarity * ratio)
    return np.nanmax(similarities), np.nanmean(similarities), np.nansum(weighted)


def get_query_w2v_similar(item):
    """Compare the top three predicted queries with the row's title and prefix.

    Args:
        item: A row with ``query_prediction`` (a dict of query -> ratio, or the
            sentinel '{}'), ``title`` and ``prefix``.

    Returns:
        A dict holding max / mean / ratio-weighted similarity for the title
        (``title_*_similar``) and for the prefix (``prefix_*_similar``). When
        there is no query prediction the six values are None.

        The prefix maximum is also stored under the misspelled key
        ``prefiix_max_similar``, which is the key ``get_feature3`` reads.
    """
    query_prediction = item['query_prediction']

    # An empty dict is treated like the '{}' sentinel; previously it fell
    # through both loops and the function returned None.
    if query_prediction == '{}' or not query_prediction:
        return {
            'prefix_max_similar': None,
            'prefix_mean_similar': None,
            'prefix_weight_similar': None,
            'title_max_similar': None,
            'title_mean_similar': None,
            'title_weight_similar': None,
        }

    # Rank by numeric ratio; the values are JSON strings, and comparing them
    # as text can mis-order them.
    ranked = sorted(query_prediction.items(),
                    key=lambda pair: float(pair[1]), reverse=True)
    query_vectors = [(_get_jieba_array(query), float(ratio))
                     for query, ratio in ranked[:3]]

    title_array = _get_jieba_array(item['title'])
    prefix_array = _get_jieba_array(item['prefix'])

    # Title and prefix statistics are computed from separate lists over all of
    # the top queries. Previously the prefix pass reused the title pass's
    # lists and returned after its first iteration.
    title_max, title_mean, title_weight = _similarity_stats(query_vectors, title_array)
    prefix_max, prefix_mean, prefix_weight = _similarity_stats(query_vectors, prefix_array)

    return {
        'title_max_similar': title_max,
        'title_mean_similar': title_mean,
        'title_weight_similar': title_weight,
        'prefiix_max_similar': prefix_max,  # legacy spelling read by get_feature3
        'prefix_max_similar': prefix_max,
        'prefix_mean_similar': prefix_mean,
        'prefix_weight_similar': prefix_weight,
    }

def get_prefix_w2v_similar(item):
    """Return the dot product of the prefix and title sentence vectors.

    Both vectors come from ``_get_jieba_array``; ``np.nan`` is returned when
    the dot product raises KeyError, ZeroDivisionError or TypeError.
    """
    title_vector = _get_jieba_array(item['title'])
    prefix_vector = _get_jieba_array(item['prefix'])
    try:
        return np.dot(prefix_vector, title_vector)
    except (KeyError, ZeroDivisionError, TypeError):
        return np.nan

# The prefix similarity is added here as well.
def get_query_sim_feature(data):
    """Add the word2vec similarity columns to ``data`` and print the elapsed time.

    Writes ``item_dict`` (the per-row result of ``get_query_w2v_similar``) and
    ``prefix_title_sim`` (``get_prefix_w2v_similar``) onto ``data`` in place.

    Returns:
        The same DataFrame.
    """
    start = time.time()

    data["item_dict"] = data.apply(get_query_w2v_similar, axis=1)
    data['prefix_title_sim'] = data.apply(get_prefix_w2v_similar, axis=1)

    # Elapsed seconds; the operands were previously swapped, which printed a
    # negative number.
    print(time.time() - start)
    return data

def get_feature0(item):
    """Return ``item['item_dict']['title_max_similar']``, or NaN if unavailable."""
    try:
        return item['item_dict']['title_max_similar']
    except (KeyError, TypeError):
        # Missing key, or item_dict is not a mapping (e.g. None).
        return np.nan


def get_feature1(item):
    """Return ``item['item_dict']['title_mean_similar']``, or NaN if unavailable."""
    try:
        return item['item_dict']['title_mean_similar']
    except (KeyError, TypeError):
        # Missing key, or item_dict is not a mapping (e.g. None).
        return np.nan


def get_feature2(item):
    """Return ``item['item_dict']['title_weight_similar']``, or NaN if unavailable."""
    try:
        return item['item_dict']['title_weight_similar']
    except (KeyError, TypeError):
        # Missing key, or item_dict is not a mapping (e.g. None).
        return np.nan


def get_feature3(item):
    """Return ``item['item_dict']['prefiix_max_similar']``, or NaN if unavailable.

    The misspelled key is intentional here: it is the key that
    ``get_query_w2v_similar`` writes for the prefix maximum.
    """
    try:
        return item['item_dict']['prefiix_max_similar']
    except (KeyError, TypeError):
        # Missing key, or item_dict is not a mapping (e.g. None).
        return np.nan


def get_feature4(item):
    """Return ``item['item_dict']['prefix_mean_similar']``, or NaN if unavailable."""
    try:
        return item['item_dict']['prefix_mean_similar']
    except (KeyError, TypeError):
        # Missing key, or item_dict is not a mapping (e.g. None).
        return np.nan


def get_feature5(item):
    """Return ``item['item_dict']['prefix_weight_similar']``, or NaN if unavailable."""
    try:
        return item['item_dict']['prefix_weight_similar']
    except (KeyError, TypeError):
        # Missing key, or item_dict is not a mapping (e.g. None).
        return np.nan


def get_feature6(item):
    """Return the largest value in ``item['query_score']``, or NaN if unavailable."""
    try:
        return max(item['query_score'])
    except (KeyError, TypeError, ValueError):
        # TypeError: query_score is a scalar such as NaN; ValueError: it is empty.
        return np.nan


def get_feature7(item):
    """Return the smallest value in ``item['query_score']``, or NaN if unavailable."""
    try:
        return min(item['query_score'])
    except (KeyError, TypeError, ValueError):
        # TypeError: query_score is a scalar such as NaN; ValueError: it is empty.
        return np.nan


def get_feature8(item):
    """Return the mean of ``item['query_score']``, or NaN if unavailable."""
    try:
        return np.mean(item['query_score'])
    except (KeyError, TypeError, ValueError):
        # Missing column or values np.mean cannot average.
        return np.nan

# ---- Load, clean and parse the raw data ----
mode = "train"
data = run_process(get_data())

# prefix_dic is read as a global by null_query_prediction_process.
prefix_dic = get_prefix_query_dic(data)
data['query_prediction'] = data.apply(null_query_prediction_process, axis=1)
data['query_prediction'] = data.apply(query_process, axis=1)
# The completed prefix replaces the raw prefix and is used as a base feature.
data['prefix'] = data.apply(get_complete_prefix, axis=1)
data = get_query_list_feature(data)
print(1)

# ---- Token-count, keyword and edit-distance features ----
data['prefix_word_len'] = data['prefix'].apply(get_word_length)
data['title_word_len'] = data['title'].apply(get_word_length)
data['title-prefix_word_len'] = data['title_word_len'] - data['prefix_word_len']
data['prefix_is_question'] = data['prefix'].apply(prefix_is_question)
data['title_is_question'] = data['title'].apply(title_is_question)
data['title_is_network'] = data['title'].apply(title_is_network)
data['prefix_is_network'] = data['prefix'].apply(prefix_is_network)
data['prefix_title_leve_dist'] = data.apply(prefix_title_leve_dist, axis=1)
data['prefix_title_leve_rate'] = data.apply(prefix_title_leve_rate, axis=1)
print(2)

# ---- Length and containment features ----
data['query_prediction_len'] = data['query_word'].apply(len)
data['prefix_len'] = data['prefix'].apply(len)
data['title_len'] = data['title'].apply(len)
data['title-prefix_len'] = data['title_len'] - data['prefix_len']
data['title_is_in_query'] = data.apply(title_is_in_query, axis=1)
data['is_prefix_in_title'] = data.apply(prefix_is_in_title, axis=1)
print('start_w2v_feature')

# ---- word2vec similarity features ----
# word_w2v_model is read as a global by _get_jieba_array.
word_w2v_model = get_word_w2v_model()
data = get_query_sim_feature(data)

data['title_max_similar'] = data.apply(get_feature0, axis=1)
data['title_mean_similar'] = data.apply(get_feature1, axis=1)
data['title_weight_similar'] = data.apply(get_feature2, axis=1)
data['prefix_max_similar'] = data.apply(get_feature3, axis=1)
data['prefix_mean_similar'] = data.apply(get_feature4, axis=1)
data['prefix_weight_similar'] = data.apply(get_feature5, axis=1)
data['query_score_max'] = data.apply(get_feature6, axis=1)
data['query_score_min'] = data.apply(get_feature7, axis=1)
data['query_score_mean'] = data.apply(get_feature8, axis=1)

print('basic_feature_finish')


In [None]:
# Concatenate train, test and vali

In [None]:
#model
def kmeans(data):
    """Label-encode prefix/title/tag and add a 25-cluster ``kmeans`` column.

    The three text columns are replaced in place by their integer codes. The
    clustering runs on every column except the raw / bookkeeping ones listed
    in ``excluded``. Returns the same DataFrame.
    """
    from sklearn.cluster import KMeans

    for column in ('prefix', 'title', 'tag'):
        data[column] = LabelEncoder().fit_transform(data[column])

    excluded = ['query_prediction', 'label', 'flag', 'is_prefix_contains_upper_english', 'item_dict','query_word','query_score']
    feature_matrix = data.drop(excluded, axis=1)

    # NOTE(review): the n_jobs argument is not accepted by recent scikit-learn
    # releases of KMeans - confirm against the installed version.
    model = KMeans(n_clusters=25, init='k-means++', max_iter=300, verbose=1, n_jobs=-1)
    data['kmeans'] = model.fit_predict(feature_matrix)

    del feature_matrix
    return data

def run_feature(data):
    """Add count, pairwise-cross and train-label statistics features.

    Part 1: for each base feature, a ``<feature>_count`` column with the
    frequency of the row's value.
    Part 2: for each pair of base features, a label-encoded cross column
    ``<a>_<b>`` and its ``<a>_<b>_count`` frequency.
    Part 3: for each feature in ``pos_features``, the sum and count of
    ``label`` over the training rows (``flag == 1``), merged back as
    ``<feature>_click_`` and ``<feature>_count``.
    Finally prefix, title and tag are label-encoded.

    Args:
        data: The feature DataFrame; it must already contain the ``kmeans``
            and ``*_bin`` columns.

    Returns:
        The DataFrame with the new columns. Part 3 rebinds ``data`` to the
        merge result, so use the return value.
    """
    features = ['kmeans','prefix', 'title', 'tag','is_prefix_in_title','title_is_in_query'
                ,'prefix_word_len','title_word_len','prefix_is_question','prefix_is_network','query_score_mean','title_weight_similar'
               ,'query_prediction_len','prefix_len','title_len','prefix_weight_similar']
    for feature in features:
        # Series.map leaves NaN for values absent from value_counts (which
        # omits NaN); a dict lookup raised KeyError on them.
        data[feature + '_count'] = data[feature].map(data[feature].value_counts())
        gc.collect()
        print(feature)
    print('part_1_finish')
    gc.collect()

    for i, left in enumerate(features):
        for right in features[i + 1:]:
            new_feature = left + '_' + right
            crossed = data[left].astype(str) + '_' + data[right].astype(str)
            data[new_feature] = LabelEncoder().fit_transform(crossed)
            data[new_feature + '_count'] = data[new_feature].map(
                data[new_feature].value_counts())
            gc.collect()
            print(i)
    print('part2_finish')
    gc.collect()

    pos_features = ['kmeans','title_weight_similar_bin', 'title_mean_similar_bin','title_max_similar_bin','prefix_title_sim_bin','query_prediction_len','prefix_len','title_len',
                    'title-prefix_len','prefix_word_len','title_word_len','title-prefix_word_len','prefix_max_similar_bin','prefix_mean_similar_bin',
                   'prefix_weight_similar_bin']
    for feature in pos_features:
        train = data.loc[data['flag'] == 1, [feature, 'label']]
        # Aggregate with a list and rename afterwards: passing a renaming dict
        # to SeriesGroupBy.agg is rejected by pandas >= 1.0.
        stats = train.groupby(feature)['label'].agg(['sum', 'count'])
        stats.columns = [feature + '_click_', feature + '_count']
        # Column names are unchanged from before. For features that also went
        # through part 1, <feature>_count already exists, so this merge applies
        # pandas' default _x / _y suffixes to the two columns.
        data = pd.merge(data, stats.reset_index(), on=feature, how='left')
        del train
        del stats
        gc.collect()
        print(feature)
    print('part3_finish')

    for feature in ['prefix', 'title', 'tag']:
        data[feature] = LabelEncoder().fit_transform(data[feature])
    return data

def co_feature_del(data, threshold=0.99):
    """Drop columns that are almost perfectly correlated with an earlier column.

    Args:
        data: The DataFrame to prune; it is modified in place.
        threshold: A column is dropped when its absolute Pearson correlation
            with any column to its left exceeds this value.

    Returns:
        The same DataFrame with the redundant columns removed. Non-numeric
        columns take no part in the correlation and are never dropped.
    """
    # Restrict to numeric/bool columns explicitly: recent pandas no longer
    # skips non-numeric columns silently in corr().
    numeric = data.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric.corr().abs()

    # Keep only the strict upper triangle so each pair is examined once.
    # The builtin bool is used because the np.bool alias was removed from NumPy.
    upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper = corr_matrix.where(upper_mask)

    to_drop = [column for column in upper.columns
               if (upper[column] > threshold).any()]
    print('There are %d columns to remove.' % (len(to_drop)))
    data.drop(to_drop, axis=1, inplace=True)
    return data

# Fill missing prefix_max_similar values with the column mean before binning.
data['prefix_max_similar'] = data['prefix_max_similar'].fillna(data['prefix_max_similar'].mean())

# (source column, number of quantile bins); each produces a "<source>_bin"
# column holding the integer code of the row's quantile bin.
bin_specs = [
    ('prefix_title_sim', 8),
    ('title_weight_similar', 10),
    ('title_mean_similar', 5),
    ('title_max_similar', 5),
    ('prefix_weight_similar', 10),
    ('prefix_mean_similar', 5),
    ('prefix_max_similar', 3),
]
for source_column, n_bins in bin_specs:
    bin_column = source_column + '_bin'
    data[bin_column] = pd.qcut(data[source_column], n_bins)
    data[bin_column] = pd.factorize(data[bin_column])[0]

print('run_kmeans')
data = kmeans(data)
print('run_feature')
gc.collect()
data = run_feature(data)
print('run_co_feature_del')
data = co_feature_del(data)
gc.collect()

In [None]:
#predict
def pre_data(data):
    """Split ``data`` by ``flag`` into model inputs and labels.

    Returns:
        ``(train_X, vali_X, test_X, train_y, vali_y)`` where flag 1 is train,
        flag 2 is vali and flag 3 is test. The X frames exclude the raw and
        bookkeeping columns listed in ``non_feature_columns``.
    """
    # These columns are not fed to the model as features.
    non_feature_columns = ['query_prediction', 'label', 'flag', 'is_prefix_contains_upper_english', 'item_dict','query_word','query_score']

    subsets = {flag_value: data[data['flag'] == flag_value] for flag_value in (1, 2, 3)}

    train_X_data = subsets[1].drop(non_feature_columns, axis=1)
    vali_X_data = subsets[2].drop(non_feature_columns, axis=1)
    test_X_data = subsets[3].drop(non_feature_columns, axis=1)
    train_Y_data = subsets[1]['label']
    vali_label = subsets[2]['label']

    del subsets
    return train_X_data, vali_X_data, test_X_data, train_Y_data, vali_label
def lgb_test(train_X_data, vali_X_data, train_Y_data, vali_label):
    """Fit a LightGBM binary classifier and score the validation set.

    Training evaluates log-loss on both the train and validation sets and
    stops early after 100 rounds.

    Returns:
        ``(predict, clf)``: the ``predict_proba`` output for ``vali_X_data``
        at the best iteration, and the fitted classifier.
    """
    model_params = dict(
        boosting_type='gbdt', subsample=1, colsample_bytree=1,
        max_depth=-1, n_estimators=10000, objective='binary', min_child_weight=10,
        subsample_freq=1, num_leaves=127, reg_alpha=0, reg_lambda=1.3,
        random_state=2018, n_jobs=-1, learning_rate=0.1,
    )
    clf = lgb.LGBMClassifier(**model_params)

    eval_sets = [(train_X_data, train_Y_data), (vali_X_data, vali_label)]
    # NOTE(review): newer LightGBM releases take verbosity and early stopping
    # as callbacks rather than fit() keywords - confirm the installed version.
    clf.fit(train_X_data, train_Y_data, eval_set=eval_sets, eval_metric='logloss',
            verbose=50, early_stopping_rounds=100)

    predict = clf.predict_proba(vali_X_data, num_iteration=clf.best_iteration_)
    return predict, clf

def find_best_thr(predict, vali_label):
    """Scan decision thresholds and report the one with the best F1.

    Args:
        predict: Two-column class probabilities (as returned by
            ``predict_proba``); column 1 is thresholded.
        vali_label: True binary labels, in the same row order as ``predict``.

    Returns:
        ``(best_f1, best_threshold)``. Thresholds run from 0.25 up to (not
        including) 0.45 in steps of 0.001; a score below the threshold is
        classified 0, otherwise 1. Both are 0.0 if no threshold yields a
        positive F1.
    """
    positive_scores = pd.DataFrame(predict)[1]

    best_f1 = 0.0
    best_threshold = 0.0
    for threshold in np.arange(0.25, 0.4500, 0.001):
        hard_labels = np.where(positive_scores < threshold, 0, 1)
        score = f1_score(vali_label, hard_labels)
        if score > best_f1:
            # Store the score that was compared. Previously the best value was
            # recomputed with "<=" instead of "<", so the reported F1 could
            # differ from the one that won the comparison.
            best_f1 = score
            best_threshold = threshold
    print('最大f1为', best_f1)
    print('此时阈值为:', best_threshold)

    return best_f1, best_threshold

train_X_data, vali_X_data, test_X_data, train_Y_data, vali_label = pre_data(data)
print('run_lgb_test')
# lgb_test accepts exactly (train_X, vali_X, train_y, vali_y). Passing
# test_X_data as a fifth argument raised a TypeError.
predict, clf = lgb_test(train_X_data, vali_X_data, train_Y_data, vali_label)