In [1]:
import numpy as np
import os
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# import seaborn as sns
import pickle
import time
import gc
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

# Display the value of every expression statement in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

import warnings
warnings.filterwarnings("ignore")

# Chinese font setup
# NOTE(review): this backend switch runs after `%matplotlib inline` and after
# pyplot has been imported -- confirm it is still needed in this environment.
import matplotlib
matplotlib.use('qt4agg')
# Set the default font
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
# Fix the minus sign '-' being rendered as a square box
matplotlib.rcParams['axes.unicode_minus'] = False

In [8]:
import urllib

In [2]:
def read_vectors(path, topn):
    """Load word vectors from a text file with a header line.

    The first line is a header whose second whitespace-separated field is the
    vector dimension. Every following non-blank line is
    ``word v1 v2 ... vn`` with single-space separators.

    Args:
        path: Path of the vector file (read as UTF-8, undecodable bytes are
            dropped).
        topn: Maximum number of vectors to read; 0 reads the whole file.

    Returns:
        A tuple ``(vectors, iw, wi, dim)`` where ``vectors`` maps word ->
        numpy array, ``iw`` lists the words in file order, ``wi`` maps
        word -> position in ``iw`` and ``dim`` is the dimension taken from the
        header (0 for an empty file).
    """
    dim = 0
    vectors = {}
    iw = []
    with open(path, encoding='utf-8', errors='ignore') as f:
        header = next(f, None)
        if header is not None:
            dim = int(header.rstrip().split()[1])
        for line in f:
            stripped = line.rstrip()
            if not stripped:
                # A blank line would otherwise register the word '' with an
                # empty vector and count towards topn.
                continue
            # Split on single spaces only, so a word made of other whitespace
            # characters is still kept intact.
            tokens = stripped.split(' ')
            vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
            iw.append(tokens[0])
            if topn != 0 and len(iw) >= topn:
                break
    # For a repeated word the last position wins.
    wi = {w: i for i, w in enumerate(iw)}
    return vectors, iw, wi, dim

In [3]:
# Read only the five columns this notebook works with.
df = pd.read_csv(
    './feature/df_feature4_ctr_extra.csv',
    usecols=['prefix', 'title', 'dict_len', 'query_prediction', 'query_prediction_origin'],
    encoding='utf-8',
)

# (rows, columns) of the loaded frame, shown as the cell output.
df.shape

(2100000, 5)

In [9]:
# `import urllib` on its own does not guarantee that the `parse` submodule is
# loaded; import it explicitly so this cell does not depend on some other
# package having imported it first.
import urllib.parse

# Percent-decode the title and prefix text (e.g. '%E4%BD%A0' -> '你').
# Running this twice decodes twice, which matters if the decoded text itself
# contains '%' sequences.
df['title'] = df.title.apply(urllib.parse.unquote)
df['prefix'] = df.prefix.apply(urllib.parse.unquote)

In [4]:
%%time
# NOTE(security): eval() executes whatever expression is stored in the CSV
# cell. That is only acceptable for a trusted, self-generated feature file;
# if the strings are plain dict literals, ast.literal_eval is the safer parser.
#
# Parsing the whole columns into dicts (kept for reference, currently disabled):
# df['query_prediction'] = df.apply(lambda x: eval(x.query_prediction) if x.dict_len != 0 else {}, axis=1)
# df['query_prediction_origin'] = df.apply(lambda x: eval(x.query_prediction_origin) if x.dict_len != 0 else {}, axis=1)

# First key of each prediction dict, or '' when dict_len is 0.
# Walking the two columns with zip avoids the per-row Series that
# DataFrame.apply(axis=1) builds, and next(..., '') also tolerates a dict that
# turns out to be empty even though dict_len is non-zero.
df['max_query'] = [
    next(iter(eval(pred).keys()), '') if n_pred != 0 else ''
    for pred, n_pred in zip(df['query_prediction'], df['dict_len'])
]
df['first_query'] = [
    next(iter(eval(pred).keys()), '') if n_pred != 0 else ''
    for pred, n_pred in zip(df['query_prediction_origin'], df['dict_len'])
]

Wall time: 3min 35s


## 词向量

In [4]:
import jieba

In [5]:
# Load every vector in the file (topn=0 means no cap). w2v is the tuple
# returned by read_vectors; w2v[0] is the word -> vector dict used by the
# sentence-vector helpers.
w2v = read_vectors(path='./data/new/merge_sgns_bigram_char300.txt', topn=0)

In [6]:
def get_sent_vector(x):
    """Embed a text as a damped average of its word vectors.

    The text is tokenised with ``jieba.cut``. A token found in ``w2v[0]``
    contributes its vector; a token that is not found contributes the vectors
    of those of its characters that are found.

    Args:
        x: Text to embed.

    Returns:
        A length-300 numpy array: the sum of the contributing vectors divided
        by (number of contributing vectors + 1). The extra 1 keeps the
        division defined when nothing matches (result is all zeros) and makes
        the value slightly smaller than a plain mean.
    """
    embeddings = w2v[0]
    total = np.zeros(300)
    hits = 0
    for token in jieba.cut(x):
        if token in embeddings:
            units = [token]
        else:
            # Unknown word: fall back to the characters that do have a vector.
            units = [ch for ch in token if ch in embeddings]
        for unit in units:
            total = total + embeddings[unit]
            hits += 1
    return total / (hits + 1)

def get_sent_dict_vector(x):
    """Embed a dict of weighted texts as one weighted, damped average vector.

    Each key is embedded with ``get_sent_vector`` and scaled by its value; the
    scaled vectors are summed and divided by (sum of all values + 1).

    Args:
        x: Dict mapping text -> weight, where each weight is anything
            ``float()`` accepts. A string holding the literal of such a dict
            is also accepted, because ``query_prediction`` is read from CSV as
            text elsewhere in this notebook.

    Returns:
        A length-300 numpy array (all zeros for an empty dict).
    """
    if isinstance(x, str):
        # Parse the textual dict without executing it the way eval() would.
        import ast
        x = ast.literal_eval(x)
    weight = sum(float(v) for v in x.values())
    vec = np.zeros(300)
    for k, v in x.items():
        # Same per-text embedding as get_sent_vector, instead of a second copy
        # of its tokenise-and-average loop.
        vec = vec + get_sent_vector(k) * float(v)
    # +1 keeps the division defined when the dict is empty.
    return vec / (weight + 1)

In [7]:
from sklearn.metrics.pairwise import pairwise_distances

In [9]:
%%time
# Sentence vectors for max_query, first_query and the title.
df['max_query_vec'] = df['max_query'].apply(get_sent_vector)
df['first_query_vec'] = df['first_query'].apply(get_sent_vector)
df['title_vec'] = df['title'].apply(get_sent_vector)

# Euclidean distance between the title vector and each query vector.
df['title_max_distance'] = df.apply(
    lambda row: np.linalg.norm(row['title_vec'] - row['max_query_vec']),
    axis=1,
)
df['title_first_distance'] = df.apply(
    lambda row: np.linalg.norm(row['title_vec'] - row['first_query_vec']),
    axis=1,
)

# Cosine distance: pairwise_distances on the two vectors gives a 2x2 matrix,
# and entry [0, 1] is the title-vs-query value.
df['title_max_cos_distance'] = df.apply(
    lambda row: pairwise_distances([row['title_vec'], row['max_query_vec']], metric='cosine')[0, 1],
    axis=1,
)
df['title_first_cos_distance'] = df.apply(
    lambda row: pairwise_distances([row['title_vec'], row['first_query_vec']], metric='cosine')[0, 1],
    axis=1,
)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ZERO\AppData\Local\Temp\jieba.cache
Loading model cost 0.790 seconds.
Prefix dict has been built succesfully.


Wall time: 55min 4s


In [10]:
# df[['title_max_distance', 'title_first_distance', 'title_max_cos_distance', 'title_first_cos_distance',]].to_csv('./feature/feature_maxfirst_dis.csv', index=None)

In [18]:
%%time
# Concatenate all keys of each prediction dict into one string.
# NOTE(security): eval() executes the text stored in the CSV; only safe for a
# trusted, self-generated file.
df['query_str_joint'] = df['query_prediction'].apply(lambda raw: ''.join(list(eval(raw).keys())))
df['query_origin_str_joint'] = df['query_prediction_origin'].apply(lambda raw: ''.join(list(eval(raw).keys())))

# title_vec must already be present in df; its recomputation is left disabled.
# df['title_vec'] = df.title.apply(get_sent_vector)
df['query_str_vec'] = df['query_str_joint'].apply(get_sent_vector)
df['query_origin_str_vec'] = df['query_origin_str_joint'].apply(get_sent_vector)

# Euclidean distance between the title vector and each joined-query vector.
df['title_str_distance'] = df.apply(
    lambda row: np.linalg.norm(row['title_vec'] - row['query_str_vec']),
    axis=1,
)
df['title_origin_str_distance'] = df.apply(
    lambda row: np.linalg.norm(row['title_vec'] - row['query_origin_str_vec']),
    axis=1,
)

# Cosine distance; [0, 1] picks the title-vs-query entry of the 2x2 result.
df['title_str_cos_distance'] = df.apply(
    lambda row: pairwise_distances([row['title_vec'], row['query_str_vec']], metric='cosine')[0, 1],
    axis=1,
)
df['title_origin_str_cos_distance'] = df.apply(
    lambda row: pairwise_distances([row['title_vec'], row['query_origin_str_vec']], metric='cosine')[0, 1],
    axis=1,
)

Wall time: 1h 36min 16s


In [20]:
# Write the eight title-vs-query distance columns to CSV, without the index.
df[[
    'title_max_distance',
    'title_first_distance',
    'title_max_cos_distance',
    'title_first_cos_distance',
    'title_str_distance',
    'title_origin_str_distance',
    'title_str_cos_distance',
    'title_origin_str_cos_distance',
]].to_csv('./feature/feature_vector_dis.csv', index=None)

In [24]:
%%time
# NOTE(review): query_prediction is loaded from CSV as text and the lines that
# would eval it into dicts are commented out earlier in this notebook;
# get_sent_dict_vector must be given (or be able to parse) that
# representation -- verify on a fresh kernel.
df['query_dict_vec'] = df['query_prediction'].apply(get_sent_dict_vector)
df['title_vec'] = df['title'].apply(get_sent_vector)

# Euclidean and cosine distance between the title vector and the weighted
# prediction-dict vector; [0, 1] is the off-diagonal entry of the 2x2 result.
df['title_query_dict_distance'] = df.apply(
    lambda row: np.linalg.norm(row['title_vec'] - row['query_dict_vec']),
    axis=1,
)
df['title_query_dict_cos_distance'] = df.apply(
    lambda row: pairwise_distances([row['title_vec'], row['query_dict_vec']], metric='cosine')[0, 1],
    axis=1,
)

Wall time: 23min 49s


In [26]:
# Write the two prediction-dict distance columns to CSV, without the index.
df[[
    'title_query_dict_distance',
    'title_query_dict_cos_distance',
]].to_csv('./feature/feature_dict_dis.csv', index=None)

In [11]:
%%time
# Sentence vectors for the title and the typed prefix.
df['title_vec'] = df['title'].apply(get_sent_vector)
df['prefix_vec'] = df['prefix'].apply(get_sent_vector)

# Euclidean and cosine distance between the two vectors; [0, 1] is the
# off-diagonal entry of the 2x2 pairwise matrix.
df['title_prefix_distance'] = df.apply(
    lambda row: np.linalg.norm(row['title_vec'] - row['prefix_vec']),
    axis=1,
)
df['title_prefix_cos_distance'] = df.apply(
    lambda row: pairwise_distances([row['title_vec'], row['prefix_vec']], metric='cosine')[0, 1],
    axis=1,
)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ZERO\AppData\Local\Temp\jieba.cache
Loading model cost 0.792 seconds.
Prefix dict has been built succesfully.


Wall time: 13min 11s


In [22]:
def get_str_set(x):
    """Return the text rebuilt from its distinct jieba tokens.

    Tokens are kept in order of first appearance; later repeats are dropped.

    Args:
        x: Text to tokenise with ``jieba.cut``.

    Returns:
        The distinct tokens concatenated into one string.
    """
    seen = set()
    unique_tokens = []
    for token in jieba.cut(x):
        # Set membership keeps this linear in the number of tokens; the list
        # preserves first-occurrence order for the join.
        if token not in seen:
            seen.add(token)
            unique_tokens.append(token)
    return ''.join(unique_tokens)

In [29]:
%%time
# NOTE(review): with the two lines below disabled, query_str_joint is whatever
# an earlier cell left in df and get_str_set is never applied, so these
# features may duplicate the title_str_* ones -- confirm which input was
# intended before relying on them.
# df['query_str_joint'] = df.query_prediction.apply(lambda x: ''.join(list(eval(x).keys())))
# df['query_str_joint'] = df.query_str_joint.apply(get_str_set)

df['query_set_str_vec'] = df['query_str_joint'].apply(get_sent_vector)

# Euclidean and cosine distance between the title vector and that vector;
# [0, 1] is the off-diagonal entry of the 2x2 pairwise matrix.
df['title_query_str_distance'] = df.apply(
    lambda row: np.linalg.norm(row['title_vec'] - row['query_set_str_vec']),
    axis=1,
)
df['title_query_str_cos_distance'] = df.apply(
    lambda row: pairwise_distances([row['title_vec'], row['query_set_str_vec']], metric='cosine')[0, 1],
    axis=1,
)

Wall time: 30min 8s


In [31]:
import re

In [41]:
%%time
# One compiled pattern shared by both columns. It is written as a raw string
# so that \s, \., \! and \/ reach the regex engine as written instead of being
# invalid string escapes; the set of matched characters is unchanged
# (whitespace, a list of ASCII punctuation, and a list of full-width
# punctuation).
SYMBOL_RE = re.compile(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+")

# 1 when the text contains at least one of those characters, else 0.
df['prefix_has_symbol'] = df.prefix.apply(lambda x: 0 if SYMBOL_RE.search(x) is None else 1)
df['title_has_symbol'] = df.title.apply(lambda x: 0 if SYMBOL_RE.search(x) is None else 1)

Wall time: 4.86 s


In [43]:
# Remove the five columns that were loaded from the CSV, keeping the columns
# added in this notebook. Re-running this cell raises KeyError because the
# columns are already gone.
df = df.drop(
    labels=['prefix', 'title', 'dict_len', 'query_prediction', 'query_prediction_origin'],
    axis='columns',
)

In [46]:
# Preview the first five rows of the remaining columns.
df.head(n=5)

Unnamed: 0,title_vec,prefix_vec,title_prefix_distance,title_prefix_cos_distance,query_str_joint,query_set_str_vec,title_query_dict_distance,title_query_dict_cos_distance,prefix_has_symbol,title_has_symbol
0,"[0.0337765, -0.1731035, -0.5104975, -0.0241735...","[0.0337765, -0.1731035, -0.5104975, -0.0241735...",0.0,2.220446e-16,小品大全搞笑演员剧本幽默相亲视频宋小宝不差钱,"[0.016928076923076928, -0.1652573846153846, -0...",3.31758,0.343387,0,0
1,"[0.4005244285714285, -0.03125885714285714, 0.0...","[0.81625, -0.3558175, -0.1878285, -0.624525, -...",3.927479,0.4247249,136853678921368年1368个单词就够了13688..cc13688cc赛马会1...,"[0.842020643835616, 0.2105076438356164, 0.0967...",5.002446,0.555026,0,1
2,"[0.6821196666666666, -0.12039066666666666, -0....","[0.81625, -0.3558175, -0.1878285, -0.624525, -...",2.648985,0.1757834,136853678921368年1368个单词就够了13688..cc13688cc赛马会1...,"[0.842020643835616, 0.2105076438356164, 0.0967...",5.272705,0.577531,0,0
3,"[0.2094338, -0.2712448, 0.236579, -0.3817948, ...","[0.044081, 0.1652645, 0.304232, -0.376705, 0.0...",2.863331,0.2085416,银耳红枣汤银耳汤的做法功效莲子羹莲子汤大全为什么不能天天吃,"[0.2041154, -0.13926186666666665, -0.030743666...",2.049898,0.108278,0,0
4,"[0.32899949999999994, -0.15711216666666664, -0...","[0.2517735, -0.032728249999999987, -0.29727975...",2.00714,0.1129009,月经量少是什么原因怎么办怎么调理吃药该喝红糖水好吗发黑,"[0.2247555294117647, -0.15643829411764706, -0....",2.170613,0.138948,0,0


In [45]:
# Write the title/prefix distance, title/query-string distance and symbol-flag
# columns to CSV, without the index.
df[[
    'title_prefix_distance',
    'title_prefix_cos_distance',
    'title_query_str_distance',
    'title_query_str_cos_distance',
    'prefix_has_symbol',
    'title_has_symbol',
]].to_csv('./feature/feature_titpre_dis.csv', index=None)