In [2]:
import numpy as np
import os
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# import seaborn as sns
import pickle
import time
import gc
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

# Echo the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

import warnings
warnings.filterwarnings("ignore")

# Chinese font support for figures.
import matplotlib
# NOTE: the former `matplotlib.use('qt4agg')` call was removed. It ran after
# pyplot had been imported and after `%matplotlib inline` had selected the
# inline backend, so depending on the Matplotlib version it was either
# ignored, replaced the inline backend, or failed because the Qt4 backend is
# unavailable. Figures are meant to render inline in this notebook.
# Default font family (SimHei provides CJK glyphs).
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
# Keep the minus sign '-' from being rendered as an empty box with this font.
matplotlib.rcParams['axes.unicode_minus'] = False

In [3]:
import urllib
import urllib.parse

import jieba

In [4]:
def read_vectors(path, topn):
    """Load word vectors from a text file.

    The first line of the file is a header whose second whitespace-separated
    field is the vector dimension. Every following line is
    ``word v1 v2 ...`` separated by single spaces.

    Args:
        path: path of the vector file (read as UTF-8, undecodable bytes ignored).
        topn: maximum number of vector lines to read; 0 means read all of them.

    Returns:
        A tuple ``(vectors, iw, wi, dim)``: ``vectors`` maps word -> numpy
        array, ``iw`` lists the words in file order, ``wi`` maps word -> index
        in ``iw`` and ``dim`` is the dimension parsed from the header (0 when
        the file is empty).
    """
    vectors = {}
    iw = []
    dim = 0
    with open(path, encoding='utf-8', errors='ignore') as f:
        header = next(f, None)
        if header is not None:
            dim = int(header.rstrip().split()[1])
            for count, row in enumerate(f, start=1):
                word, *values = row.rstrip().split(' ')
                vectors[word] = np.asarray([float(value) for value in values])
                iw.append(word)
                # Stop once the requested number of rows has been consumed.
                if topn != 0 and count >= topn:
                    break
    wi = {word: index for index, word in enumerate(iw)}
    return vectors, iw, wi, dim

In [5]:
# topn=0 disables the row limit in read_vectors, so every vector in the file is loaded.
# w2v is the returned tuple (vectors, iw, wi, dim); the feature functions below
# use w2v[0], the word -> vector dictionary.
w2v = read_vectors('../w2v/merge_sgns_bigram_char300.txt', 0)

In [7]:
# Read only the columns that the feature cells of this notebook consume.
feature_input_columns = ['prefix', 'title', 'dict_len', 'query_prediction', 'query_prediction_origin']
df = pd.read_csv(
    './feature/featurefull_testB.csv',
    encoding='gbk',
    usecols=feature_input_columns,
)

df.shape

(250000, 5)

In [8]:
# Percent-decode the URL-encoded title and prefix columns in place.
# `urllib.parse` must be imported explicitly: a bare `import urllib` only makes
# the submodule available if some other package happened to import it first.
for text_col in ('title', 'prefix'):
    df[text_col] = df[text_col].apply(urllib.parse.unquote)

In [9]:
%%time
df['query_prediction'] = df.apply(lambda x: eval(x.query_prediction) if x.dict_len != 0 else {}, axis=1)
df['query_prediction_origin'] = df.apply(lambda x: eval(x.query_prediction_origin) if x.dict_len != 0 else {}, axis=1)

df['max_query'] = df.apply(lambda x: list(x.query_prediction.keys())[0] if x.dict_len != 0 else '', axis=1)
df['first_query'] = df.apply(lambda x: list(x.query_prediction_origin.keys())[0] if x.dict_len != 0 else '', axis=1)

Wall time: 35.4 s


## Word-vector features (词向量)

In [10]:
def get_sent_vector_max(x):
    """Max-pooled sentence vector of the text ``x``.

    ``x`` is segmented with jieba. A token found in ``w2v[0]`` contributes its
    vector; otherwise each of its characters found in ``w2v[0]`` contributes
    one. The vectors are max-pooled element-wise together with an all-zero
    row, and the result is divided by (number of matched vectors + 1).

    Returns a 1-D numpy array of length 300.
    """
    embeddings = w2v[0]
    # The zero row keeps the pooling defined when nothing matches.
    rows = [[0] * 300]
    for token in jieba.cut(x):
        if token in embeddings:
            rows.append(embeddings[token])
        else:
            # Fall back to character-level lookups for unknown tokens.
            rows.extend(embeddings[char] for char in token if char in embeddings)
    pooled = np.array(rows).max(axis=0)
    # len(rows) == matched vectors + 1 because of the zero row.
    return pooled / len(rows)

def get_sent_dict_vector_max(x):
    """Weighted combination of max-pooled vectors for a dict of queries.

    Args:
        x: dict mapping query text -> weight (a number or a numeric string).

    For every query the tokens (or, for unknown tokens, their characters) are
    looked up in ``w2v[0]``, max-pooled element-wise together with an all-zero
    row and divided by (matched vectors + 1), exactly as get_sent_vector_max
    does. Each query vector is then multiplied by its weight, the products are
    summed, and the sum is divided by (total weight + 1), mirroring
    get_sent_dict_vector.

    Returns:
        A 1-D numpy array of length 300 (all zeros for an empty dict).

    The previous implementation pooled over the outer accumulator instead of
    the per-query vectors and returned ``list / scalar``, which produced a 2-D
    stack (or a TypeError for an empty dict) instead of a sentence vector.
    """
    embeddings = w2v[0]
    combined = np.zeros(300)
    total_weight = 0.0
    for query, raw_weight in x.items():
        query_weight = float(raw_weight)
        total_weight += query_weight
        # The zero row keeps the pooling defined when nothing in the query matches.
        rows = [[0] * 300]
        for token in jieba.cut(query):
            if token in embeddings:
                rows.append(embeddings[token])
            else:
                rows.extend(embeddings[char] for char in token if char in embeddings)
        # len(rows) == matched vectors + 1 because of the zero row.
        pooled = np.array(rows).max(axis=0) / len(rows)
        combined += pooled * query_weight
    return combined / (total_weight + 1)

In [11]:
from sklearn.metrics.pairwise import pairwise_distances

In [12]:
%%time
df['max_query_vec'] = df.max_query.apply(get_sent_vector_max)
df['first_query_vec'] = df.first_query.apply(get_sent_vector_max)
df['title_vec'] = df.title.apply(get_sent_vector_max)

df['title_max_distance_maxpool'] = df.apply(lambda x: np.linalg.norm(x.title_vec - x.max_query_vec), axis=1)
df['title_first_distance_maxpool'] = df.apply(lambda x: np.linalg.norm(x.title_vec - x.first_query_vec), axis=1)
df['title_max_cos_distance_maxpool'] = df.apply(lambda x: pairwise_distances([x.title_vec, x.max_query_vec], metric='cosine')[0, 1], axis=1)
df['title_first_cos_distance_maxpool'] = df.apply(lambda x: pairwise_distances([x.title_vec, x.first_query_vec] ,metric='cosine')[0, 1], axis=1)

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\ZERO\AppData\Local\Temp\jieba.cache
Loading model cost 0.921 seconds.
Prefix dict has been built succesfully.


Wall time: 2min 42s


In [15]:
%%time
df['query_str_joint'] = df.query_prediction.apply(lambda x: ''.join(list(x.keys())))
df['query_origin_str_joint'] = df.query_prediction_origin.apply(lambda x: ''.join(list(x.keys())))

df['query_str_vec'] = df.query_str_joint.apply(get_sent_vector_max)
df['query_origin_str_vec'] = df.query_origin_str_joint.apply(get_sent_vector_max)

df['title_str_distance_maxpool'] = df.apply(lambda x: np.linalg.norm(x.title_vec - x.query_str_vec), axis=1)
df['title_origin_str_distance_maxpool'] = df.apply(lambda x: np.linalg.norm(x.title_vec - x.query_origin_str_vec), axis=1)
df['title_str_cos_distance_maxpool'] = df.apply(lambda x: pairwise_distances([x.title_vec, x.query_str_vec], metric='cosine')[0, 1], axis=1)
df['title_origin_str_cos_distance_maxpool'] = df.apply(lambda x: pairwise_distances([x.title_vec, x.query_origin_str_vec], metric='cosine')[0, 1], axis=1)

Wall time: 4min 38s


In [16]:
def get_sent_vector(x):
    """Sum-pooled sentence vector of the text ``x``.

    ``x`` is segmented with jieba. A token found in ``w2v[0]`` contributes its
    vector; otherwise each of its characters found in ``w2v[0]`` contributes
    one. The contributions are summed and divided by
    (number of matched vectors + 1).

    Returns a 1-D numpy array of length 300.
    """
    embeddings = w2v[0]
    total = np.zeros(300)
    hits = 0
    for token in jieba.cut(x):
        if token in embeddings:
            matched = [embeddings[token]]
        else:
            # Fall back to character-level lookups for unknown tokens.
            matched = [embeddings[char] for char in token if char in embeddings]
        for vector in matched:
            total = total + vector
        hits += len(matched)
    return total / (hits + 1)

def get_sent_dict_vector(x):
    """Weighted sentence vector for a dict of queries.

    ``x`` maps query text -> weight (anything numpy can cast to float). Each
    query is turned into a sum-pooled vector divided by
    (matched vectors + 1), multiplied by its weight and accumulated; the
    accumulated vector is divided by (sum of weights + 1).

    Returns a 1-D numpy array of length 300.
    """
    embeddings = w2v[0]
    weight = sum(np.array(list(x.values())).astype(float))
    combined = np.zeros(300)
    for query, query_weight in x.items():
        query_sum = np.zeros(300)
        hits = 0
        for token in jieba.cut(query):
            if token in embeddings:
                matched = [embeddings[token]]
            else:
                # Fall back to character-level lookups for unknown tokens.
                matched = [embeddings[char] for char in token if char in embeddings]
            for vector in matched:
                query_sum = query_sum + vector
            hits += len(matched)
        combined = combined + (query_sum / (hits + 1)) * float(query_weight)
    return combined / (weight + 1)

In [None]:
%%time
df['query_dict_vec'] = df.query_prediction.apply(get_sent_dict_vector)
df['title_vec'] = df.title.apply(get_sent_vector)

df['title_query_dict_distance'] = df.apply(lambda x: np.linalg.norm(x.title_vec - x.query_dict_vec), axis=1)
df['title_query_dict_cos_distance'] = df.apply(lambda x: pairwise_distances([x.title_vec, x.query_dict_vec] ,metric='cosine')[0, 1], axis=1)

In [None]:
%%time
# df['title_vec'] = df.title.apply(get_sent_vector)
df['prefix_vec'] = df.prefix.apply(get_sent_vector)

df['title_prefix_distance'] = df.apply(lambda x: np.linalg.norm(x.title_vec - x.prefix_vec), axis=1)
df['title_prefix_cos_distance'] = df.apply(lambda x: pairwise_distances([x.title_vec, x.prefix_vec], metric='cosine')[0, 1], axis=1)

In [None]:
def get_str_set(x):
    """Segment ``x`` with jieba and re-join it without repeated tokens.

    Only the first occurrence of each token is kept, in original order.
    Returns the surviving tokens concatenated into one string.
    """
    seen = set()
    unique_tokens = []
    for token in jieba.cut(x):
        if token in seen:
            continue
        seen.add(token)
        unique_tokens.append(token)
    return ''.join(unique_tokens)

In [None]:
%%time
df['query_str_joint'] = df.query_str_joint.apply(get_str_set)
                                                  
df['query_set_str_vec'] = df.query_str_joint.apply(get_sent_vector)

df['title_query_str_distance'] = df.apply(lambda x: np.linalg.norm(x.title_vec - x.query_set_str_vec), axis=1)
df['title_query_str_cos_distance'] = df.apply(lambda x: pairwise_distances([x.title_vec, x.query_set_str_vec] ,metric='cosine')[0, 1], axis=1)

In [None]:
%%time
import re

df['prefix_has_symbol'] = df.prefix.apply(lambda x: 0 if re.search("[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+", x) == None else 1)
df['title_has_symbol'] = df.title.apply(lambda x: 0 if re.search("[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+", x) == None else 1)

In [20]:
# Export every distance / symbol feature column, in df's column order, without the index.
feature_columns = [col for col in df.columns if 'distance' in col or 'symbol' in col]
df[feature_columns].to_csv('./feature/featurefull_dis1_testB.csv', index=None)