In [1]:
import numpy as np
import os
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# import seaborn as sns
import pickle
import time
import gc
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Default figure size (width, height) for all plots.
from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

# Silence all warnings for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# Chinese font support for matplotlib.
import matplotlib
# NOTE(review): this backend switch runs after `matplotlib.pyplot` was imported and
# after `%matplotlib inline` above -- confirm which backend is actually intended.
matplotlib.use('qt4agg')
# Set the default font.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
# Keep the minus sign '-' from being rendered as a box.
matplotlib.rcParams['axes.unicode_minus'] = False

In [2]:
def read_vectors(path, topn):  # read top n word vectors, i.e. top is 10000
    """Load word vectors from a text file with a header line.

    The first line is a header whose second whitespace-separated field is the
    vector dimensionality. Every following non-blank line has the form
    ``word v1 v2 ...`` with fields separated by single spaces.

    Blank lines are skipped: they are neither stored as an empty-string word
    nor counted toward ``topn``.

    Args:
        path: Path of the vector file (read as UTF-8, undecodable bytes ignored).
        topn: Maximum number of vectors to read; ``0`` reads the whole file.

    Returns:
        A tuple ``(vectors, iw, wi, dim)``:
        ``vectors`` maps word -> numpy array of floats,
        ``iw`` is the list of words in file order,
        ``wi`` maps word -> index in ``iw`` (last index wins for repeated words),
        ``dim`` is the dimensionality from the header (``0`` for an empty file).

    Raises:
        IndexError: If the header line has fewer than two fields.
        ValueError: If the header dimension or a vector component is not numeric.
    """
    lines_num, dim = 0, 0
    vectors = {}
    iw = []
    with open(path, encoding='utf-8', errors='ignore') as f:
        header = f.readline()
        if header:
            dim = int(header.rstrip().split()[1])
        for line in f:
            entry = line.rstrip()
            if not entry:
                # A blank line carries no word; storing it would add a '' key
                # with an empty vector and consume one of the topn slots.
                continue
            lines_num += 1
            tokens = entry.split(' ')
            vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
            iw.append(tokens[0])
            if topn != 0 and lines_num >= topn:
                break
    wi = {w: i for i, w in enumerate(iw)}
    return vectors, iw, wi, dim

In [3]:
# Load only the three text columns needed for the vocabulary-coverage check.
df = pd.read_csv(
    './feature/df_feature4_ctr_extra.csv',
    encoding='utf-8',
    usecols=['prefix', 'title', 'query_prediction'],
)

df.shape

(2100000, 3)

In [11]:
import jieba
import urllib

In [12]:
# `import urllib` alone does not guarantee that the `urllib.parse` submodule is
# loaded (it only works when some other library has already imported it), so
# import the function explicitly.
from urllib.parse import unquote

# URL-decode the three text columns, overwriting them in place.
# NOTE: re-running this cell decodes the already-decoded text a second time.
for column in ('title', 'prefix', 'query_prediction'):
    df[column] = df[column].apply(unquote)

In [4]:
# Load every pre-trained vector (topn=0 disables the limit). `w2v` is the tuple
# (vectors, iw, wi, dim) returned by read_vectors, so w2v[0] is the word -> vector dict.
w2v = read_vectors('./data/new/merge_sgns_bigram_char300.txt', 0)

In [14]:
def get_outlaw_word(x):
    """Return the jieba tokens of ``x`` that have no pre-trained vector.

    Args:
        x: Text to segment with ``jieba.cut``.

    Returns:
        List of tokens, in segmentation order and with repeats kept, that are
        not keys of ``w2v[0]`` (the word -> vector dict loaded by read_vectors).
    """
    # The unused `outlaw_letter` local of the earlier version was dropped.
    return [word for word in jieba.cut(x) if word not in w2v[0]]

def get_outlaw_letter(x):
    """Return the characters of out-of-vocabulary tokens that are themselves unknown.

    ``x`` is segmented with ``jieba.cut``; for each token that is not a key of
    ``w2v[0]``, every character of that token that is also not a key of
    ``w2v[0]`` is collected (in order, repeats kept).
    """
    return [
        letter
        for token in jieba.cut(x)
        if token not in w2v[0]
        for letter in token
        if letter not in w2v[0]
    ]

def get_dict_outlaw_word(x):
    """Return the out-of-vocabulary jieba tokens found in the keys of a dict string.

    ``x`` is a string that evaluates to a dict; each key is segmented with
    ``jieba.cut`` and every token that is not a key of ``w2v[0]`` is collected
    (in order, repeats kept).
    """
    # NOTE(review): eval() executes whatever expression the CSV cell contains.
    # If the values are always plain dict literals, ast.literal_eval would be
    # the safe replacement -- confirm against the data before switching.
    predictions = eval(x)
    return [
        token
        for key in predictions.keys()
        for token in jieba.cut(key)
        if token not in w2v[0]
    ]

def get_dict_outlaw_letter(x):
    """Return unknown characters from the out-of-vocabulary tokens of a dict string's keys.

    ``x`` is a string that evaluates to a dict; each key is segmented with
    ``jieba.cut``. For every token that is not a key of ``w2v[0]``, each of its
    characters that is also not a key of ``w2v[0]`` is collected (in order,
    repeats kept).
    """
    # NOTE(review): eval() executes whatever expression the CSV cell contains.
    # If the values are always plain dict literals, ast.literal_eval would be
    # the safe replacement -- confirm against the data before switching.
    predictions = eval(x)
    return [
        letter
        for key in predictions.keys()
        for token in jieba.cut(key)
        if token not in w2v[0]
        for letter in token
        if letter not in w2v[0]
    ]

In [17]:
%%time
# For each free-text column, store the tokens and the characters that have no
# pre-trained vector.
for source_col, word_col, letter_col in (
    ('prefix', 'prefix_outlaw_word', 'prefix_outlaw_letter'),
    ('title', 'title_outlaw_word', 'title_outlaw_letter'),
):
    df[word_col] = df[source_col].apply(get_outlaw_word)
    df[letter_col] = df[source_col].apply(get_outlaw_letter)

Wall time: 3min 2s


In [19]:
%%time
# Same coverage check for the keys of the query_prediction dict strings.
for target_col, extractor in (
    ('dict_outlaw_word', get_dict_outlaw_word),
    ('dict_outlaw_letter', get_dict_outlaw_letter),
):
    df[target_col] = df['query_prediction'].apply(extractor)

Wall time: 21min 1s


In [33]:
import operator
from functools import reduce

from itertools import chain

# Distinct unknown characters per source column.
# chain.from_iterable flattens the per-row lists in a single pass;
# reduce(operator.add, ...) copied the growing accumulator on every row and
# raised TypeError when the frame was empty.
prefix_outlaw_letter_set = set(chain.from_iterable(df.prefix_outlaw_letter))
title_outlaw_letter_set = set(chain.from_iterable(df.title_outlaw_letter))
dict_outlaw_letter_set = set(chain.from_iterable(df.dict_outlaw_letter))

In [34]:
# Show all three sets; each one is rendered because ast_node_interactivity
# is set to "all" in the first cell.
prefix_outlaw_letter_set
title_outlaw_letter_set
dict_outlaw_letter_set

{' ',
 'م',
 'ن',
 'ە',
 '䗪',
 '叇',
 '嚊',
 '圐',
 '媣',
 '庎',
 '怣',
 '汖',
 '糄',
 '蘡',
 '蝜',
 '蝲',
 '髈',
 '녕',
 '랑',
 '사',
 '세',
 '안',
 '요',
 '청',
 '춘',
 '하',
 '해'}

{' ',
 'ئ',
 'د',
 'ز',
 'م',
 'ن',
 'ى',
 'ي',
 'ە',
 'จ',
 'ด',
 'ต',
 'ถ',
 'น',
 'บ',
 'ฟ',
 'ม',
 'ร',
 'ว',
 'อ',
 'ั',
 'ี',
 'ึ',
 'ู',
 'เ',
 'แ',
 'ไ',
 '่',
 '้',
 '\u3000',
 '䃠',
 '䗪',
 '厼',
 '叇',
 '嚊',
 '圐',
 '媣',
 '嫤',
 '庎',
 '怣',
 '朤',
 '殸',
 '汖',
 '瞼',
 '矆',
 '硂',
 '糄',
 '腅',
 '蔩',
 '蘡',
 '虋',
 '蝜',
 '豼',
 '錵',
 '髈',
 '鯗',
 '가',
 '각',
 '같',
 '건',
 '게',
 '고',
 '구',
 '급',
 '기',
 '까',
 '난',
 '네',
 '녀',
 '녕',
 '노',
 '농',
 '누',
 '니',
 '다',
 '대',
 '도',
 '동',
 '떨',
 '똑',
 '라',
 '랑',
 '래',
 '럽',
 '레',
 '로',
 '리',
 '림',
 '링',
 '마',
 '먼',
 '몽',
 '무',
 '물',
 '방',
 '뱀',
 '번',
 '벨',
 '병',
 '부',
 '비',
 '빈',
 '빛',
 '빠',
 '뻐',
 '뿐',
 '사',
 '삶',
 '삼',
 '상',
 '새',
 '세',
 '션',
 '소',
 '솔',
 '수',
 '순',
 '스',
 '슬',
 '시',
 '식',
 '신',
 '실',
 '심',
 '쓰',
 '씨',
 '아',
 '안',
 '암',
 '애',
 '야',
 '어',
 '없',
 '여',
 '옆',
 '예',
 '오',
 '요',
 '우',
 '운',
 '웃',
 '워',
 '원',
 '음',
 '응',
 '인',
 '있',
 '자',
 '잔',
 '잠',
 '장',
 '재',
 '조',
 '족',
 '주',
 '줄',
 '진',
 '질',
 '집',
 '짜',
 '청',
 '춘',
 '치',
 '케',
 '콘',
 '텅',

{' ',
 '\x91',
 '\x98',
 'ب',
 'د',
 'ز',
 'س',
 'ش',
 'ق',
 'ك',
 'ل',
 'م',
 'ن',
 'ى',
 'ي',
 'چ',
 'ۇ',
 'ە',
 '㖭',
 '㗊',
 '㚞',
 '㠭',
 '㡌',
 '㬵',
 '㵘',
 '䂳',
 '䓍',
 '䗪',
 '䨻',
 '䲜',
 '僺',
 '冸',
 '勥',
 '叇',
 '呍',
 '嚊',
 '圐',
 '奾',
 '妦',
 '姀',
 '婛',
 '媈',
 '媣',
 '嫝',
 '嫤',
 '嫴',
 '峓',
 '庎',
 '忈',
 '怣',
 '惗',
 '昮',
 '朤',
 '杋',
 '柛',
 '桋',
 '梚',
 '樰',
 '殅',
 '汃',
 '汖',
 '渂',
 '灪',
 '炏',
 '燜',
 '犾',
 '珻',
 '琾',
 '瑵',
 '璾',
 '瓃',
 '硣',
 '稥',
 '筣',
 '箉',
 '糄',
 '罳',
 '藌',
 '蘡',
 '蛦',
 '蜅',
 '蝜',
 '蟁',
 '詺',
 '豼',
 '郣',
 '髈',
 '鲏',
 '鴛',
 '鴦',
 '齫',
 '국',
 '녕',
 '랑',
 '사',
 '세',
 '안',
 '어',
 '요',
 '전',
 '중',
 '하',
 '해',
 '🐠',
 '🔑'}

In [20]:
# Preview the first rows, including the outlaw_* columns added above.
df.head()

Unnamed: 0,prefix,query_prediction,title,prefix_outlaw_word,prefix_outlaw_letter,title_outlaw_word,title_outlaw_letter,dict_outlaw_word,dict_outlaw_letter
0,小品,"{'小品大全': '0.198', '小品搞笑大全': '0.066', '小品演员': '...",小品,[],[],[],[],[],[]
1,1368,"{'13685367892': '0.124', '1368年': '0.086', '13...","HCG大于1368,正常吗",[],[],[],[],"[13685367892, 就够, 13688cc, 13688478100, 13688c...",[]
2,1368,"{'13685367892': '0.124', '1368年': '0.086', '13...",1368年,[],[],[],[],"[13685367892, 就够, 13688cc, 13688478100, 13688c...",[]
3,银耳,"{'银耳红枣汤': '0.114', '银耳汤的做法': '0.059', '银耳的功效':...",银耳红枣汤的做法,[],[],[],[],[],[]
4,月经量少,"{'月经量少是什么原因': '0.569', '月经量少怎么办': '0.040', '月经...",月经量少怎么调理,[量少],[],[量少],[],"[量少, 量少, 量少, 量少, 量少, 量少, 量少, 量少, 量少, 量少]",[]


## pre-train

In [3]:
# Load every pre-trained vector (topn=0 disables the limit) for the export below.
# NOTE(review): this is an absolute, machine-specific path, while an earlier cell
# loads a file with the same name from ./data/new/ -- confirm which location is
# canonical and make it configurable.
w2v = read_vectors('C:/Users/ZERO/KaggleWork/kaggle/w2v/merge_sgns_bigram_char300.txt', 0)

In [13]:
def read_dictionary(path):
    """Read the words of a tab-separated dictionary file.

    Only the first tab-separated field of each line is used. The special
    tokens ``UNK`` and ``PAD`` are dropped, and blank lines are skipped.

    Args:
        path: Path of the dictionary file (read as UTF-8, undecodable bytes ignored).

    Returns:
        List of words in file order.
    """
    words = []
    with open(path, encoding='utf-8', errors='ignore') as f:
        for line in f:
            # Strip the newline first so a line without any tab does not keep
            # a trailing '\n' as part of the word.
            word = line.rstrip('\n').split('\t')[0]
            if word and word not in ('UNK', 'PAD'):
                words.append(word)
    return words

In [14]:
# Words of the RNN model's dictionary file, without the UNK/PAD entries.
dictionary = read_dictionary('./model/rnn/rnn/output/dictionary/words')

In [None]:
# NOTE(review): this cell was never executed (`In [None]`) and uses names
# (`path`, `tag`, `six`, `self`) that are not defined in the visible notebook.
# Presumably it was pasted from the model code as a reference for the
# dictionary file format -- confirm, and convert it to markdown or delete it.
with open(os.path.join(path, tag), 'w', encoding="utf-8") as of:
    # One line per token: token, its id and its count, tab-separated.
    for _token, _id in six.iteritems(self.token_to_id[tag]):
        of.write("%s\t%d\t%d\n" % (_token, _id, self.token_count[tag][_token]))

In [None]:
# Export the pre-trained vector of every dictionary word that has one, as one
# "<word>\t<v1>\t<v2>...\n" line per word. Words without a vector are skipped.
with open('./model/rnn/rnn/output/dictionary/dict_char300', 'w', encoding='utf-8') as f:
    for word in tqdm_notebook(dictionary):
        if word in w2v[0]:
            vector = '\t'.join(np.array(w2v[0][word]).astype(str))
            # Fixed: the write() call used to reference `_token`, `_id` and
            # `self` (pasted from another snippet) and passed three arguments
            # to a two-placeholder format, so it could never run; it now
            # writes the word and the vector built above.
            f.write('%s\t%s\n' % (word, vector))

In [15]:
# Inspect the loaded dictionary words.
# NOTE(review): this echoes the entire list into the notebook output; a slice
# such as the first few dozen entries would keep the notebook readable.
dictionary

['小品',
 '大全',
 '搞笑',
 '演员',
 '剧本',
 '幽默',
 '相亲',
 '视频',
 '宋',
 '小宝',
 '不',
 '差钱',
 'nga',
 'NGA',
 '玩家',
 '社区',
 '阴阳师',
 '魔兽',
 '世界',
 '论坛',
 'ngauhung',
 '纯音乐',
 '王者',
 '荣耀',
 '原版',
 '快',
 '穿',
 '之',
 '男神',
 '攻略',
 '跟着',
 '炮灰',
 '跑',
 '守则',
 '不要',
 'h',
 '宝典',
 '是',
 '用来',
 '撩',
 '的',
 '手册',
 '组团',
 '来袭',
 '洗白',
 '勾勾',
 '来',
 '清风',
 'dj',
 '网',
 '音乐网',
 '音乐',
 '免费',
 '网站',
 'dj2018',
 '粤语',
 '串烧',
 '网页',
 '播放器',
 '乳腺',
 '增生',
 '增生症',
 '症状',
 '表现',
 '怎么',
 '治',
 '最好',
 '吃',
 '什么',
 '药',
 '怎么办',
 '不能',
 '结节',
 '严重',
 '吗',
 '禁忌',
 '冶疗',
 '更',
 '女性',
 '更年期',
 '综合症',
 '多',
 '推荐',
 '有',
 '哪些',
 '组词',
 '调理',
 '健康',
 '丰胸',
 '年龄',
 '更新',
 '补偿',
 '宝盒',
 '汽车',
 '家',
 '之家',
 '2018',
 '最新',
 '报价',
 '时刻表',
 '汽车票',
 '查询',
 '标志',
 '官网',
 '汽车报价',
 '农机',
 '通',
 '补贴',
 '价格表',
 '360',
 '通网',
 '市场',
 '农机配件',
 '价格',
 '银耳',
 '冰糖',
 '银耳汤',
 '做法',
 '红枣汤',
 '功效',
 '莲子',
 '羹',
 '莲子汤',
 '为什么',
 '天天',
 '袁阔成',
 '评书',
 '代表作',
 '丛书',
 '•',
 '春秋',
 '五霸',
 '三国演义',
 '365',
 '回',
 ')',
 '封神演义',
 '全',
 '200',
 '水泊梁山',
 '