In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from collections import OrderedDict
import os
from gensim.models import Word2Vec

In [2]:
df_train_data = pd.read_csv(
    './datasets/trainingset.csv'
)
df_val_data = pd.read_csv(
    './datasets/validationset.csv'
)
df_test_data = pd.read_csv(
    './datasets/testa.csv'
)

In [3]:
# 定义 aspect 关键词
# 这些关键词由20个二分类的 LightGBM 根据特征重要性得到
subjects = [
    '地铁站 地铁 地理位置 位置 公交车 公交车站 公交站',
    '百货 商圈 商场 广场 购物中心 城 商业街',
    '容易 位置 醒目 找到 找 地理位置 显眼',
    '小时 排队 等 排 排号 队 号',
    '态度 服务员 热情 服务态度 老板 服务 服务生',
    '开车 停车费 停车位 停 停车场 车位 泊车',
    '很快 催 慢 速度 分钟 上菜 等',
    '小贵 不贵 价位 原价 块钱 价格 性价比',
    '不划算 物有所值 不值 物美价廉 超值 性价比 实惠',
    '活动 团 霸王餐 代金券 团购 优惠 券',
    '装修 布置 灯光 古色古香 装饰 优雅 情调',
    '安静 环境 装修 氛围 嘈杂 吵闹 音乐',
    '大 宽敞 空间 面积 装修 拥挤 店面',
    '整洁 干净 环境 卫生 苍蝇 不错 脏',
    '吃不完 一份 量 量足 个头 好大 少',
    '入味 吃 不错 味道 好吃 口味 好喝',
    '造型 颜色 精致 卖相 好看 色香味 食欲',
    '推荐 强烈推荐 值得 强推 一试 极力推荐 菜品',
    '好 满意 纪念品 内地 之 肠 灌',
    '还会 机会 再 不会 来 值得 推荐',
]

In [4]:
subjects_eng = [
    'location_traffic_convenience',
    'location_distance_from_business_district',
    'location_easy_to_find',
    'service_wait_time',
    'service_waiters_attitude',
    'service_parking_convenience',
    'service_serving_speed',
    'price_level',
    'price_cost_effective',
    'price_discount',
    'environment_decoration',
    'environment_noise',
    'environment_space',
    'environment_cleaness',
    'dish_portion',
    'dish_taste',
    'dish_look',
    'dish_recommendation',
    'others_overall_experience',
    'others_willing_to_consume_again',
]

In [5]:
subjects_dict = OrderedDict(zip(subjects_eng, subjects))
max_aspect_len = len(subjects[0].split(' '))

In [6]:
w2v_path = './saved/word2vec.model'
w2v = Word2Vec.load(w2v_path)

In [8]:
def get_word2id(data_path,
                all_subjects,
                train_fname,
                val_fname,
                test_fname,
                w2v,
                pre_processed,
                save_fname,
                suffix='_words_list.txt'):
    '''构造 word id 映射'''
    save_fname = data_path + save_fname + '.txt'
    print(save_fname)
    word2id = {}
    max_len, max_aspect_len = 0, 0
    
    word2id['<pad>'] = 0
    for s in all_subjects:
        crt_len = 0
        for word in s.split(' '):
            if word in w2v:
                crt_len += 1
                if word not in word2id:
                    word2id[word] = len(word2id)
        max_aspect_len = max(crt_len, max_aspect_len)

    for file_path in [train_fname, val_fname, test_fname]:
        file_path = data_path + file_path + suffix
        with open(file_path) as f_r:
            for line in f_r:
                crt_len = 0
                for word in line.strip().split(' '):
                    if word in w2v:
                        crt_len += 1
                        if word not in word2id:
                            word2id[word] = len(word2id)
                max_len = max(crt_len, max_len)

    with open(save_fname, 'w') as fsave:
        fsave.write('%d %d\n' % (max_len, max_aspect_len))
        for item in sorted(word2id.items(), key=lambda x: x[1]):
            fsave.write(item[0] + ' ' + str(item[1]) + '\n')

    return word2id, max_len, max_aspect_len


word2id, max_context_len, max_aspect_len = get_word2id(
    './datasets/',
    subjects,
    'trainingset',
    'validationset',
    'testa',
    w2v,
    pre_processed=False,
    save_fname='vocab',
    suffix='_words_list.txt',
)

./datasets/vocab.txt




In [9]:
max_context_len

1113

In [10]:
max_aspect_len

7

In [11]:
def build_nn_context(data_path,
                     file_name,
                     word2id,
                     pre_processed,
                     context_max_len,
                     suffix='_words_list.txt'):
    if pre_processed:
        pass
    else:
        contexts, context_lens = [], []
        file_path = data_path + file_name + suffix
        with open(file_path) as f_r:
            for line in f_r:
                words = [
                    word2id[w] for w in filter(lambda x: x in word2id,
                                               line.strip().split(' '))
                ]
                if len(words) < context_max_len:
                    crt_content_lens = len(words)
                    words = words + [0] * (context_max_len - len(words))
                else:
                    crt_content_lens = context_max_len
                    words = words[:context_max_len]
                contexts.append(words)
                context_lens.append(crt_content_lens)
        return np.asarray(contexts), np.asarray(context_lens)


print('对评论编码')
train_context, train_context_lens = build_nn_context(
    './datasets/',
    'trainingset',
    word2id,
    pre_processed=False,
    context_max_len=max_context_len,
    suffix='_words_list.txt')
val_context, val_context_lens = build_nn_context(
    './datasets/',
    'validationset',
    word2id,
    pre_processed=False,
    context_max_len=max_context_len,
    suffix='_words_list.txt')
test_context, test_context_lens = build_nn_context(
    './datasets/',
    'testa',
    word2id,
    pre_processed=False,
    context_max_len=max_context_len,
    suffix='_words_list.txt')
print(train_context.shape, val_context.shape, test_context.shape)

对评论编码
(105000, 1113) (15000, 1113) (15000, 1113)


In [32]:
# # 将处理好的数据保存
# with open('./datasets/datasets.npy','wb') as f:
#     np.save(f, train_context)
#     np.save(f, val_context)
#     np.save(f, test_context)

In [14]:
print('计算不同类别、标签的权重')
cost_w = []
for col in subjects_eng:
    crt_w = [
        item[1]
        for item in sorted(dict(df_train_data[col].value_counts()).items(),
                           key=lambda x: x[0])
    ]
    crt_w = np.exp(1 * np.min(crt_w) / np.asarray(crt_w))
    crt_w /= sum(crt_w)
    cost_w.append(crt_w)
print(np.asarray(cost_w))

计算不同类别、标签的权重
[[0.14484863 0.31622786 0.38871107 0.15021244]
 [0.13910733 0.34324093 0.37573207 0.14191966]
 [0.15255294 0.27549624 0.40215741 0.16979341]
 [0.13548401 0.35643312 0.26204717 0.2460357 ]
 [0.17096094 0.37867264 0.27852493 0.17184149]
 [0.13316494 0.35713949 0.32596183 0.18373374]
 [0.1553022  0.23325269 0.41098311 0.200462  ]
 [0.16073924 0.34567436 0.2118392  0.28174719]
 [0.13667641 0.35784223 0.35080672 0.15467464]
 [0.17319994 0.45839697 0.18525573 0.18314737]
 [0.17149036 0.44802732 0.20647965 0.17400267]
 [0.15362657 0.40046625 0.27809964 0.16780755]
 [0.15764398 0.39271683 0.26750934 0.18212985]
 [0.14143206 0.35926299 0.34503813 0.15426681]
 [0.15001211 0.32786203 0.34505399 0.17707187]
 [0.32481704 0.37342225 0.1531231  0.14863762]
 [0.15119705 0.39415902 0.28615697 0.16848696]
 [0.1408024  0.32955758 0.3738688  0.15577121]
 [0.44597103 0.20542989 0.17951996 0.16907911]
 [0.15211315 0.29313344 0.39552705 0.15922636]]


In [34]:
def build_labels_ws(data_path, file_name, all_subjects, cost_w):
    data = pd.read_csv(data_path + file_name + '.csv',
                       usecols=all_subjects).values
    ans, ws = [], []
    for items in data:
        crt_ans, crt_ws = [], []
        for i, c in zip(items, cost_w):
            tmp = [0, 0, 0, 0] # 四分类
            tmp[i + 2] = 1
            crt_ans.append(tmp)
            crt_ws.append(c[i + 2])
        ans.append(crt_ans)  # 20 x 4
        ws.append(crt_ws) # 20
    return np.asarray(ans), np.asarray(ws)


print('构造每个sample的label，权重')
train_labels, train_ws = build_labels_ws(
    './datasets/',
    'trainingset',
    subjects_eng, cost_w)
val_labels, val_ws = build_labels_ws(
    './datasets/',
    'validationset',
    subjects_eng, cost_w)
print(train_labels.shape, train_ws.shape, val_labels.shape, val_ws.shape)

构造每个sample的label，权重
(105000, 20, 4) (105000, 20) (15000, 20, 4) (15000, 20)


In [38]:
# # 将处理好的数据保存
# with open('./datasets/labels.npy','wb') as f:
#     np.save(f, train_labels)
#     np.save(f, train_ws)
#     np.save(f, val_labels)
#     np.save(f, val_ws)  

In [17]:
train_labels[0], train_ws[0]

(array([[1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 0, 1],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 0, 1],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 0, 1],
        [1, 0, 0, 0],
        [0, 0, 0, 1],
        [1, 0, 0, 0]]),
 array([0.14484863, 0.13910733, 0.15255294, 0.13548401, 0.17184149,
        0.13316494, 0.1553022 , 0.16073924, 0.13667641, 0.18314737,
        0.17149036, 0.15362657, 0.15764398, 0.14143206, 0.15001211,
        0.32481704, 0.16848696, 0.1408024 , 0.16907911, 0.15211315]))

In [18]:
def load_word_embeddings(word2id, com_w2v, word_char_emb=False):
    com_w2v_embedding_dim = com_w2v.vector_size
    word2vec = {}
    fnl_word2vec = np.random.uniform(-0.01, 0.01,
                                     [len(word2id), com_w2v_embedding_dim])

    contain_w_cnt = 0
    for w, w_id in word2id.items():
        if w in com_w2v:
            contain_w_cnt += 1
            crt_word_vec = com_w2v[w]
        elif w in word2vec:
            crt_word_vec = word2vec[w]
        else:
            tmp_word_vec = np.random.uniform(-0.01, 0.01,
                                             [com_w2v_embedding_dim])
            word2vec[w] = tmp_word_vec
            crt_word_vec = tmp_word_vec
        fnl_word2vec[word2id[w]] = crt_word_vec

    print(len(word2vec))
    print('contain rate:%d/%d' % (contain_w_cnt, len(word2id)))
    fnl_word2vec[word2id['<pad>'], :] = 0
    return fnl_word2vec, com_w2v_embedding_dim

print('根据word2id构造对应的词向量')
embedding_matrix, embedding_dim = load_word_embeddings(word2id, w2v)
embedding_matrix = embedding_matrix.astype(np.float32)
print(embedding_matrix.shape)

根据word2id构造对应的词向量


  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


1
contain rate:127527/127528
(127528, 100)


In [39]:
# np.save('./saved/word_embed', embedding_matrix)

In [19]:
def build_aspect(word2id, all_subjects, aspect_max_len):
    subject_rst, subject_lens = [], []
    for subject in all_subjects:
        words = []
        for w in subject.split(' '):
            words.append(word2id[w])
        if len(words) > aspect_max_len:
            subject_lens.append(aspect_max_len)
            words = words[:aspect_max_len]
        else:
            subject_lens.append(len(words))
            words = words + [0] * (aspect_max_len - len(words))
        subject_rst.append(words)
    return np.asarray(subject_rst), np.asarray(subject_lens)


print('对aspect编码')
aspect_input, aspect_lens = build_aspect(word2id, subjects, max_aspect_len)
print(aspect_input.shape, aspect_lens.shape)

对aspect编码
(20, 7) (20,)


In [20]:
aspect_input

# 每个 aspect 由 7 个单词组成，每个单词由 7 个字符组成

array([[  1,   2,   3,   4,   5,   6,   7],
       [  8,   9,  10,  11,  12,  13,  14],
       [ 15,   4,  16,  17,  18,   3,  19],
       [ 20,  21,  22,  23,  24,  25,  26],
       [ 27,  28,  29,  30,  31,  32,  33],
       [ 34,  35,  36,  37,  38,  39,  40],
       [ 41,  42,  43,  44,  45,  46,  22],
       [ 47,  48,  49,  50,  51,  52,  53],
       [ 54,  55,  56,  57,  58,  53,  59],
       [ 60,  61,  62,  63,  64,  65,  66],
       [ 67,  68,  69,  70,  71,  72,  73],
       [ 74,  75,  67,  76,  77,  78,  79],
       [ 80,  81,  82,  83,  67,  84,  85],
       [ 86,  87,  75,  88,  89,  90,  91],
       [ 92,  93,  94,  95,  96,  97,  98],
       [ 99, 100,  90, 101, 102, 103, 104],
       [105, 106, 107, 108, 109, 110, 111],
       [112, 113, 114, 115, 116, 117, 118],
       [119, 120, 121, 122, 123, 124, 125],
       [126, 127, 128, 129, 130, 114, 112]])

In [56]:
# # 保存 aspect_input
# np.save("./datasets/aspects", aspect_input)

In [49]:
type(len(aspect_input))

int

In [50]:
type(w2v.vector_size)

int

In [51]:
aspect_embed = np.random.randn(-0.01, 0.01,
                               [len(aspect_input), w2v.vector_size])

TypeError: 'float' object cannot be interpreted as an integer

In [22]:
print('building char vector...')
max_char_len = 7
ch2v_path, ch2v_char2id_txt = './saved/char2vec.model', 'char2id'

building char vector...


In [23]:
def get_char2id(data_path,
                all_subjects,
                train_fname,
                val_fname,
                test_fname,
                w2v,
                pre_processed,
                save_fname,
                suffix='_char_list.txt'):
    '''构造 char id 映射'''
    save_fname = data_path + save_fname + '.txt'
    print(save_fname)
    word2id = {}
    max_len, max_aspect_len = 0, 0

    if pre_processed:
        crt_cnt = 0
        with open(save_fname) as f_r:
            for line in f_r:
                crt_cnt += 1
                if crt_cnt == 1:
                    max_len, max_aspect_len = line[:-1].split(' ')
                    max_len, max_aspect_len = int(max_len), int(max_aspect_len)
                else:
                    tmp = line[:-1].split(' ')
                    word2id[tmp[0]] = int(tmp[1])
    else:
        word2id['<pad>'] = 0
        for s in all_subjects:
            crt_len = 0
            for word in s:
                if len(word.strip()) == 0: continue
                if word in w2v:
                    crt_len += 1
                    if word not in word2id:
                        word2id[word] = len(word2id)
            max_aspect_len = max(crt_len, max_aspect_len)

        for file_path in [train_fname, val_fname, test_fname]:
            file_path = data_path + file_path + suffix
            with open(file_path) as f_r:
                for line in f_r:
                    crt_len = 0
                    for word in line.strip().split(' '):
                        if word in w2v:
                            crt_len += 1
                            if word not in word2id:
                                word2id[word] = len(word2id)
                    max_len = max(crt_len, max_len)

        with open(save_fname, 'w') as fsave:
            fsave.write('%d %d\n' % (max_len, max_aspect_len))
            for item in sorted(word2id.items(), key=lambda x: x[1]):
                fsave.write(item[0] + ' ' + str(item[1]) + '\n')
    return word2id, max_len, max_aspect_len


print('构造 char2id 映射')
ch2v = Word2Vec.load(ch2v_path)
print(len(ch2v.wv.vocab))
char2id, max_context_len_ch, max_aspect_len_ch = get_char2id(
    './datasets/',
    subjects,
    'trainingset',
    'validationset',
    'testa',
    ch2v,
    pre_processed=False,
    save_fname=ch2v_char2id_txt,
    suffix='_char_list.txt')
print(len(char2id), max_context_len_ch, max_aspect_len_ch)

构造 char2id 映射
6380
./datasets/char2id.txt




6381 1779 21


In [24]:
print('将每个词拆分为字并记录')
wid2char = {}
for word, w_id in word2id.items():
    add_item = []
    if len(word) > max_char_len:
        add_item = (
            np.asarray([char2id[c] if c in char2id else 0 for c in list(word)[:max_char_len]]),
            max_char_len)
    else:
        add_item = (
            np.asarray(
                [char2id[c] if c in char2id else 0 for c in list(word)] + [0] * (max_char_len - len(word))),
            len(word))
    wid2char[w_id] = add_item
wid2char[0] = (np.zeros(max_char_len), 0)

将每个词拆分为字并记录


In [25]:
wid2char

{0: (array([0., 0., 0., 0., 0., 0., 0.]), 0),
 1: (array([1, 2, 3, 0, 0, 0, 0]), 3),
 2: (array([1, 2, 0, 0, 0, 0, 0]), 2),
 3: (array([1, 4, 5, 6, 0, 0, 0]), 4),
 4: (array([5, 6, 0, 0, 0, 0, 0]), 2),
 5: (array([7, 8, 9, 0, 0, 0, 0]), 3),
 6: (array([7, 8, 9, 3, 0, 0, 0]), 4),
 7: (array([7, 8, 3, 0, 0, 0, 0]), 3),
 8: (array([10, 11,  0,  0,  0,  0,  0]), 2),
 9: (array([12, 13,  0,  0,  0,  0,  0]), 2),
 10: (array([12, 14,  0,  0,  0,  0,  0]), 2),
 11: (array([15, 14,  0,  0,  0,  0,  0]), 2),
 12: (array([16, 17, 18, 19,  0,  0,  0]), 4),
 13: (array([20,  0,  0,  0,  0,  0,  0]), 1),
 14: (array([12, 21, 22,  0,  0,  0,  0]), 3),
 15: (array([23, 24,  0,  0,  0,  0,  0]), 2),
 16: (array([25, 26,  0,  0,  0,  0,  0]), 2),
 17: (array([27, 28,  0,  0,  0,  0,  0]), 2),
 18: (array([27,  0,  0,  0,  0,  0,  0]), 1),
 19: (array([29, 30,  0,  0,  0,  0,  0]), 2),
 20: (array([31, 32,  0,  0,  0,  0,  0]), 2),
 21: (array([33, 34,  0,  0,  0,  0,  0]), 2),
 22: (array([35,  0,  0, 

In [26]:
print('对 aspect、评论 进行字编码')
aspect_input_ch = np.asarray([[wid2char[i][0] for i in asp] for asp in aspect_input])
aspect_input_ch_lens = np.asarray([[wid2char[i][1] for i in asp] for asp in aspect_input])
print(aspect_input_ch.shape, aspect_input_ch_lens.shape)

# 20个 aspect，每个 aspect 由 7 个单词组成，每个单词由 7 个字符组成

对 aspect、评论 进行字编码
(20, 7, 7) (20, 7)


In [27]:
train_context_ch = np.asarray([[wid2char[i][0] for i in asp] for asp in train_context])
train_context_ch_len = np.asarray([[wid2char[i][1] for i in asp] for asp in train_context])
print(train_context_ch.shape, train_context_ch_len.shape)

(105000, 1113, 7) (105000, 1113)


In [28]:
val_context_ch = np.asarray([[wid2char[i][0] for i in asp] for asp in val_context])
val_context_ch_len = np.asarray([[wid2char[i][1] for i in asp] for asp in val_context])
print(val_context_ch.shape, val_context_ch_len.shape)

(15000, 1113, 7) (15000, 1113)


In [29]:
test_context_ch = np.asarray([[wid2char[i][0] for i in asp] for asp in test_context])
test_context_ch_len = np.asarray([[wid2char[i][1] for i in asp] for asp in test_context])
print(test_context_ch.shape, test_context_ch_len.shape)

(15000, 1113, 7) (15000, 1113)


In [33]:
print('根据 char2id 构造对应的字向量')
embedding_matrix_ch, embedding_dim_ch = load_word_embeddings(char2id, ch2v)
embedding_matrix_ch = embedding_matrix_ch.astype(np.float32)
print(embedding_matrix_ch.shape)

根据 char2id 构造对应的字向量
1
contain rate:6380/6381
(6381, 100)


  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


In [35]:
print('loading feature...')
prefix = 'svd_tfidf_withP_80'
train_feas = np.load('./data/%s_train.npy' % prefix)
val_feas = np.load('./data/%s_val.npy' % prefix)
test_feas = np.load('./data/%s_test.npy' % prefix)
feature_eng_size = train_feas.shape[1]
print(feature_eng_size)

loading feature...
80
