In [1]:
import tensorflow as tf
from utils import jieba, get_stop_word_set, load_word_embeddings, get_word2id
import pandas as pd
import numpy as np
import os
import time

  from ._conv import register_converters as _register_converters
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.445 seconds.
Prefix dict has been built succesfully.


In [2]:
# Hyper-parameters and data-shape settings shared by the cells below.
config = dict(
    # input / model dimensions
    embedding_dim=64,
    n_class=4,
    batch_size=1024,
    pre_processed=0,
    max_aspect_len=1,
    max_context_len=90,
    # optimisation
    n_epoch=5,
    n_hidden=100,
    learning_rate=0.01,
    l2_reg=0.001,
    dropout=0.5,
)

In [3]:
# Aspect names; one sample per (document, aspect) pair is generated from them later.
subjects = [
    '动力', '价格', '内饰', '配置', '安全性',
    '外观', '操控', '油耗', '空间', '舒适性',
]

# NOTE(review): presumably builds the vocabulary over the 'train' and 'test_public'
# files and reports the longest tokenised document -- confirm against utils.get_word2id.
word2id, max_len = get_word2id(
    '../data/', subjects, False, 'train', 'test_public',
    filter_stop_ws=True,
)
vocab_size = len(word2id)
print(vocab_size, max_len)

# Replace the placeholder context length with the value returned above.
config.update(max_context_len=max_len)

19325 127


In [4]:
def read_train_data(word2id, max_context_len, data_path, fname, all_aspect, pre_processed):
    '''Build one padded training sample per (document, aspect) pair.

    Reads ``data_path + fname + '.csv'`` (columns ``content_id``, ``content``,
    ``subject`` and ``sentiment_value``). Each distinct ``content_id`` is
    tokenised with jieba; stop words and whitespace-only tokens are dropped, the
    remaining tokens are mapped to ids through ``word2id`` and the id list is
    zero-padded or truncated to ``max_context_len``. Every document then yields
    one sample for each entry of ``all_aspect``, in that order.

    Side effect: the un-padded token ids and the ``{subject: sentiment_value}``
    dict of every document are written, one per line, to
    ``data_path + fname + '_info_without_stopword.txt'``.

    Args:
        word2id: mapping from token / aspect name to integer id. It must contain
            every token that survives filtering and every entry of ``all_aspect``.
        max_context_len: length each token-id row is padded or truncated to.
        data_path: directory prefix; it is concatenated as-is, so it must end
            with a path separator.
        fname: name of the CSV file without the ``.csv`` extension.
        all_aspect: aspect names to generate samples for.
        pre_processed: truthy to request cached data; this path is not implemented.

    Returns:
        Tuple of numpy arrays ``(contents, aspects, content_lens, aspects_locs, labels)``
        with one row per (document, aspect) pair:
          * contents      -- padded token ids, ``max_context_len`` wide
          * aspects       -- ``[word2id[aspect]]``
          * content_lens  -- number of real (non-padding) tokens
          * aspects_locs  -- ``1 / content_len`` on real tokens, ``0.`` on padding;
                             all zeros when the document has no tokens left
          * labels        -- 4-slot one-hot: slot ``sentiment_value + 1`` when the
                             aspect is labelled for the document, slot 3 otherwise
                             (assumes sentiment_value is in {-1, 0, 1} -- TODO confirm)

    Raises:
        NotImplementedError: if ``pre_processed`` is truthy.
        KeyError: if a token or aspect is missing from ``word2id``.
    '''
    if pre_processed:
        # Previously this branch fell through to the return statement and failed
        # on unbound local variables; fail with an explicit message instead.
        raise NotImplementedError('reading pre-processed data is not implemented')

    stop_words = get_stop_word_set()
    data = pd.read_csv(data_path + fname + '.csv')

    # content_id -> {subject: sentiment_value}; a later row wins on duplicates.
    id2label = {}
    for content_id, subject, sentiment in data[['content_id', 'subject', 'sentiment_value']].values:
        id2label.setdefault(content_id, {})[subject] = sentiment

    # Keep one row per document; the labels were already collected above.
    data = data.drop_duplicates(subset='content_id').reset_index(drop=True)

    aspects, contents, labels, content_lens, aspects_locs = [], [], [], [], []
    # The context manager guarantees the dump file is flushed and closed.
    with open(data_path + fname + '_info_without_stopword.txt', 'w', encoding='utf-8') as f_w:
        for content_id, content in data[['content_id', 'content']].values:
            words = [word2id[tok] for tok in jieba.cut(content.strip())
                     if tok not in stop_words and len(tok.strip()) > 0]
            f_w.write(str(words) + '\n')

            crt_content_lens = min(len(words), max_context_len)
            n_pad = max_context_len - crt_content_lens
            words = words[:max_context_len] + [0] * n_pad
            if crt_content_lens > 0:
                # Uniform weight over the real tokens, zero over the padding.
                crt_locs = [1 / crt_content_lens] * crt_content_lens + [0.] * n_pad
            else:
                # No token survived filtering: avoid dividing by zero.
                crt_locs = [0.] * max_context_len

            crt_subject_value = id2label[content_id]
            f_w.write(str(crt_subject_value) + '\n')
            for suj in all_aspect:
                if suj in crt_subject_value:
                    crt_label = [0] * 4
                    crt_label[crt_subject_value[suj] + 1] = 1
                else:
                    # Aspect not labelled for this document.
                    crt_label = [0] * 3 + [1]
                contents.append(words)
                aspects.append([word2id[suj]])
                content_lens.append(crt_content_lens)
                aspects_locs.append(crt_locs)
                labels.append(crt_label)

    return (np.asarray(contents), np.asarray(aspects), np.asarray(content_lens),
            np.asarray(aspects_locs), np.asarray(labels))
            
# Build the full training set: one row per (document, aspect) pair.
train = read_train_data(
    word2id=word2id,
    max_context_len=config['max_context_len'],
    data_path='../data/',
    fname='train',
    all_aspect=subjects,
    pre_processed=False,
)
n_samples = len(train[0])
print(n_samples)

82900


In [5]:
# def read_test_data(word2id, max_context_len, data_path, fname, all_aspect, pre_processed):
#     '''return aspects, contents, aspect_lens, content_lens'''
#     if pre_processed:
#         pass
#     else:
#         stop_words = get_stop_word_set()
#         aspects, contents, content_lens = [], [], []
#         data = pd.read_csv(data_path + fname + '.csv')
#         data = data.drop_duplicates('content_id')
#         data.reset_index(drop=True, inplace=True)
#         f_w = open(data_path + fname + '_info_without_stopword.txt', 'w')
#         for item in data[['content_id', 'content']].values:
#             words = [word2id[x] for x in filter(lambda x:x not in stop_words and len(x.strip())>0, jieba.cut(item[1].strip()))]
#             f_w.write(str(words) + '\n')
#             if len(words) < max_context_len:
#                 crt_content_lens = len(words)
#                 words = words + [0] * (max_context_len-len(words))
#             else:
#                 crt_content_lens = max_context_len
#                 words = words[:max_context_len]
            
#             for suj in all_aspect:
#                 contents.append(words)
#                 aspects.append([word2id[suj]])
#                 content_lens.append(crt_content_lens)
    
#     return np.asarray(aspects), np.asarray(contents), np.ones(len(aspects)), np.asarray(content_lens)
# test = read_test_data(word2id, config['max_context_len'], '../data/', 'test_public',subjects, False)
# print(len(test[0]))

In [5]:
# Store the embedding matrix and its dimensionality in the shared config.
# NOTE(review): presumably loads pre-trained 64-dimensional vectors for the words
# in word2id -- confirm against utils.load_word_embeddings.
emb_matrix, emb_dim = load_word_embeddings(word2id, is_64_dim=True)
config.update(embedding_matrix=emb_matrix, embedding_dim=emb_dim)

In [6]:
# Settings for this training run; they override or extend the defaults set earlier.
config.update(
    early_stop=1,
    n_epoch=20,
    learning_rate=0.01,
    n_hidden=70,
    batch_size=1024,
    n_hop=1,
)

In [7]:
from sklearn.model_selection import train_test_split

# `train` holds len(subjects) consecutive rows per document (one per aspect), so
# the split is drawn over document indices and then expanded to row indices. All
# rows of a document therefore land on the same side of the split. The group size
# is derived from `subjects` instead of a hard-coded 10, so changing the aspect
# list cannot silently misalign the expansion.
n_aspect = len(subjects)
n_doc = len(train[0]) // n_aspect
train_flag, val_flag = train_test_split(range(n_doc), test_size=0.2, random_state=2018)

# Row indices of document i are i * n_aspect + [0, ..., n_aspect - 1].
aspect_offsets = np.arange(n_aspect)
train_flag = (np.asarray(train_flag, dtype=int)[:, None] * n_aspect + aspect_offsets).ravel()
val_flag = (np.asarray(val_flag, dtype=int)[:, None] * n_aspect + aspect_offsets).ravel()

In [13]:
# `imp` is deprecated and was removed in Python 3.12; importlib.reload is the
# supported equivalent.
from importlib import reload
import RAM_model
reload(RAM_model)  # re-import so edits to RAM_model.py are picked up
from RAM_model import RAM

# sess.close()

# Start from an empty default graph so re-running this cell does not add the
# model's ops to the graph a second time.
tf.reset_default_graph()
sess = tf.Session()

# NOTE(review): presumably the number of samples per label slot (the four values
# sum to 82900, the sample count printed above) -- confirm.
# 1616,  6661,  1670, 72953
model = RAM(config, sess)
model.build_model()

# Select the same rows from each of the arrays returned by read_train_data.
train_split = tuple(part[train_flag] for part in train)
val_split = tuple(part[val_flag] for part in train)
model.run(train_split, val_split)
# Swapped variant (train on the validation rows, evaluate on the training rows):
# model.run(val_split, train_split)

Training ...
[0.24395107 0.24702409 0.25125816 0.25776672]
[0.19191936 0.2270088  0.20928296 0.3717889 ]
[0.00575636 0.05072621 0.00556204 0.93795544]
[0.1223937  0.2770491  0.12818941 0.47236776]
[0.14378841 0.3316381  0.15220656 0.37236688]
[0.12705709 0.35429093 0.13852076 0.3801312 ]
[0.14053017 0.3529563  0.14413314 0.36238042]
[0.08569728 0.36931267 0.0986122  0.44637778]
[0.04917261 0.2909787  0.06370202 0.59614664]
[0.04498645 0.26422012 0.05276005 0.6380334 ]
[0.08526476 0.2560477  0.10068561 0.55800194]
[0.10497721 0.28438407 0.09403244 0.51660633]
[0.07955251 0.29471582 0.09166328 0.5340684 ]
[0.08117111 0.2962359  0.08839735 0.53419566]
[0.07512515 0.2851086  0.08501395 0.55475235]
[0.12824601 0.23114802 0.13996454 0.50064147]
[0.05426405 0.2911977  0.05567986 0.5988584 ]
[0.05492367 0.27844402 0.04633115 0.6203012 ]
[0.10164973 0.2452534  0.10785585 0.54524106]
[0.04076622 0.29321396 0.02827565 0.6377442 ]
[0.10300025 0.28556693 0.0737497  0.5376832 ]
[0.08927833 0.2828143

KeyboardInterrupt: 