In [1]:
from keras import Sequential
from keras.callbacks import Callback
from keras.layers import Embedding, np
from keras.utils import to_categorical
from keras_applications.densenet import layers
from keras_preprocessing import sequence
from keras_preprocessing.text import Tokenizer
from keras import backend as K

from sklearn.cross_validation import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import pandas as pd
import jieba

Using TensorFlow backend.


#### 1、读入数据

In [3]:
train_data = pd.read_csv('../data/trainingset/dealed_trainingset.csv')
val_data = pd.read_csv('../data/validationset/dealed_validationset.csv')
testa_data = pd.read_csv('../data/testa/dealed_testa.csv')

In [18]:
# 求每一个样本的最大长度
def get_maxlen(texts):
    maxlen = 0
    for line in texts:
        if maxlen < len(line):
            maxlen = len(line)
    return maxlen

# 利用keras的Tokenizer进行onehot，并调整未等长数组
def preprocess_data(texts):
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)
    data_w = tokenizer.texts_to_sequences(texts)
    data_T = sequence.pad_sequences(data_w, maxlen=get_maxlen(texts))
    print(data_T.shape)
    return data_T

def get_texts_maxlen(args):
    maxlen = args[0]
    for i in range(0, len(args)):
        if args[i] > maxlen:
            maxlen = args[i]
    return maxlen

In [8]:
# 数据划分，重新划分为训练集，测试集和验证集
onehot_train_texts = preprocess_data(train_data['texts'])
onehot_val_texts = preprocess_data(val_data['texts'])
onehot_test_texts = preprocess_data(testa_data['texts'])

(105000, 5069)
(15000, 4887)
(15000, 4887)


In [22]:
lens = [onehot_train_texts.shape[1], onehot_test_texts.shape[1], onehot_test_texts.shape[1]]
maxlen = get_texts_maxlen(lens)
maxlen

5069

In [11]:
y_cols = ['location_traffic_convenience',
 'location_distance_from_business_district',
 'location_easy_to_find',
 'service_wait_time',
 'service_waiters_attitude',
 'service_parking_convenience',
 'service_serving_speed',
 'price_level',
 'price_cost_effective',
 'price_discount',
 'environment_decoration',
 'environment_noise',
 'environment_space',
 'environment_cleaness',
 'dish_portion',
 'dish_taste',
 'dish_look',
 'dish_recommendation',
 'others_overall_experience',
 'others_willing_to_consume_again']

#### 2、建立模型

In [23]:
def build_model():
    model = Sequential()
    embedding_dim = 128
    max_words = 50000
    model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
    model.add(layers.Conv1D(64, 3, activation='relu'))
    model.add(layers.MaxPooling1D(5))
    model.add(layers.Conv1D(64, 3, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(4, activation='softmax'))
    return model

In [30]:
class Metrics(Callback):    
    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
#         val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_predict = np.argmax(np.asarray(self.model.predict(self.validation_data[0])), axis=1)
#         val_targ = self.validation_data[1]
        val_targ = np.argmax(self.validation_data[1], axis=1)
        _val_f1 = f1_score(val_targ, val_predict, average='macro')
#         _val_recall = recall_score(val_targ, val_predict)
#         _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
#         self.val_recalls.append(_val_recall)
#         self.val_precisions.append(_val_precision)
#         print('— val_f1: %f — val_precision: %f — val_recall %f' %(_val_f1, _val_precision, _val_recall))
        print(' — val_f1:' ,_val_f1)
        return 

    
def train_CV_CNN(train_x, test_x, val_x, y_cols=y_cols, debug=True, folds=2):
    model = build_model()
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    F1_scores = 0
    F1_score = 0
    metrics = Metrics()
    if debug:
        y_cols = ['location_traffic_convenience']
    for index, col in enumerate(y_cols):
        train_y = train_data[col] + 2
        val_y = val_data[col] + 2
        y_val_pred = 0
        y_test_pred = 0
        result = {}
        for i in range(1):
            X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.2, random_state=100 * i)
            y_train_onehot = to_categorical(y_train)
            y_test_onehot = to_categorical(y_test)
            history = model.fit(X_train, y_train_onehot, epochs=5, batch_size=128, 
                                validation_data=(X_test, y_test_onehot),callbacks=[metrics])

            # 预测验证集和测试集y_test_pred
            y_val_pred = model.predict(val_x)
            y_test_pred += model.predict(test_x)

            y_val_pred = np.argmax(y_val_pred, axis=1)

            F1_score = f1_score(y_val_pred, val_y, average='macro')
            F1_scores += F1_score

            print(col, 'f1_score:', F1_score, 'ACC_score:', accuracy_score(y_val_pred, val_y))
        y_test_pred = np.argmax(y_test_pred, axis=1)
        result[col] = y_test_pred - 2
    print('all F1_score:', F1_scores / len(y_cols))
    df = pd.DataFrame(result)
    df.to_csv('../data/result/result.csv')
    return result

In [None]:
train_CV_CNN(train_x=onehot_train_texts, val_x=onehot_val_texts, test_x=onehot_test_texts)