In [1]:
# 訊息辨識模型 - 資料前處理

In [2]:
import numpy
import pandas as pd

In [3]:
# Read all column data from the Excel file into a DataFrame.
# NOTE(review): data_path is an absolute, machine-specific path; the notebook
# cannot be re-run on another machine without editing it.
data_path = "/Users/PChomeIM/pywork/Dataset/im_ds.xlsx"
data_frame = pd.read_excel(data_path)

In [4]:
# Column legend:
#   question : the user's question text
#   flag     : category of the question
#       0 -> IM question
#       1 -> EC question
#       2 -> logistics question
data_frame.head(2)  # show the first 2 rows

Unnamed: 0,question,flag
0,如何將官方帳號加為好友,0
1,請問如何領取50元積點,0


In [5]:
# 2072 rows and 2 columns in total
data_frame.shape

(2072, 2)

In [6]:
# For every column, count how many rows contain a null value
data_frame.isnull().sum()

question    0
flag        0
dtype: int64

In [7]:
# Convert the DataFrame to a NumPy ndarray
nd_array = data_frame.values

In [8]:
# Pull the features (column 0) and the labels (column 1) out of the array,
# each converted to a plain Python list.
texts, labels = (nd_array[:, column].tolist() for column in (0, 1))

In [9]:
# Raw data: one message as loaded, before any filtering
texts[2053]

'感恩買1送1 讚嘆買1送1 😍\n9/21-9/30超優惠5折下殺\n人氣超夯商品 一次購足\n搶券滿$1000 再現折$100\n👉https://goo.gl/1Y7pH1 \n\n✔KAFEN蝸牛極致洗髮精/護髮素↘2入$945\nhttps://goo.gl/RXEHjq\n✔IS愛思 皇家騎士II通話平板↘2入$5980\nhttps://goo.gl/nvYzve\n✔美國PRIME板腱沙朗↘2入$649\nhttps://goo.gl/2He6Vj\n✔途訊K068藍牙掌上KTV↘2入$1988\nhttps://goo.gl/8VaJG3\n✔ Diadora男女通用慢跑鞋↘2雙$1880\nhttps://goo.gl/4sKcQ1\n✔億光LED 9.5W廣角燈泡↘2入$188\nhttps://goo.gl/E75TyX\n✔點我看更多買1送1\nhttps://goo.gl/MGF7wa\n\n#PChome商店街\n#四大超商取貨0元免運\n#限時買1送1\n#滿1000現折100\n11:00\n怎樣可折\n還有嗎'

In [10]:
# 將訊息中的中文字過濾出來
import re
def filter_out_chinese_letters(sentences):
    """Return the sentences reduced to their Chinese characters only.

    Despite the name, Chinese characters are what is *kept*: every character
    outside the Unicode range U+4E00-U+9FA5 (digits, Latin letters,
    punctuation, whitespace, emoji, URLs, ...) is removed.

    Args:
        sentences: iterable of sentences. Items that are not strings are
            converted with ``str()`` before filtering.

    Returns:
        list of str: a new list holding one filtered string per input item,
        in input order. The input itself is not modified, so the raw data
        stays available and re-running the cell is harmless.
    """
    # Matches any single character that is NOT in the Chinese range.
    non_chinese_re = re.compile('[^\u4E00-\u9FA5]')
    return [non_chinese_re.sub('', str(sentence)) for sentence in sentences]

# From here on, texts holds the messages reduced to their Chinese characters.
texts = filter_out_chinese_letters(texts)

In [11]:
# Processed data: the message at the same index after filtering
texts[2053]

'感恩買送讚嘆買送超優惠折下殺人氣超夯商品一次購足搶券滿再現折蝸牛極致洗髮精護髮素入愛思皇家騎士通話平板入美國板腱沙朗入途訊藍牙掌上入男女通用慢跑鞋雙億光廣角燈泡入點我看更多買送商店街四大超商取貨元免運限時買送滿現折怎樣可折還有嗎'

In [12]:
from keras.preprocessing.text import Tokenizer

# Character-level tokenizer: char_level=True makes every single character a
# token, and num_words=800 restricts the tokens used to the most frequent ones.
# NOTE(review): the Keras docs state that only the num_words-1 most common
# tokens are kept, i.e. 799 rather than 800 - confirm this is intended.
tokenizer = Tokenizer(char_level=True, num_words=800)
tokenizer.fit_on_texts(texts)
# word_index maps character -> integer index; the recorded output contains
# indices above 800, so the mapping itself is not cut off at num_words.
tokenizer.word_index

Using TensorFlow backend.


{'門': 401,
 '完': 198,
 '律': 1146,
 '進': 134,
 '液': 902,
 '培': 819,
 '篷': 1330,
 '微': 1346,
 '喇': 891,
 '寸': 378,
 '或': 361,
 '添': 1007,
 '蹄': 1270,
 '檢': 983,
 '鑰': 1396,
 '可': 15,
 '到': 6,
 '淨': 1301,
 '瑛': 1174,
 '神': 1098,
 '么': 624,
 '果': 196,
 '先': 284,
 '堆': 1031,
 '忠': 929,
 '兒': 405,
 '破': 562,
 '性': 778,
 '攝': 985,
 '區': 187,
 '將': 383,
 '關': 186,
 '驅': 945,
 '有': 5,
 '匙': 1397,
 '測': 516,
 '骨': 1297,
 '損': 654,
 '露': 1011,
 '成': 97,
 '久': 146,
 '叭': 892,
 '什': 36,
 '夠': 462,
 '芝': 1392,
 '併': 946,
 '紀': 426,
 '射': 1351,
 '桌': 498,
 '耽': 1153,
 '倒': 894,
 '街': 108,
 '孟': 1210,
 '掛': 616,
 '耳': 660,
 '廚': 1056,
 '瓣': 1339,
 '良': 838,
 '景': 1016,
 '安': 237,
 '量': 597,
 '剩': 1254,
 '舒': 679,
 '減': 1220,
 '疏': 1357,
 '速': 574,
 '抗': 1070,
 '固': 526,
 '障': 594,
 '芳': 721,
 '桶': 908,
 '改': 126,
 '言': 647,
 '務': 432,
 '路': 137,
 '礙': 1233,
 '格': 603,
 '鍋': 664,
 '簡': 349,
 '聰': 1166,
 '建': 688,
 '閱': 1010,
 '圈': 1171,
 '廣': 357,
 '啊': 217,
 '灯': 1026,
 '回': 80,
 '杖': 1276,
 '郵': 403,

In [13]:
# Using the fitted tokenizer, convert each text into a sequence of integer indices.
sequences = tokenizer.texts_to_sequences(texts)
sequences[0]

[22, 12, 383, 79, 71, 31, 17, 39, 41, 19, 69]

In [14]:
# To train the model, all sequences must have the same length (20 here).
# Shorter sequences are padded with 0 at the front, as the recorded output shows.
# NOTE(review): per the Keras docs the default truncating='pre' drops the
# beginning of sequences longer than maxlen - confirm that is the desired side.
from keras.preprocessing import sequence
train_features = sequence.pad_sequences(sequences, maxlen=20)
train_features[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,  22,  12, 383,  79,
        71,  31,  17,  39,  41,  19,  69], dtype=int32)

In [15]:
# Convert the integer labels to one-hot vectors
from keras.utils import np_utils
train_labels = np_utils.to_categorical(labels)
train_labels[0]

array([ 1.,  0.,  0.])

In [16]:
# Save the fitted tokenizer to disk with pickle.
# NOTE(review): tokenizer_path is an absolute, machine-specific path.
# NOTE(review): only load this pickle back from a trusted location;
# unpickling untrusted files can execute arbitrary code.
import pickle
tokenizer_path = "/Users/PChomeIM/pywork/SaveModel/Tokenizer.pickle"
with open(tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)