In [75]:
# Load the training data
import pandas as pd
TRAIN_CSV_PATH = 'train.csv'
cols = ['title1_zh', 'title2_zh', 'label']
# Keep only the two Chinese title columns and the label, then drop rows
# that have a missing value in any of them.
train = (
    pd.read_csv(TRAIN_CSV_PATH, index_col=0, encoding='utf8')
    .loc[:, cols]
    .dropna()
)

In [85]:
import jieba.posseg as pseg

def jieba_tokenizer(text):
    """Segment ``text`` with ``pseg.cut`` and return the kept words joined by spaces.

    Every (word, flag) pair whose flag equals ``'x'`` is discarded; the
    remaining words are joined with a single space, in their original order.
    """
    kept_words = (token for token, pos in pseg.cut(text) if pos != 'x')
    return ' '.join(kept_words)


# Word-segment each title column and store the result in a new column.
for source_col, target_col in (
    ('title1_zh', 'title1_tokenizer'),
    ('title2_zh', 'title2_tokenizer'),
):
    train[target_col] = train[source_col].apply(jieba_tokenizer)
train.head(3)


Unnamed: 0_level_0,title1_zh,title2_zh,label,title1_tokenizer,title2_tokenizer
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,unrelated,2017 养老保险 又 新增 两项 农村 老人 人人 可 申领 你 领到 了 吗,警方 辟谣 鸟巢 大会 每人 领 5 万 仍 有 老人 坚持 进京
3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,unrelated,你 不 来 深圳 早晚 你 儿子 也 要 来 不出 10 年 深圳 人均 GDP 将 超 香港,深圳 GDP 首 超 香港 深圳 统计局 辟谣 只是 差距 在 缩小
1,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,unrelated,你 不 来 深圳 早晚 你 儿子 也 要 来 不出 10 年 深圳 人均 GDP 将 超 香港,GDP 首 超 香港 深圳 澄清 还 差 一点点


In [86]:
# Segmentation takes a long time, so save the result to disk first.
train.to_csv('output.csv', encoding='utf-8-sig')

In [88]:
from tensorflow import keras
MAX_NUM_WORDS = 10000
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS)

# Build the dictionary from both title columns combined.
corpus1 = train['title1_tokenizer']  # news A
corpus2 = train['title2_tokenizer']  # news B
corpus = pd.concat([corpus1, corpus2])

# Fit the tokenizer on the combined corpus, then convert each column
# of space-separated words into lists of integer ids.
tokenizer.fit_on_texts(corpus)
Anews_train, Bnews_train = (
    tokenizer.texts_to_sequences(texts) for texts in (corpus1, corpus2)
)


[[217, 1268, 32, 1178, 5967, 25, 489, 2877, 116, 5559, 4, 1850, 2, 13]]

In [92]:
# Sanity check: show the first id sequence and the words it maps back to.
first_sequences = Anews_train[:1]
print(first_sequences)
for token_ids in first_sequences:
    decoded_words = [tokenizer.index_word[token_id] for token_id in token_ids]
    print(decoded_words)

[[217, 1268, 32, 1178, 5967, 25, 489, 2877, 116, 5559, 4, 1850, 2, 13]]
['2017', '养老保险', '又', '新增', '两项', '农村', '老人', '人人', '可', '申领', '你', '领到', '了', '吗']


In [97]:
# Sentences differ in length, so every sequence is padded to a fixed length.
max_len = 15  # maximum length: shorter sequences are zero-padded, longer ones are cut
Anews_train, Bnews_train = [
    keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len)
    for sequences in (Anews_train, Bnews_train)
]

In [132]:
# Labels: map each class name to an integer id, then one-hot encode.
import numpy as np
label_to_index = {'unrelated': 0, 'agreed': 1, 'disagreed': 2}
label_ids = np.asarray(train.label.apply(lambda x: label_to_index[x])).astype('float32')
# Pin num_classes so the one-hot width always equals the number of known
# labels (and therefore the 3-unit softmax layer), even when the highest
# class id happens to be absent from the loaded data.
labels_onehot = keras.utils.to_categorical(label_ids, num_classes=len(label_to_index))

# Build the training / validation sets.
from sklearn.model_selection import train_test_split

# The full label matrix keeps its own name so that y_train only ever refers
# to the training split.
x1_train, x1_val, x2_train, x2_val, y_train, y_val = train_test_split(
    Anews_train,
    Bnews_train,
    labels_onehot,
    test_size=0.1,
    random_state=42,
)

In [134]:
from tensorflow.keras import Input
from tensorflow.keras.layers import Embedding, LSTM, concatenate, Dense
from tensorflow.keras.models import Model
#tensorflow version 2.3


# Inputs: two integer-id sequences (news A and news B) of the same fixed length.
# The length is taken from max_len so it cannot drift from the padding step.
seq_lenght = max_len
anews_input = Input(shape=(seq_lenght,), dtype='int32')
bnews_input = Input(shape=(seq_lenght,), dtype='int32')

# Embedding: one layer shared by both inputs. Its vocabulary size reuses
# MAX_NUM_WORDS, the same limit that was given to the text tokenizer.
embedding_layer = Embedding(MAX_NUM_WORDS, 256)
anews_mebedding = embedding_layer(anews_input)
bnews_mebedding = embedding_layer(bnews_input)

# LSTM: a single layer applied to both embedded titles (shared weights).
share_Lstm = LSTM(128)
anews_lstm = share_Lstm(anews_mebedding)
bnews_lstm = share_Lstm(bnews_mebedding)

# Concatenate the two title encodings along the feature axis.
concat = concatenate([anews_lstm, bnews_lstm], axis=-1)

# Dense: 3-way softmax over the label classes.
dense = Dense(units=3, activation='softmax')
predictions = dense(concat)

# Model
model = Model(inputs=[anews_input, bnews_input], outputs=predictions)

# summary() prints the table itself and returns None, so it is not wrapped
# in print() (which would add a stray "None" line).
model.summary()

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, 15)]         0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           [(None, 15)]         0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 15, 256)      2560000     input_9[0][0]                    
                                                                 input_10[0][0]                   
__________________________________________________________________________________________________
lstm_4 (LSTM)                   (None, 128)          197120      embedding_4[0][0]     

In [151]:
# Train on the two padded title inputs and evaluate on the held-out split
# after every epoch.
result = model.fit(
    x=[x1_train, x2_train],
    y=y_train,
    batch_size=512,
    epochs=10,
    validation_data=([x1_val, x2_val], y_val),
    shuffle=True,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [136]:
# Save the trained model to the path 'lstm_fakenews'.
model.save('lstm_fakenews')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: lstm_fakenews/assets


In [145]:
# Test dataset
import pandas as pd
test = pd.read_csv('test.csv', index_col=0, encoding='utf8')
# Only the two Chinese title columns are used below, so drop a row only when
# one of those is missing. A blanket dropna() would also discard rows because
# of gaps in columns that are never read.
test = test.dropna(subset=['title1_zh', 'title2_zh'])
import jieba.posseg as pseg

def jieba_tokenizer(text):
    """Segment ``text`` with ``pseg.cut`` and return the kept words joined by spaces.

    Pairs whose flag is ``'x'`` are skipped; all other words are kept in order.

    NOTE(review): this repeats the ``jieba_tokenizer`` definition from the
    training section of the notebook; the two should stay identical so that
    train and test text are segmented the same way.
    """
    kept = []
    for token, pos in pseg.cut(text):
        if pos == 'x':
            continue
        kept.append(token)
    return ' '.join(kept)

# Word-segment both test title columns.
test['title1_tokenizer'] = test.title1_zh.apply(jieba_tokenizer)
test['title2_tokenizer'] = test.title2_zh.apply(jieba_tokenizer)

# Text -> integer id sequences, using the tokenizer fitted on the training
# corpus. This must be texts_to_sequences: sequences_to_texts is the inverse
# mapping (ids -> words) and does not produce id sequences from text.
x1_test = tokenizer.texts_to_sequences(test['title1_tokenizer'])
x2_test = tokenizer.texts_to_sequences(test['title2_tokenizer'])

# Pad to the same fixed length the training inputs were padded to.
max_len = 15
x1_test = keras.preprocessing.sequence.pad_sequences(x1_test, maxlen=max_len)
x2_test = keras.preprocessing.sequence.pad_sequences(x2_test, maxlen=max_len)

predictions = model.predict([x1_test, x2_test])