https://leemeng.tw/shortest-path-to-the-nlp-world-a-gentle-guide-of-natural-language-processing-and-deep-learning-for-everyone.html#%E9%80%B2%E8%A1%8C%E9%A0%90%E6%B8%AC%E4%B8%A6%E6%8F%90%E4%BA%A4%E7%B5%90%E6%9E%9C_1

In [6]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.optimizers import Adam, RMSprop, SGD

from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import re
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import jieba.posseg as pseg
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

import threading

In [7]:
# 參數設定
TRAIN_DATA = pd.read_csv('train.csv', index_col=0)
TEST_DATA = pd.read_csv('test.csv', index_col=0)

# 序列最長的長度
MAX_SEQUENCE_LENGTH = 27

# 在語料庫裡有多少詞彙
MAX_NUM_WORDS = 20000

# 一個詞向量的維度
NUM_EMBEDDING_DIM = 128

# Train/Valid ratio
VALIDATION_SPLIT = 0.2

# 基本參數設置，有幾個分類
NUM_CLASSES = 30

# LSTM 輸出的向量維度
NUM_LSTM_UNITS = 128

## 使用 Jieba 新聞標題切割成一個個有意義的詞彙

In [8]:
#Tokenization of text
tokenizer=ToktokTokenizer()
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

In [9]:
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text

#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

def jieba_tokenizer(text):
    words = pseg.cut(text)
    return ' '.join([word for word, flag in words if flag != 'x'])

In [22]:
TRAIN_DATA.iloc[:, [0, 1]].head(3)

Unnamed: 0_level_0,headline,short_description
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Petition Wants A Statue Of Missy Elliott To Re...,"""Together we can put white supremacy down, fli..."
1,This Law School Created A Criminal Justice Cla...,Long live Omar's code!
2,Mom Of Racist-Ranting Alabama Student Says She...,Sorority sister Harley Barber was expelled aft...


In [11]:
def task1(TRAIN_DATA):

    TRAIN_DATA['headline_tokenized'] = TRAIN_DATA.loc[:, 'headline'].apply(remove_between_square_brackets, remove_special_characters)
    TRAIN_DATA['headline_tokenized'] = TRAIN_DATA['headline_tokenized'].apply(simple_stemmer, remove_stopwords)
    TRAIN_DATA['headline_tokenized'] = TRAIN_DATA['headline_tokenized'].apply(jieba_tokenizer)

    return TRAIN_DATA['headline_tokenized']

def task2(TRAIN_DATA):

    TRAIN_DATA['short_description_tokenized'] = TRAIN_DATA.loc[:, 'short_description'].apply(remove_between_square_brackets, remove_special_characters)
    TRAIN_DATA['short_description_tokenized'] = TRAIN_DATA['short_description_tokenized'].apply(simple_stemmer, remove_stopwords)
    TRAIN_DATA['short_description_tokenized'] = TRAIN_DATA['short_description_tokenized'].apply(jieba_tokenizer)

    return TRAIN_DATA['short_description_tokenized']

In [12]:
a = threading.Thread(target=task1(TRAIN_DATA))  # 建立新的執行緒
b = threading.Thread(target=task2(TRAIN_DATA))  # 建立新的執行緒

a.start()
b.start()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\liu76\AppData\Local\Temp\jieba.cache
Loading model cost 0.632 seconds.
Prefix dict has been built successfully.
Exception in thread Thread-3:
Traceback (most recent call last):
  File "c:\Users\liu76\.conda\envs\tensorflow\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\liu76\.conda\envs\tensorflow\lib\threading.py", line 953, in run
Exception in thread Thread-4:
Traceback (most recent call last):
  File "c:\Users\liu76\.conda\envs\tensorflow\lib\threading.py", line 1016, in _bootstrap_inner
    self._target(*self._args, **self._kwargs)
TypeError: 'Series' object is not callable
    self.run()
  File "c:\Users\liu76\.conda\envs\tensorflow\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
TypeError: 'Series' object is not callable


In [19]:
TRAIN_DATA.iloc[:, [3, 4]].head(3)

Unnamed: 0_level_0,headline_tokenized,short_description_tokenized
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,petit want statu of missi elliott to replac co...,togeth we can put white supremaci down flip it...
1,thi law school creat crimin justic class base ...,long live omar code
2,mom of racist alabama student say she didn rai...,soror sister harley barber wa expel after vile...


## 建立字典並將文本轉成數字序列

可以分 4 個步驟將手上的新聞標題全部轉為數字序列：

1. 將已被斷詞的新聞標題 A 以及新聞標題 B 全部倒在一起
2. 建立一個空字典
3. 查看所有新聞標題，裏頭每出現一個字典裡頭沒有的詞彙，就為該詞彙指定一個字典裡頭還沒出現的索引數字，並將該詞彙放入字典
4. 利用建好的字典，將每個新聞標題裡頭包含的詞彙轉換成數字

In [None]:
# 將一段文字轉換成一系列的詞彙（Tokens），並為其建立字典。
# 這邊的 num_words=20000 代表我們限制字典只能包含 20,000 個詞彙
# 一旦字典達到這個大小以後，剩餘的新詞彙都會被視為 Unknown，以避免字典過於龐大。
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

In [None]:
corpus_x1 = TRAIN_DATA.headline_tokenized
corpus_x2 = TRAIN_DATA.short_description_tokenized
corpus = pd.concat([corpus_x1, corpus_x2])

pd.DataFrame(corpus.iloc[:5], columns=['headline'])

In [None]:
# 有了語料庫以後，接下來就是呼叫 tokenizer 為我們查看所有文本，並建立一個字典（步驟 2 & 3）：
tokenizer.fit_on_texts(corpus)

In [None]:
# 進行上述第 4 個步驟，請 tokenizer 利用內部生成的字典分別將我們的新聞標題 A 與 新聞 B 轉換成數字序列：
x1_train = tokenizer.texts_to_sequences(corpus_x1)
x2_train = tokenizer.texts_to_sequences(corpus_x2)

In [None]:
# 利用 tokenizer.index_word 來將索引數字對應回本來的詞彙：
for seq in x1_train[:1]:
    print([tokenizer.index_word[idx] for idx in seq])

## 序列的 Zero Padding

In [None]:
# 雖然我們已經將每個新聞標題的文本轉為一行行的數字序列，你會發現每篇標題的序列長度並不相同：
for seq in x1_train[:5]:
    print(len(seq), seq[:5], ' ...')

In [None]:
max_seq_len = max([len(seq) for seq in x1_train])
print("最長的序列甚至達到 " + str(max_seq_len) + " 個詞彙!!")

In [None]:
# 長度超過此數字的序列尾巴會被刪掉；而針對原來長度不足的序列，我們則會在詞彙前面補零。
x1_train = pad_sequences(x1_train, maxlen=MAX_SEQUENCE_LENGTH)
x2_train = pad_sequences(x2_train, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
for seq in x1_train + x2_train:
    assert len(seq) == 27
    
print("所有新聞標題的序列長度皆為 27 !")

In [None]:
x1_train[:3]

## 將正解做 One-hot Encoding (處理Label的部分)

In [None]:
# 定義每一個分類對應到的索引數字
label_to_index = {
    'BLACK VOICES':0, 
    'BUSINESS':1, 
    'WOMEN':2, 
    'WELLNESS':3, 
    'WEIRD NEWS':4, 
    'WEDDINGS':5, 
    'TRAVEL':6, 
    'THE WORLDPOST':7, 
    'TECH':8, 
    'TASTE':9,
    'STYLE & BEAUTY':10, 
    'STYLE':11, 
    'SPORTS':12, 
    'SCIENCE':13, 
    'RELIGION':14, 
    'QUEER VOICES':15, 
    'POLITICS':16, 
    'PARENTS':17, 
    'PARENTING':18, 
    'MEDIA':19,
    'IMPACT':20, 
    'HOME & LIVING':21, 
    'HEALTHY LIVING':22, 
    'GREEN':23, 
    'FOOD & DRINK':24, 
    'ENTERTAINMENT':25, 
    'DIVORCE':26, 
    'CRIME':27, 
    'COMEDY':28, 
    'WORLD NEWS':29
}

# 將分類標籤對應到剛定義的數字
y_train = TRAIN_DATA.category.apply(lambda x: label_to_index[x])
y_train = np.asarray(y_train).astype('float32')
y_train[5:]

In [None]:
y_train = tf.keras.utils.to_categorical(y_train)

## 切割訓練資料集 & 驗證資料集

In [None]:
VALIDATION_RATIO = 0.1
RANDOM_STATE = 9527

x1_train, x1_val, \
x2_train, x2_val, \
y_train, y_val = train_test_split(
        x1_train, x2_train, y_train, 
        test_size=VALIDATION_RATIO, 
        random_state=RANDOM_STATE
)

In [None]:
print("Training Set")
print("-" * 10)
print(f"x1_train: {x1_train.shape}")
print(f"x2_train: {x2_train.shape}")
print(f"y_train : {y_train.shape}")

print("-" * 10)
print(f"x1_val:   {x1_val.shape}")
print(f"x2_val:   {x2_val.shape}")
print(f"y_val :   {y_val.shape}")
print("-" * 10)
print("Test Set")

In [None]:
for i, seq in enumerate(x1_train[:5]):
    print(f"新聞標題 {i + 1}: ")
    print([tokenizer.index_word.get(idx, '') for idx in seq])
    print()

## Modeling

In [None]:
from models import Simple_LSTM, CNN_1D, GRU_model

model = Simple_LSTM(MAX_NUM_WORDS, NUM_EMBEDDING_DIM, MAX_SEQUENCE_LENGTH, NUM_LSTM_UNITS, NUM_CLASSES)

LEARNING_RATE = 0.0005
BATCH_SIZE = 256
NUM_EPOCHS = 60

checkpoint = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)

lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1),
                               cooldown=0,
                               patience=5,
                               min_lr=0.5e-6)

model.summary()

In [None]:
model.compile(optimizer=Adam(lr=LEARNING_RATE),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
history = model.fit(x=[x1_train, x2_train], 
                    y=y_train,
                    batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    validation_data=([x1_val, x2_val], y_val),
                    callbacks=[checkpoint],
                    verbose=1,
                    shuffle=True)

In [None]:
'''merged_x1 = np.vstack([x1_train, x1_val])
merged_x2 = np.vstack([x2_train, x2_val])
merged_x = np.hstack([merged_x1, merged_x2])

merged_y = np.vstack([y_train, y_val])'''

In [None]:
# merged_x.shape

In [None]:
# merged_y.shape

In [None]:
'''for kfold, (train, valid) in enumerate(KFold(n_splits=3, shuffle=True).split(merged_x, merged_y)):
    # clear the session 
    tf.keras.backend.clear_session()

    print('------------------------------------------')
    print('            Fold ' + str(kfold+1) + ' processing')
    print('------------------------------------------')
    
    # run the model 
    history = model.fit(merged_x[train], merged_y[train], 
                        batch_size=512, 
                        epochs=100, 
                        validation_data=(merged_x[valid], merged_y[valid]), 
                        callbacks=[checkpoint],
                        shuffle=True, 
                        verbose=1)'''

## Submission

In [None]:
SUBMISSION = pd.read_csv('sample_submission.csv', index_col=0)
SUBMISSION.head(3)

In [None]:
# 以下步驟分別對新聞標題 A、B　進行
# 文本斷詞 / Word Segmentation
TEST_DATA['headline_tokenized'] = TEST_DATA.loc[:, 'headline'].apply(remove_between_square_brackets, remove_special_characters)
TEST_DATA['headline_tokenized'] = TEST_DATA['headline_tokenized'].apply(simple_stemmer, remove_stopwords)
TEST_DATA['headline_tokenized'] = TEST_DATA['headline_tokenized'].apply(jieba_tokenizer)

TEST_DATA['short_description_tokenized'] = TEST_DATA.loc[:, 'short_description'].apply(remove_between_square_brackets, remove_special_characters)
TEST_DATA['short_description_tokenized'] = TEST_DATA['short_description_tokenized'].apply(simple_stemmer, remove_stopwords)
TEST_DATA['short_description_tokenized'] = TEST_DATA['short_description_tokenized'].apply(jieba_tokenizer)

# 將詞彙序列轉為索引數字的序列
x1_test = tokenizer.texts_to_sequences(TEST_DATA.headline_tokenized)
x2_test = tokenizer.texts_to_sequences(TEST_DATA.short_description_tokenized)

# 為數字序列加入 zero padding
x1_test = pad_sequences(x1_test, maxlen=MAX_SEQUENCE_LENGTH)
x2_test = pad_sequences(x2_test, maxlen=MAX_SEQUENCE_LENGTH)    

# 利用已訓練的模型做預測
merged_x_test = np.hstack([x1_test, x2_test])
predictions = model.predict(merged_x_test)

In [None]:
index_to_label = {v: k for k, v in label_to_index.items()}

TEST_DATA['category'] = [index_to_label[idx] for idx in np.argmax(predictions, axis=1)]

submission = TEST_DATA.loc[:, ['category']].reset_index()

submission.columns = ['id', 'category']
submission.to_csv('submission.csv', index=False)
submission.head()

在第一次提交結果以後，我們還可以做非常多事情來嘗試改善模型效能：

1. 改變字典詞彙量、序列長度
2. 改變詞向量的維度
3. 嘗試預先訓練的詞向量如 ELMo、GloVe
4. 調整 LSTM 層的輸出維度
5. 使用不同優化器、調整 Learning rate
6. 改變神經網路架構如使用 GRU 層
...
能改善準確度的方式不少，但因為牽涉範圍太廣，請容許我把它們留給你當做回家作業。

走到這裡代表你已經完整地經歷了一個 NLP 專案所需要的大部分步驟。在下一節．讓我們回顧一下在這趟旅程中你所學到的東西。

我們是怎麼走到這裡的
在這趟 NLP 旅程裏頭，我們學會了不少東西。

現在的你應該已經了解：

 - NLP 中常見的數據前處理以及實踐方法
 - 詞向量以及詞嵌入的基本概念
 - 神經網路常見的元件如全連接層、簡單 RNN 以及 LSTM
 - 能讀多個資料來源的孿生神經網路架構
 - 如何用 Keras 建構一個完整的神經網路
 - 深度學習 3 步驟：建模、定義衡量指標以及訓練模型
 - 梯度下降、優化器以及交叉熵等基本概念
 - 如何利用已訓練模型對新數據做預測

In [None]:
import zipfile
with zipfile.ZipFile('submission.zip', mode='w') as zf:
    zf.write("submission.csv", compress_type=zipfile.ZIP_DEFLATED)

In [None]:
#word cloud for positive review words
plt.figure(figsize=(10,10))
WC = WordCloud(width=1000 , height=500 , max_words=500 , min_font_size=5)
positive_words = WC.generate(TRAIN_DATA.to_string())
plt.imshow(positive_words, interpolation='bilinear')
plt.show