# Data Preprocessing

對新聞與討論區的文章進行資料前處理，包含斷詞、去除標點符號與停用詞、計算 TF-IDF、特徵選取等等，以利後續的模型訓練。

In [12]:
# imports
from ckiptagger import data_utils, construct_dictionary, WS, POS, NER
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from scipy import sparse

## 載入 CKIP Tagger 的相關函數

若要使用 CKIP Tagger 進行斷詞，需要先安裝 CKIP Tagger 的 Python 套件，並將模型檔下載到 `./data_ckip`，之後才能載入斷詞（WS）、詞性標注（POS）與命名實體識別（NER）模型。

請參考 https://github.com/ckiplab/ckiptagger 的說明，安裝 CKIP Tagger 的 Python 套件。


In [5]:
# Load the CKIP Tagger models used by the rest of the notebook.
# All three are read from the same local model directory.
CKIP_MODEL_DIR = "./data_ckip"
ws = WS(CKIP_MODEL_DIR)    # word segmentation
pos = POS(CKIP_MODEL_DIR)  # part-of-speech tagging
ner = NER(CKIP_MODEL_DIR)  # named-entity recognition

  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)
2024-04-14 14:53:17.514234: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled
  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)
  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)


## 定義斷詞與前處理函數，並載入資料集

In [15]:
# Tokens discarded after segmentation. They are kept in one frozenset so each
# token needs a single O(1) lookup instead of scans over three separate lists.
_DROPPED_TOKENS = frozenset([
    # Full-width (CJK) punctuation.
    '，', '。', '、', '：', '；', '？', '！', '「', '」', '（', '）', '『', '』',
    '—', '－', '～', '…', '‧', '《', '》', '〈', '〉', '﹏﹏',
    # ASCII / typographic punctuation and the blank.
    ',', '.', ':', ';', '?', '!', '(', ')', '[', ']', '&', '@', '#', '$', '%',
    '-', '_', '*', '/', '\\', '+', '=', '>', '<', '"', "'", '’', '‘', '“', '”',
    ' ',
    # Stop words.
    '全文', '日', '月', '年', 'br', '中央社', '公司', '上午', '下午', '日期',
])


def word_segmentation(contents, ws):
    """Segment each text into words and drop noise tokens.

    Args:
        contents: List of raw text strings, one per document.
        ws: Callable segmenter (the notebook passes a CKIP Tagger ``WS``
            instance). It is called as ``ws(contents,
            sentence_segmentation=True, segment_delimiter_set=...)`` and must
            return one list of tokens per input text.

    Returns:
        A list with one token list per input text. A token is removed when it
        contains any digit character, or when it exactly equals one of the
        punctuation marks or stop words in ``_DROPPED_TOKENS``.
    """
    segmented = ws(
        contents,
        sentence_segmentation=True,
        segment_delimiter_set={'?', '？', '!', '！', '。', ',', '，', ';', ':', '、', ' ', '.'},
    )

    # Single pass: the filters are independent per-token checks, so applying
    # them together yields the same result as applying them one after another.
    return [
        [
            token
            for token in tokens
            if token not in _DROPPED_TOKENS
            and not any(char.isdigit() for char in token)
        ]
        for tokens in segmented
    ]


In [14]:
def preprocessing(df, days, ws, k=1000):
    """Turn labelled articles into a sparse feature matrix and a label list.

    Steps: drop rows whose label is -1, segment the ``content`` column,
    compute 1- to 3-gram TF-IDF features, keep the ``k`` best features by
    chi-squared score, then append the three institutional-surplus columns.

    Args:
        df: DataFrame with the columns ``content``, ``label_day<days>``,
            ``foreign_investor_surplus``, ``investment_trust_surplus`` and
            ``dealer_surplus``.
        days: Selects the label column, ``'label_day' + str(days)``.
        ws: Segmenter passed through to ``word_segmentation``.
        k: Maximum number of TF-IDF features to keep (default 1000). It is
            clamped to the vocabulary size so small corpora do not make
            ``SelectKBest`` fail or warn.

    Returns:
        ``(X, Y)``: ``X`` is a SciPy sparse matrix with one row per kept
        article and ``min(k, vocabulary size) + 3`` columns; ``Y`` is the
        list of labels for the same rows.
    """
    label_column = 'label_day' + str(days)

    # Rows whose label equals -1 are excluded from the data set.
    df = df[df[label_column] != -1]
    labels = df[label_column].tolist()
    contents = df['content'].tolist()

    # Segment the article bodies into words.
    word_sentence_list = word_segmentation(contents, ws)

    # TF-IDF over 1-grams to 3-grams of the space-joined tokens.
    # NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens
    # of two or more word characters, so single-character words from the
    # segmenter never become features -- confirm this is intended.
    tv = TfidfVectorizer(ngram_range=(1, 3))
    tfidf = tv.fit_transform([' '.join(sentence) for sentence in word_sentence_list])

    # Keep the k highest-scoring features, never asking for more than exist.
    # NOTE(review): the selector is fitted on every row passed in; if a
    # train/test split happens later, this uses test labels -- verify.
    ch2 = SelectKBest(chi2, k=min(k, tfidf.shape[1]))
    X = ch2.fit_transform(tfidf, labels)

    # Append foreign_investor_surplus, investment_trust_surplus and
    # dealer_surplus as three extra columns.
    X = sparse.hstack((X, sparse.csr_matrix(df[['foreign_investor_surplus', 'investment_trust_surplus', 'dealer_surplus']])))
    return X, labels

In [16]:
# Read the news articles and build features/labels from the label_day1 column.
NEWS_CSV_PATH = '../data/news_filtered_merged.csv'
df_news = pd.read_csv(NEWS_CSV_PATH)
X, Y = preprocessing(df_news, days=1, ws=ws)

# First line: (number of articles, number of features); second: label count.
print(X.shape, len(Y), sep='\n')

(4890, 1003)
4890


In [18]:
# Persist the feature matrix and the labels as SciPy .npz files.
sparse.save_npz('../data/X.npz', X)

# save_npz only accepts sparse matrices, so the label list is wrapped in a
# CSR matrix before being written.
label_matrix = sparse.csr_matrix(Y)
sparse.save_npz('../data/Y.npz', label_matrix)