# 下載訓練資料(這邊使用 wiki Data)
## 主要以 pages-articles.xml.bz2 結尾之檔案類型
* 維基資料集: https://zh.wikipedia.org/wiki/Wikipedia:%E6%95%B0%E6%8D%AE%E5%BA%93%E4%B8%8B%E8%BD%BD
* 這邊以 [zhwiki-latest-pages-meta-current.xml.bz2](https://dumps.wikimedia.org/zhwiki/latest/) 為訓練集。

# 將 Wiki 資料轉為 txt(請先下載 zhwiki-latest-pages-meta-current.xml.bz2 至同一個目錄下)

In [1]:
# -*- coding: utf-8 -*-
__author__ = "ALEX-CHUN-YU (P76064538@mail.ncku.edu.tw)"
import logging
import sys
import warnings
warnings.filterwarnings(action ='ignore', category = UserWarning, module = 'gensim')
from gensim.corpora import WikiCorpus

# After the wiki dump has been downloaded, convert it from XML to plain text.
class Wiki_to_txt(object):
    """Convert a Wikipedia XML dump into a plain-text corpus file.

    Every article yielded by gensim's ``WikiCorpus`` is written to
    ``wiki_text.txt`` (in the current working directory) as a single line of
    space-separated tokens.
    """

    def __init__(self):
        # Configure the root logger with a StreamHandler using this format so
        # that the progress messages below are visible.
        logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level = logging.INFO)

    # Usage reference: https://radimrehurek.com/gensim/corpora/wikicorpus.html
    def set_wiki_to_txt(self, wiki_data_path = None):
        """Write the articles of a wiki dump to ``wiki_text.txt``.

        Args:
            wiki_data_path: Path to the ``.xml.bz2`` dump. When ``None``, the
                path is taken from the single command-line argument
                (``sys.argv[1]``).

        Raises:
            SystemExit: If no path is given and the command line does not
                contain exactly one argument. A usage message is printed and
                the process exits with status 1.
        """
        if wiki_data_path is None:
            # Fall back to the command-line argument.
            if len(sys.argv) != 2:
                print("Please Usage: python3 " + sys.argv[0] + " wiki_data_path")
                # sys.exit is always available; the bare exit() helper is only
                # injected by the site module for interactive use.
                sys.exit(1)
            wiki_data_path = sys.argv[1]

        # An explicit (empty) dictionary is supplied, as in the original code;
        # presumably this keeps gensim from scanning the dump to build a
        # vocabulary first -- confirm against the WikiCorpus documentation.
        wiki_corpus = WikiCorpus(wiki_data_path, dictionary = {})

        # wiki.xml -> wiki.txt, one article per line.
        with open("wiki_text.txt", 'w', encoding = 'utf-8') as output:
            for text_count, text in enumerate(wiki_corpus.get_texts(), start = 1):
                output.write(' '.join(text) + '\n')
                if text_count % 10000 == 0:
                    # Lazy %-args: the message is only formatted if emitted.
                    logging.info("目前已處理 %d 篇文章", text_count)
        print("轉檔完畢!")
            
if __name__ == "__main__":
    wiki_to_txt = Wiki_to_txt()
    # Convert the wiki XML dump into a plain-text file
    wiki_to_txt.set_wiki_to_txt("zhwiki-latest-pages-meta-current.xml.bz2")


ModuleNotFoundError: No module named 'gensim'

# 簡體轉繁體 -> 進行斷詞 -> 過濾[停用詞](https://github.com/Alex-CHUN-YU/Word2vec/blob/master/stopwords.txt)

In [None]:
# -*- coding: utf-8 -*-
__author__ = "ALEX-CHUN-YU (P76064538@mail.ncku.edu.tw)"
import jieba
import logging
from hanziconv import HanziConv

# Word segmentation with stop-word filtering.
class Segmentation(object):
    """Convert the corpus to Traditional Chinese, segment it, and drop stop words.

    All file names are fixed and relative to the current working directory:
    ``stopwords.txt`` and ``wiki_text.txt`` are read; ``traditional.txt`` and
    ``segmentation.txt`` are written.
    """

    def __init__(self):
        # Configure the root logger with a StreamHandler using this format.
        logging.basicConfig(format = "%(asctime)s : %(levelname)s : %(message)s", level = logging.INFO)
        # Stop words loaded by set_stopword(); empty until that is called.
        self.stopwordset = set()

    def set_stopword(self):
        """Load ``stopwords.txt`` (one stop word per line) into ``self.stopwordset``.

        Only newline characters are stripped from each line, so a stop word
        consisting of other whitespace (e.g. a single space) is preserved.
        """
        with open("stopwords.txt", "r", encoding = "utf-8") as stopwords:
            self.stopwordset.update(line.strip('\n') for line in stopwords)
        print("StopWord Set 已儲存!")

    def simplified_to_traditional(self):
        """Convert ``wiki_text.txt`` line by line and write ``traditional.txt``.

        Each line is passed through ``HanziConv.toTraditional``.
        """
        logging.info("等待中..(簡 to 繁)")
        # Both files are managed by the with-statement so they are closed even
        # if the conversion raises; the input is opened first so a missing
        # input does not leave behind a truncated output file.
        with open("wiki_text.txt", "r", encoding = "utf-8") as simplified, \
                open("traditional.txt", "w", encoding = "utf-8") as traditional:
            for line in simplified:
                traditional.write(HanziConv.toTraditional(line))
        print("成功簡體轉繁體!")

    def segmentation(self):
        """Segment ``traditional.txt`` with jieba and write ``segmentation.txt``.

        Terms found in ``self.stopwordset`` are skipped. Every kept term is
        written followed by a single space; line breaks from the input are not
        reproduced, so the output is one long space-separated token stream.
        """
        logging.info("等待中..(jieba 斷詞，並過濾停用詞)")
        # Context managers guarantee both handles are closed on error.
        with open("traditional.txt", "r", encoding = "utf-8") as corpus, \
                open("segmentation.txt", "w", encoding = "utf-8") as output:
            for sentence in corpus:
                # cut_all = False is passed through to jieba unchanged.
                for term in jieba.cut(sentence.strip("\n"), cut_all = False):
                    if term not in self.stopwordset:
                        output.write(term + " ")
        print("jieba 斷詞完畢，並已完成過濾停用詞!")

if __name__ == "__main__":
    segmentation = Segmentation()
    # Load the stop-word list
    segmentation.set_stopword()
    # Convert the data from Simplified to Traditional Chinese
    segmentation.simplified_to_traditional()
    # Segment with jieba while filtering out stop words
    segmentation.segmentation()

: 

# 透過 Gensim Word2Vec 來進行訓練(這邊使用 Skip-gram 模型，Dimension 設為 300 維度)

In [None]:
# -*- coding: utf-8 -*-
__author__ = "ALEX-CHUN-YU (P76064538@mail.ncku.edu.tw)"
import warnings
warnings.filterwarnings(action = 'ignore', category = UserWarning, module = 'gensim')
from gensim.models import word2vec

# Train a model with gensim and save it for later use.
class Train(object):
    """Train skip-gram word vectors on ``segmentation.txt`` and save them."""

    def __init__(self):
        pass

    # See https://radimrehurek.com/gensim/models/word2vec.html for more options.
    def train(self):
        """Train a 300-dimensional skip-gram model and save its vectors.

        Reads ``segmentation.txt`` through ``word2vec.Text8Corpus`` and writes
        the vectors in binary word2vec format to ``wiki300.model.bin``. Both
        paths are relative to the current working directory.
        """
        import inspect

        print("訓練中...(喝個咖啡吧^0^)")
        # Load the segmented corpus.
        sentences = word2vec.Text8Corpus("segmentation.txt")

        # gensim 4 renamed the dimensionality keyword from `size` to
        # `vector_size`; choose whichever the installed version accepts so the
        # call does not fail with a TypeError on either release line.
        init_params = inspect.signature(word2vec.Word2Vec.__init__).parameters
        dim_keyword = "vector_size" if "vector_size" in init_params else "size"

        # sg = 1 selects skip-gram; the vector dimension is 300.
        model = word2vec.Word2Vec(
            sentences,
            window = 10,
            min_count = 5,
            workers = 4,
            sg = 1,
            **{dim_keyword: 300}
        )

        # Save only the word vectors, in binary word2vec format.
        model.wv.save_word2vec_format("wiki300.model.bin", binary = True)
        print("model 已儲存完畢")

if __name__ == "__main__":
    t = Train()
    # Train the model (shallow semantic space)
    t.train()


: 

# 將訓練好的 Model 進行測試

In [None]:
# -*- coding: utf-8 -*-
__author__ = "ALEX-CHUN-YU (P76064538@mail.ncku.edu.tw)"
import warnings
warnings.filterwarnings(action = 'ignore', category = UserWarning, module = 'gensim')
from gensim.models.keyedvectors import KeyedVectors

# Load the trained model and query it.
def main():
    """Load ``wiki300.model.bin`` and print a few similarity queries.

    Prints the 10 nearest neighbours of '爸爸', the similarity between '爸爸'
    and '媽媽', and the top 5 results of the analogy query
    positive=['爸爸', '老公'], negative=['媽媽'].
    """
    # See https://radimrehurek.com/gensim/models/word2vec.html for more usage.
    word_vectors = KeyedVectors.load_word2vec_format("wiki300.model.bin", binary = True)

    print("'爸爸'前10名相似:")
    # Query the KeyedVectors object directly, like the calls below: `.wv` on
    # KeyedVectors was only a deprecated self-alias and is absent in gensim 4.
    res = word_vectors.most_similar('爸爸', topn = 10)
    for item in res:
        print(item[0] + "," + str(item[1]))

    print("\n'爸爸','媽媽'之間相似度:")
    res = word_vectors.similarity('爸爸', '媽媽')
    print(res)

    print("\n'爸爸'之於'老公',如'媽媽'之於'老婆':")
    res = word_vectors.most_similar(positive = ['爸爸', '老公'], negative = ['媽媽'], topn = 5)
    for item in res:
        print(item[0] + "," + str(item[1]))

# Run the demo queries only when this file is executed as a script.
if __name__ == "__main__":
    main()


: 