# 安裝 & 設定

## 安裝繁簡轉換工具

## 測試繁簡轉換

In [None]:
from opencc import OpenCC
# 's2t' selects OpenCC's Simplified-to-Traditional Chinese conversion.
# `cc` is reused later by tokenize_text.
cc = OpenCC('s2t')
# Smoke test on a Simplified Chinese sentence; the cell displays the converted text.
cc.convert('上海2010上半年四六级考试报名4月8日前完成')

# 建立 BERT Tokenizer

In [None]:
import tensorflow as tf
from transformers import BertTokenizer

In [None]:
# Load the tokenizer that belongs to the 'bert-base-chinese' pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [None]:
# Sanity check: split a Traditional Chinese sample sentence into tokens.
tokenizer.tokenize("它的特性是難以專注、過度活躍、做事不考慮後果等等。除此之外，還有不合年紀的行為")

In [None]:
# Same sentence, with each token mapped to its vocabulary id.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("它的特性是難以專注、過度活躍、做事不考慮後果等等。除此之外，還有不合年紀的行為"))

In [None]:
def tokenize_text(text_input: str) -> list:
    """Convert text to Traditional Chinese and encode it as BERT input ids.

    The text is first passed through the module-level OpenCC converter ``cc``
    and then encoded with the module-level ``tokenizer``.

    Args:
        text_input: Raw input text (Simplified or Traditional Chinese).

    Returns:
        A list of integer token ids, wrapped in the tokenizer's special
        tokens ([CLS] at the start, [SEP] at the end).
    """
    # BERT's sequence-classification head reads the representation of the
    # leading [CLS] token, so the special tokens must be part of the input;
    # tokenize() + convert_tokens_to_ids() alone does not add them.
    return tokenizer.encode(cc.convert(text_input), add_special_tokens=True)

# 讀取Data 

In [None]:
import pandas as pd

In [None]:
# Each line of the file holds five fields separated by the literal "_!_";
# only the 'label' and 'text' columns are used later.
# A separator longer than one character is interpreted by pandas as a regular
# expression, which only the Python parsing engine supports. Requesting that
# engine explicitly avoids the ParserWarning raised by the silent fallback.
df_raw = pd.read_csv(
    'toutiao_cat_data.txt',
    sep='_!_',
    header=None,
    names=['1', '2', 'label', 'text', '3'],
    engine='python',
)

In [None]:
# Keep only the two columns used downstream: the text and its label.
dataset = df_raw[['text','label']]

In [None]:
# True if any cell in the selected columns is missing.
dataset.isnull().values.any()

In [None]:
# (rows, columns) of the selected data.
dataset.shape

In [None]:
# Rows with a missing text or label cannot be tokenized or encoded, so drop
# them instead of only inspecting isnull().
dataset = dataset.dropna(subset=['text', 'label'])
# Work on a random subset of at most 10,000 rows. The cap prevents sample()
# from raising when fewer rows are available, and the fixed seed makes the
# indexed spot checks and the training run reproducible.
dataset = dataset.sample(n=min(10000, len(dataset)), random_state=42)
text = dataset['text'].values
label = dataset['label'].values

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder that maps each distinct label value to an integer class id.
le = LabelEncoder()

In [None]:
# Learn the set of label values present in the sampled rows.
le.fit(label)

In [None]:
# Integer class id for every sampled row, in the same order as `text` and `label`.
y = le.transform(label)

In [None]:
# Spot-check a single example (index 10): the raw text ...
text[10]

In [None]:
# ... its token ids after conversion and tokenization ...
tokenize_text(text[10])

In [None]:
# ... its original label value ...
label[10]

In [None]:
# ... and the integer class id the encoder assigned to that label.
y[10]

In [None]:
# Token-id list for every sampled text, in the same order as `text`.
tokenized_text = list(map(tokenize_text, text))

In [None]:
# Number of tokenized examples; one entry is produced per element of `text`.
len(tokenized_text)

In [None]:
# One [token_ids, class_id, length] record per example; the length is kept
# so the records can be ordered by sequence length afterwards.
text_label_len = [
    [token_ids, y[idx], len(token_ids)]
    for idx, token_ids in enumerate(tokenized_text)
]

In [None]:
import random
# Shuffle in place before the sort by length in the following cell: list.sort
# is stable, so records of equal length keep this random relative order.
random.shuffle(text_label_len)

In [None]:
# Order the records in place from shortest to longest token sequence
# (index 2 of each record holds the length).
text_label_len.sort(key=lambda record: record[2])

In [None]:
# Drop the length field, keeping (token_ids, class_id) pairs in sorted order.
sorted_text_label = [
    (token_ids, class_id) for token_ids, class_id, _ in text_label_len
]

# 建立TFDataset

In [None]:
# Expose the (token_ids, class_id) pairs as a tf.data.Dataset; both components
# are declared as int32.
# NOTE(review): presumably from_generator is used because the id lists have
# different lengths and cannot be stacked into one tensor — confirm.
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_text_label, output_types=(tf.int32, tf.int32))

In [None]:
BATCH_SIZE = 128
max_len = 32
# padded_batch can only pad up to the requested shape; it raises an error for
# any sequence longer than max_len. Truncate first so every example fits.
truncated_dataset = processed_dataset.map(
    lambda token_ids, class_id: (token_ids[:max_len], class_id)
)
# Pad every id sequence to exactly max_len (padding value defaults to 0);
# labels are scalars and need no padding.
# NOTE(review): no attention mask is produced here, so the model will also
# attend to the padded positions — consider supplying one.
batched_dataset = truncated_dataset.padded_batch(
    BATCH_SIZE, padded_shapes=((max_len,), ())
)

In [None]:
# Pull the first batch to inspect the padded ids and their labels.
next(iter(batched_dataset))

## 切分訓練集與測試集

In [None]:
import math

# Number of batches produced by padded_batch (the last one may be partial).
TOTAL_BATCHES = math.ceil(len(sorted_text_label) / BATCH_SIZE)
# Hold out roughly 10% of the batches for testing.
TEST_BATCHES = TOTAL_BATCHES // 10
# Dataset.shuffle returns a NEW dataset, so the result must be kept; calling it
# without assignment leaves the batches in length-sorted order and the test
# split would contain only the shortest texts.
# reshuffle_each_iteration=False (with a fixed seed) keeps the batch order
# identical on every pass, so take() and skip() always select disjoint
# batches and no test batch leaks into training in later epochs.
shuffled_batches = batched_dataset.shuffle(
    TOTAL_BATCHES, seed=42, reshuffle_each_iteration=False
)
test_data = shuffled_batches.take(TEST_BATCHES)
train_data = shuffled_batches.skip(TEST_BATCHES)

In [None]:
from transformers import TFBertForSequenceClassification

In [None]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-chinese')

In [None]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
# Fine-tune the model on the training batches for 5 epochs.
model.fit(train_data, epochs=5)

In [None]:
# Report loss and metrics on the held-out test batches.
model.evaluate(test_data)