# 安裝 & 設定

## 下載 BERT 的中文模型

## 安裝繁簡轉換工具

## 測試繁簡轉換

In [1]:
from opencc import OpenCC
# Build a converter using the 's2t' configuration; the echoed result below
# shows Simplified characters coming back in their Traditional form.
cc = OpenCC('s2t')
# Smoke test: as the last expression of the cell, the converted string is echoed.
cc.convert('上海2010上半年四六级考试报名4月8日前完成')

'上海2010上半年四六級考試報名4月8日前完成'

# 建立 BERT Tokenizer

In [2]:
import tensorflow as tf
from transformers import BertTokenizer

2021-10-29 17:01:35.484755: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-29 17:01:35.484795: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
# Load the tokenizer published under the name 'bert-base-chinese'.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [4]:
# Tokenize a Traditional Chinese sentence; the echoed list below contains one
# token per character or punctuation mark.
tokenizer.tokenize("它的特性是難以專注、過度活躍、做事不考慮後果等等。除此之外，還有不合年紀的行為")

['它',
 '的',
 '特',
 '性',
 '是',
 '難',
 '以',
 '專',
 '注',
 '、',
 '過',
 '度',
 '活',
 '躍',
 '、',
 '做',
 '事',
 '不',
 '考',
 '慮',
 '後',
 '果',
 '等',
 '等',
 '。',
 '除',
 '此',
 '之',
 '外',
 '，',
 '還',
 '有',
 '不',
 '合',
 '年',
 '紀',
 '的',
 '行',
 '為']

In [5]:
# Same sentence, this time mapping every token to its integer vocabulary id.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("它的特性是難以專注、過度活躍、做事不考慮後果等等。除此之外，還有不合年紀的行為"))

[2124,
 4638,
 4294,
 2595,
 3221,
 7432,
 809,
 2201,
 3800,
 510,
 6882,
 2428,
 3833,
 6713,
 510,
 976,
 752,
 679,
 5440,
 2719,
 2527,
 3362,
 5023,
 5023,
 511,
 7370,
 3634,
 722,
 1912,
 8024,
 6917,
 3300,
 679,
 1394,
 2399,
 5145,
 4638,
 6121,
 4158]

In [6]:
def tokenize_text(text_input: str):
    """Turn raw text into a list of BERT vocabulary ids.

    The text is first run through the notebook-level OpenCC converter ``cc``
    (configured with 's2t'), then split into tokens and mapped to ids with
    the notebook-level ``tokenizer``.

    NOTE(review): this path only tokenizes and looks up ids; it does not add
    special tokens such as [CLS]/[SEP] -- confirm that is intended.
    """
    converted = cc.convert(text_input)
    tokens = tokenizer.tokenize(converted)
    return tokenizer.convert_tokens_to_ids(tokens)

# 讀取資料

In [7]:
import pandas as pd

In [8]:
# Load the raw corpus: one record per line, fields separated by '_!_', no
# header row. Only the 'label' and 'text' columns are used later; the other
# three get placeholder names.
# A separator longer than one character is treated by pandas as a regular
# expression, which only the Python parsing engine supports. Selecting that
# engine explicitly avoids the ParserWarning emitted when pandas has to fall
# back from the default C engine.
df_raw = pd.read_csv(
    'toutiao_cat_data.txt',
    sep='_!_',
    header=None,
    names=['1', '2', 'label', 'text', '3'],
    engine='python',
)

  return func(*args, **kwargs)


In [9]:
# Keep only the two columns the classifier needs: the text and its label.
dataset = df_raw[['text','label']]

In [10]:
# Sanity check: evaluates to True if any cell in either column is null.
dataset.isnull().values.any()

False

In [11]:
# Echo (row count, column count) of the full data set.
dataset.shape

(382688, 2)

In [12]:
# Work on a random subset of 10000 rows, then pull each column out as an array.
# No seed is passed, so the subset differs from run to run.
dataset = dataset.sample(n=10000)
label = dataset['label'].values
text = dataset['text'].values

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
# Encoder used to turn label strings into integer class ids.
le = LabelEncoder()

In [15]:
# Fit the encoder on the label strings of the sampled rows.
le.fit(label)

LabelEncoder()

In [16]:
# Integer class id for every sampled row, in the same order as `label`.
y = le.transform(label)

In [17]:
# Peek at one raw sample.
text[10]

'索天科技2017年营收7519万元 净赚1363万元'

In [18]:
# The same sample after conversion and tokenization into vocabulary ids.
tokenize_text(text[10])

[5164,
 1921,
 4906,
 2825,
 8109,
 2399,
 4245,
 3119,
 8273,
 8818,
 5857,
 1039,
 3912,
 6553,
 9015,
 8152,
 5857,
 1039]

In [19]:
# Label string of that sample.
label[10]

'news_finance'

In [20]:
# Encoded integer class id of that sample.
y[10]

5

In [21]:
# Convert every sampled text to its list of vocabulary ids (order preserved).
tokenized_text = list(map(tokenize_text, text))

In [22]:
# Number of tokenized samples.
len(tokenized_text)

10000

In [23]:
# One [token ids, class id, token count] entry per sample; the count is kept
# so the entries can be ordered by sequence length afterwards.
text_label_len = [
    [token_ids, class_id, len(token_ids)]
    for token_ids, class_id in zip(tokenized_text, y)
]

In [24]:
import random
# Shuffle in place before the length sort in the next cell: list.sort is
# stable, so this randomises the order among samples of equal length.
random.shuffle(text_label_len)

In [25]:
# Order the entries in place by their third field, the token count.
text_label_len.sort(key=lambda x: x[2])

In [26]:
# Drop the length field now that sorting is done: keep (token ids, class id).
sorted_text_label = [
    (token_ids, class_id) for token_ids, class_id, _ in text_label_len
]

# 建立 tf.data.Dataset

In [27]:
# Wrap the Python list in a tf.data pipeline. Each element is a pair:
# a variable-length vector of int32 token ids and a scalar int32 class id.
# `output_signature` replaces the deprecated `output_types` argument and
# additionally pins the rank of each component.
processed_dataset = tf.data.Dataset.from_generator(
    lambda: sorted_text_label,
    output_signature=(
        tf.TensorSpec(shape=(None,), dtype=tf.int32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
    ),
)

In [28]:
# Batching parameters: samples per batch and the fixed width every id vector
# is padded to (labels are scalars and need no padding).
BATCH_SIZE = 128
max_len = 32
# NOTE(review): padding to a fixed width presumably fails for a sample with
# more than max_len tokens -- confirm no sampled text exceeds it.
batched_dataset = processed_dataset.padded_batch(
    batch_size=BATCH_SIZE,
    padded_shapes=((max_len,), ()),
)

In [29]:
# Inspect the first batch: a (token ids, class ids) pair of tensors.
next(iter(batched_dataset))

(<tf.Tensor: shape=(128, 32), dtype=int32, numpy=
 array([[2961, 7030,  100, ...,    0,    0,    0],
        [5847, 6965, 2119, ...,    0,    0,    0],
        [3984, 2336, 1920, ...,    0,    0,    0],
        ...,
        [3582, 1690,  782, ...,    0,    0,    0],
        [6306, 5543, 2802, ...,    0,    0,    0],
        [1066, 4089, 1927, ...,    0,    0,    0]], dtype=int32)>,
 <tf.Tensor: shape=(128,), dtype=int32, numpy=
 array([ 1,  3,  3,  9,  2,  2, 12,  5, 13, 11,  2,  2,  3, 12,  2,  6, 12,
         2,  2, 12,  3,  1, 13, 11, 12,  1, 12,  5, 11,  1, 10,  3,  5, 12,
         8,  0,  6,  5,  3, 12,  2,  0,  8,  9,  2, 11,  7,  7,  3,  8,  2,
         8, 11,  8,  3, 13, 13,  3,  0,  2,  2,  1, 13,  3, 13, 12,  0,  4,
        11, 13,  3, 12,  3, 13,  0,  0,  6, 13,  0,  3,  1, 11,  1, 12, 11,
         3,  1, 12,  0,  0, 11,  0,  3, 11, 11, 11,  9,  2,  2,  5,  2,  2,
         2,  6,  4,  3, 11, 11,  1, 11,  8,  2,  2, 12,  5,  3, 11,  8,  5,
         3, 11, 13,  4, 11, 11, 11, 

## 製作訓練集與測試集

In [30]:
import math

# Number of batches the pipeline yields (the last one may be partial).
TOTAL_BATCHES = math.ceil(len(sorted_text_label) / BATCH_SIZE)
# Hold out one tenth of the batches for evaluation.
TEST_BATCHES = TOTAL_BATCHES // 10
# Dataset transformations return a new dataset instead of modifying the
# receiver, so the shuffled result must be kept. Without it the batches stay
# in length-sorted order and the held-out split would consist solely of the
# shortest samples.
# reshuffle_each_iteration=False freezes the shuffled order; otherwise every
# pass would reshuffle and take()/skip() would no longer select disjoint
# batches, leaking test batches into training.
shuffled_batches = batched_dataset.shuffle(
    TOTAL_BATCHES, reshuffle_each_iteration=False
)
test_data = shuffled_batches.take(TEST_BATCHES)
train_data = shuffled_batches.skip(TEST_BATCHES)

In [31]:
from transformers import TFBertForSequenceClassification

In [32]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-chinese')

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
# Train on the training batches for 5 epochs.
model.fit(train_data, epochs=5)

Epoch 1/5
      1/Unknown - 47s 47s/step - loss: 55.3801 - accuracy: 0.0547

In [None]:
# Report loss and accuracy on the held-out batches.
model.evaluate(test_data)