In [19]:
from huggingface_hub import notebook_login

import sqlite3
import pandas as pd
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification,Trainer 
from sklearn.preprocessing import LabelEncoder
from transformers import TrainingArguments

import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

from sqlalchemy import create_engine
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy import text
import pandas as pd

from config import mysql_config  as dbconfig

from sklearn.preprocessing import LabelEncoder

from transformers import AutoModelForSequenceClassification
from shutil import ignore_patterns

from sklearn.model_selection import train_test_split

import evaluate

import numpy as np




In [20]:
class TokenReader:
    def __init__(self, file_path):
        self.file_path = file_path
        self.tokens = {}
        self.read_tokens()

    def read_tokens(self):
        try:
            with open(self.file_path, 'r') as file:
                for line in file:
                    if '=' in line:
                        key, value = line.strip().split('=')
                        self.tokens[key.strip()] = value.strip().strip("'").strip('"')
        except FileNotFoundError:
            print(f"Error: File '{self.file_path}' not found.")
            self.tokens = {}

    def get_token_value(self, token_name):
        return self.tokens.get(token_name, None)

In [21]:

file_path = 'huggingface_token.txt'
reader = TokenReader(file_path)

HF_token_value = reader.get_token_value('HF_token')
print("Token:", HF_token_value)


notebook_login()

Error: File 'huggingface_token.txt' not found.
Token: None


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
class PTTDataset(Dataset):
    def __init__(self, tokenizer, df) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        self.df = df

    def __getitem__(self, index):

        text = self.df.iloc[index]['title']
        
        # print("Dataset O label = {}".format(label))
        token = self.tokenizer(text, padding=True, truncation=True, max_length=32)
        # print(token)
        input_ids = token['input_ids']
        token_type_ids = token['token_type_ids']
        

        attention_mask = token['attention_mask']

        label = self.df.iloc[index]['label']
        label = torch.LongTensor([label])

        # print("Dataset label = {}".format(label))

        return (input_ids,token_type_ids, attention_mask, label)

    def __len__(self):
        return len(self.df)

In [23]:
def create_batch(datas):
    # print(datas)
    input_ids = [torch.Tensor(i[0]) for i in datas]
    token_ids = [torch.Tensor(i[1]) for i in datas]
    attention_mask = [torch.Tensor(i[2]) for i in datas]

    if datas[0][3] is not None:
        labels = torch.stack([i[3] for i in datas])
    else:
        labels = None

    input_ids_tensors = pad_sequence(input_ids, batch_first=True)
    token_ids_tensors = pad_sequence(token_ids, batch_first=True)
    attention_mask_tensors = pad_sequence(attention_mask, batch_first=True)

    input_ids_tensors = input_ids_tensors.to(torch.long)
    token_ids_tensors = token_ids_tensors.to(torch.long)
    attention_mask_tensors = attention_mask_tensors.to(torch.long)

    res = {
        "input_ids": input_ids_tensors,
        "token_type_ids": token_ids_tensors,
        "attention_mask": attention_mask_tensors,
        "labels": labels
    }

    return res

In [84]:


user = dbconfig['user']
password = dbconfig['password']
host = dbconfig['host']
port = dbconfig['port']
database = dbconfig['database']
database_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"

class DBConnecter:
    def __init__(self, database_url):
        """
        mysql+pymysql://username:password@host:port/database
        """
        self.database_url = database_url
        try:
            self.engine = create_engine(database_url)
            print("Connect to database successfully")
        except SQLAlchemyError as e:
            print(f"Fail to create database connecter engine to DB：{e}")

    def connection_info(self):
        """
        測試數據庫連接。
        """
        try:
            with self.engine.connect() as conn:
                version = conn.execute(text("SELECT VERSION();"))
                print(f"Connect to DB successfully, DB version：{version.fetchone()[0]}")
        except SQLAlchemyError as e:
            print(f"Fail to connect to DB：{e}")

    def get_insert_row_query(self, table, columns):
        column_str = ", ".join(columns)
        placeholder_str = ", ".join([f":{col}" for col in columns])
        add_new_row_query = f"INSERT INTO {table}({column_str}) VALUES({placeholder_str})"
        return add_new_row_query

    def run_no_return_query(self, query, values):
        try:
            with self.engine.connect() as conn:
                # 確保 values 是一個字典
                query = text(query)
                conn.execute(query, values)
                conn.commit() 
        except SQLAlchemyError as e:
            print(f"操作失敗：{e}")

    def run_query(self,sql):

        sql = text(sql)
        try:
            with self.engine.connect() as conn:
                query_result = pd.read_sql(sql, conn)
                return query_result # 返回查詢結果的列表
        except SQLAlchemyError as e:
            print(f"數據獲取失敗：{e}")
            return None


In [25]:
def data_transform(df):
    # '\n'
    for i in range(len(df)):
        df['title'][i] = df['title'][i].replace('\n','')

    # 公告
    for i in range(len(df)):
        if ("[公告]" in df['title'][i]):
            # print(df['title'][i])
            df = df.drop(i)
    df = df.reset_index(drop=True)
    return df

In [26]:

def label_encoder(df):

    label_encoder = LabelEncoder()
    label_encoder.fit(df['board'].unique())

    label_class = list(label_encoder.classes_)
    label_id = label_encoder.transform(label_class)

    label2id_ = {label_: int(id_) for label_, id_ in zip(label_class, label_id)}
    id2label_ = {int(id_): label_ for label_, id_ in zip(label_class, label_id)}

    return label2id_,id2label_

In [27]:
def model_config(pretain_model,df,label2id_,id2label_):


    tokenizer = AutoTokenizer.from_pretrained(pretain_model)
    model = AutoModelForSequenceClassification.from_pretrained(
            pretain_model,
            num_labels = len(df['label'].unique()),
            label2id = label2id_,
            id2label = id2label_,
            ignore_mismatched_sizes=True
        )
    print(model.config.num_labels)

    return tokenizer,model


In [28]:
def train_test_split_process(tokenizer,df):
    

    samll_df = df.sample(n=1024, random_state=42)
    train_df, eval_df = train_test_split(samll_df, test_size=0.5, random_state=42)

    train_set = PTTDataset(tokenizer, train_df)
    eval_set = PTTDataset(tokenizer,eval_df)

    return train_set, eval_set


In [29]:

def compute_metrics(eval_pred):
    accuracy = evaluate.load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


In [30]:
%time
def training_and_save_model_processing(model,train_arg_set,train_set,eval_set,tokenizer):
    
    if torch.cuda.is_available():
        print("GPU is available and supports CUDA.")
        model = model.to("cuda")
        print('CUDA is available and can be used by',torch.cuda.device_count(),'device')
        print('Current_device number:',torch.cuda.current_device()) #should be zero
        print(torch.cuda.device(0))
        print("PyTorch choose the GPU (what current_device number's meaning):",torch.cuda.get_device_name(0))
    else:
        print("GPU is not available or does not support CUDA.")

    


    training_args = TrainingArguments(
        output_dir=train_arg_set['output_dir'],
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=train_arg_set['per_device_train_batch_size'],
        per_device_eval_batch_size=train_arg_set['per_device_eval_batch_size'],
        num_train_epochs=train_arg_set['num_train_epochs'],
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=True,
        )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=eval_set,
        tokenizer=tokenizer,
        data_collator=create_batch,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Save locally
    # trainer.model.save_pretrained('model/10epoch_PTT_classifier_bert-base-mengzi_model')
    # Push To HF
    trainer.push_to_hub()

CPU times: total: 0 ns
Wall time: 0 ns


In [86]:


if __name__ == "__main__":
    
    db_connector = DBConnecter(database_url)
    data = db_connector.run_query(f"""SELECT board,title FROM nlp.PTT""")
    display(data)

    df = data_transform(data)

    sql_create_ptt_table_query = f"""CREATE TABLE IF NOT EXISTS PTT_dealed (
                                    board text NOT NULL,
                                    title text
                                );"""
    db_connector.run_query(sql_create_ptt_table_query)

    for _, row in df.iterrows():
        insert_dict = {
        "board": str(row['board']),
        "title": str(row['title'])
                            }
        columns = ['board', 'title']
        add_new_row_query = db_connector.get_insert_row_query('PTT_dealed', columns)
        db_connector.run_no_return_query(add_new_row_query, insert_dict)

    label2id_,id2label_ = label_encoder(df)
    df['label'] = df['board'].apply(lambda x: label2id_[x])

    pretain_model = "Langboat/mengzi-bert-base"
    tokenizer,model = model_config(pretain_model,df,label2id_,id2label_)

    train_set, eval_set = train_test_split_process(tokenizer,df)
    train_arg_set={
        'output_dir':"runs/10epoch_PTT_classifier_bert-base-mengzi_model",
        'num_train_epochs':10,
        'per_device_train_batch_size':8,
        'per_device_eval_batch_size':8,
    }
    training_and_save_model_processing(model,train_arg_set,train_set,eval_set,tokenizer)

    

Connect to database successfully


Unnamed: 0,board,title
0,NBA,\n[花邊] 到1/21為止所有球員TPA排行圖\n
1,NBA,\n[公告] 板規10.1\n
2,NBA,\n[情報] SEASON Schedule January 22–23\n
3,NBA,\n[公告] 板主徵選開始\n
4,NBA,"\n[情報] NBA Standings (Jan. 22, 2023)\n"
...,...,...
2996,Lifeismoney,\n[情報] 大魯閣用icashPay滿$500贈10%op\n
2997,Lifeismoney,\n[情報] === 元月全台捐血贈品 === (1/26更新)\n
2998,Lifeismoney,\n[公告] 板規(113.1.15修訂)暨違規公告區\n
2999,Lifeismoney,\n[公告] 贈送集中文\n


數據獲取失敗：This result object does not return rows. It has been closed automatically.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Langboat/mengzi-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


5
GPU is available and supports CUDA.
CUDA is available and can be used by 1 device
Current_device number: 0
<torch.cuda.device object at 0x000001E62480D450>
PyTorch choose the GPU (what current_device number's meaning): NVIDIA GeForce RTX 2050


d:\ZProject\NLP\Text_Categorizer_for_Chinese_Community\Train\runs/10epoch_PTT_classifier_bert-base-mengzi_model is already a clone of https://huggingface.co/youchengChung/10epoch_PTT_classifier_bert-base-mengzi_model. Make sure you pull the latest changes with `repo.git_pull()`.


  0%|          | 0/640 [00:00<?, ?it/s]

KeyboardInterrupt: 