In [None]:
import sqlite3
import pandas as pd
import os

In [None]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification,Trainer 
from sklearn.preprocessing import LabelEncoder
from transformers import TrainingArguments

import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

## Object Definition

In [None]:
class TokenReader:
    """Read ``key=value`` pairs from a text file into a dictionary.

    The file is parsed once, when the object is constructed. Lines that do
    not contain ``=`` are ignored. Surrounding whitespace and surrounding
    single or double quotes are removed from each value.
    """

    def __init__(self, file_path):
        """Remember ``file_path`` and immediately parse it."""
        self.file_path = file_path
        self.tokens = {}
        self.read_tokens()

    def read_tokens(self):
        """Populate ``self.tokens`` from the file.

        If the file does not exist, a message is printed and ``self.tokens``
        is reset to an empty dictionary instead of raising.
        """
        try:
            with open(self.file_path, 'r') as file:
                for line in file:
                    # Split on the FIRST '=' only: secret values (for example
                    # base64-like tokens) may themselves contain '='.
                    key, separator, value = line.strip().partition('=')
                    if not separator:
                        continue
                    self.tokens[key.strip()] = value.strip().strip("'").strip('"')
        except FileNotFoundError:
            print(f"Error: File '{self.file_path}' not found.")
            self.tokens = {}

    def get_token_value(self, token_name):
        """Return the value stored for ``token_name``, or ``None`` if absent."""
        return self.tokens.get(token_name, None)

In [None]:
class DatabaseHandler:
    """Small convenience wrapper around one SQLite connection.

    Errors in ``create_connection``, ``create_table`` and ``add_new_row`` are
    reported with ``print`` rather than raised, so a notebook run keeps going.
    """

    def __init__(self, db_name):
        """Store the database path; no connection is opened yet."""
        self.db_file = db_name
        self.conn = None

    def create_connection(self):
        """Open the SQLite connection and keep it on ``self.conn``."""
        try:
            self.conn = sqlite3.connect(self.db_file)
        except Exception as ex:
            print(f"Connection error: {ex}")

    def create_table(self, create_table_query):
        """Execute a ``CREATE TABLE`` statement given as a SQL string."""
        try:
            c = self.conn.cursor()
            c.execute(create_table_query)
            # No explicit commit here; call self.conn.commit() if the change
            # must be committed immediately.
        except Exception as ex:
            print(f"Create table error:{ex}")

    def add_new_row(self, table, insert_list):
        """Insert one ``(board, title)`` row into ``table`` and commit.

        Args:
            table: Name of the target table. It is interpolated into the SQL
                text (identifiers cannot be bound), so it must come from
                trusted code, never from user input.
            insert_list: The values for the ``board`` and ``title`` columns.
        """
        try:
            # Bind the values as parameters instead of pasting them into the
            # SQL text: titles containing quotes are stored correctly and
            # cannot alter the statement.
            placeholders = ",".join("?" for _ in insert_list)
            add_new_row_query = f"INSERT INTO {table}(board,title) VALUES({placeholders})"

            cursor = self.conn.cursor()
            cursor.execute(add_new_row_query, list(insert_list))
            self.conn.commit()
        except Exception as ex:
            print(f"Insert error: {ex}")
            print(insert_list)

    def get_data(self, query):
        """Run ``query`` on the open connection and return a DataFrame."""
        query_result = pd.read_sql(query, self.conn)
        return query_result

    def close_connection(self):
        """Close the connection if one was opened."""
        if self.conn:
            self.conn.close()



In [None]:
class PTTDataset(Dataset):
    """Dataset that tokenizes the ``title`` column of a DataFrame on access.

    Each item is a 4-tuple ``(input_ids, token_type_ids, attention_mask,
    label)``. The first three come straight from the tokenizer output; the
    label is a one-element ``torch.LongTensor`` built from the ``label``
    column.
    """

    def __init__(self, tokenizer, df) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Positional lookup, so the frame's index labels do not matter.
        record = self.df.iloc[index]

        # Titles are truncated to at most 32 tokens.
        encoded = self.tokenizer(
            record['title'], padding=True, truncation=True, max_length=32
        )
        ids = encoded['input_ids']
        segment_ids = encoded['token_type_ids']
        mask = encoded['attention_mask']

        target = torch.LongTensor([record['label']])

        return (ids, segment_ids, mask, target)

In [None]:
def create_batch(datas):
    """Collate dataset items into one zero-padded batch.

    Args:
        datas: Sequence of ``(input_ids, token_type_ids, attention_mask,
            label)`` tuples. The first three are integer sequences; ``label``
            is a tensor, or ``None`` for unlabeled data.

    Returns:
        A dict with ``input_ids``, ``token_type_ids`` and ``attention_mask``
        as ``torch.long`` tensors padded with zeros to the longest sequence
        in the batch (batch dimension first), and ``labels`` as the stacked
        label tensors, or ``None`` when the first item has no label.
    """

    def _pad_long(position):
        # Build integer tensors directly. Going through the float32
        # torch.Tensor(...) constructor would round ids above 2**24.
        sequences = [
            torch.as_tensor(sample[position], dtype=torch.long) for sample in datas
        ]
        return pad_sequence(sequences, batch_first=True)

    if datas[0][3] is not None:
        labels = torch.stack([sample[3] for sample in datas])
    else:
        labels = None

    res = {
        "input_ids": _pad_long(0),
        "token_type_ids": _pad_long(1),
        "attention_mask": _pad_long(2),
        "labels": labels
    }

    return res

## Login HF_hub

In [None]:
from huggingface_hub import login, notebook_login

file_path = 'huggingface_token.txt'
reader = TokenReader(file_path)

HF_token_value = reader.get_token_value('HF_token')
# Never echo the secret itself: cell output is saved inside the notebook file.
print("Token loaded:", HF_token_value is not None)

# Use the token from the file when there is one; otherwise fall back to the
# interactive login widget.
if HF_token_value:
    login(token=HF_token_value)
else:
    notebook_login()

## Database Query

In [None]:
# Build the path from its components: a literal "Crawler\mydatabase1.db"
# contains the invalid escape sequence "\m" and only works on Windows.
database_path = os.path.join("Crawler", "mydatabase1.db")
current_folder = os.getcwd()  # current working directory
database_name = os.path.join(current_folder, '..', database_path)  # <cwd>/../Crawler/mydatabase1.db
print(database_name)

# Check that the target database file exists.
# NOTE: sqlite3.connect() creates a new empty file when the path is missing,
# so a "does not exist" message here means later queries will find no tables.
if os.path.exists(database_name):
    print("database存在，可以進行讀取資料夾的操作。")
else:
    print("database不存在。")



handler = DatabaseHandler(database_name)
# create a database connection
handler.create_connection()

In [None]:
# Read the board and title columns of every row in the PTT table into a DataFrame.
query = """SELECT board,title FROM PTT"""
df = handler.get_data(query)

## Data Overview

In [None]:
# Preview the first rows of the query result.
display(df.head())

In [None]:
# Summary statistics for the columns of df.
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of df.
df.info()

In [None]:
# Number of posts on each board.
# A single grouped count replaces the per-board scan of the whole column;
# sort=False keeps the boards in order of first appearance.
board_count = df.groupby('board', sort=False).size().to_dict()
print(board_count)

## Cleaning

1. Remove `'\n'` characters from titles
2. Drop announcement posts (titles containing `[公告]`)

In [None]:
# Remove embedded newlines ('\n') from every title.
# Assigning the whole column avoids chained indexing (df['title'][i] = ...),
# which raises SettingWithCopyWarning and may not modify df at all.
df['title'] = df['title'].str.replace('\n', '', regex=False)

In [None]:
# Drop announcement posts: titles containing the literal tag "[公告]".
# regex=False matches the brackets literally; na=False keeps rows whose
# title is missing instead of failing on them.
is_announcement = df['title'].str.contains("[公告]", regex=False, na=False)
df = df[~is_announcement].reset_index(drop=True)

In [None]:
# Persist the cleaned rows into their own table (PTT_dealed).

sql_create_ptt_table_query = """CREATE TABLE IF NOT EXISTS PTT_dealed (
                                    board text NOT NULL,
                                    title text
                                );"""

# Create the destination table if it is not there yet.
handler.create_table(sql_create_ptt_table_query)

# Write one row per cleaned post, with both fields stored as text.
for board, title in zip(df['board'], df['title']):
    handler.add_new_row('PTT_dealed', [str(board), str(title)])

# The database is not used again after this point.
handler.close_connection()

## Label

In [None]:
from sklearn.preprocessing import LabelEncoder


# Fit the encoder on the distinct board names.
label_encoder = LabelEncoder()
label_encoder.fit(df['board'].unique())

# Board names known to the encoder, and the integer id assigned to each.
label_class = list(label_encoder.classes_)
label_id = label_encoder.transform(label_class)

# Plain-Python lookup tables in both directions (ids converted to built-in int).
label2id_ = dict(zip(label_class, (int(code) for code in label_id)))
id2label_ = {code: name for name, code in label2id_.items()}

In [None]:
# Mapping from board name to integer class id.
label2id_

In [None]:
# Mapping from integer class id back to board name.
id2label_

In [None]:
# Add the integer class id of each row's board as the 'label' column.
df['label'] = df['board'].apply(lambda x: label2id_[x])

## Model

In [None]:
from transformers import AutoModelForSequenceClassification
from shutil import ignore_patterns

# Load the tokenizer and a sequence-classification model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Langboat/mengzi-bert-base")
model = AutoModelForSequenceClassification.from_pretrained(
        "Langboat/mengzi-bert-base",
        num_labels = len(df['label'].unique()),  # one output class per distinct label id in df
        label2id = label2id_,
        id2label = id2label_,
        ignore_mismatched_sizes=True
    )
# Sanity check: number of output classes in the loaded model's config.
print(model.config.num_labels)


In [None]:
# Display the loaded tokenizer.
tokenizer

In [None]:
# Display the loaded model.
model

In [None]:
# Get the model's configuration object
config = model.config

# Print the configuration (architecture settings)
print(config)

# Look up the number of hidden layers in the model
num_layers = model.config.num_hidden_layers
print("模型的層數：", num_layers)

## Tokenizer

In [None]:
from sklearn.model_selection import train_test_split


# Work on a small random subset of at most 1024 posts. The cap prevents
# df.sample from raising when fewer than 1024 rows remain after cleaning.
# (The variable keeps its original spelling so existing references still work.)
samll_df = df.sample(n=min(1024, len(df)), random_state=42)

# Split the subset in half: one part for training, one for evaluation.
train_df, eval_df = train_test_split(samll_df, test_size=0.5, random_state=42)

train_set = PTTDataset(tokenizer, train_df)
eval_set = PTTDataset(tokenizer,eval_df)


In [None]:
# Inspect one encoded training example (the item at position 1).
train_set[1]

## Evaluation

In [None]:
import evaluate

# Load the "accuracy" metric; compute_metrics uses it during evaluation.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(predictions, labels)`` pair.

    Args:
        eval_pred: Pair of array-likes. ``predictions`` holds per-class
            scores with the class axis at position 1; ``labels`` holds the
            reference class ids.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max predictions and
        the flattened labels.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    # create_batch stacks one-element label tensors, so labels can arrive with
    # shape (N, 1). Flatten them to line up with the 1-D predictions.
    labels = np.asarray(labels).reshape(-1)
    return accuracy.compute(predictions=predictions, references=labels)

## Training

In [None]:
import torch

# Move the model to the GPU when CUDA is usable, and report which device is used.
if not torch.cuda.is_available():
    print("GPU is not available or does not support CUDA.")
else:
    print("GPU is available and supports CUDA.")
    model = model.to("cuda")
    print('CUDA is available and can be used by', torch.cuda.device_count(), 'device')
    print('Current_device number:', torch.cuda.current_device())  # should be zero
    print(torch.cuda.device(0))
    print("PyTorch choose the GPU (what current_device number's meaning):", torch.cuda.get_device_name(0))


In [None]:
# Training-set size, and that size divided by 8
# (8 is also the per_device_train_batch_size passed to TrainingArguments in the training cell).
print(len(train_set))
print('iteration:',len(train_set)/8)

In [None]:
%time
import torch
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="runs/10epoch_PTT_classifier_bert-base-mengzi_model",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=eval_set,
    tokenizer=tokenizer,
    data_collator=create_batch,
    compute_metrics=compute_metrics,
)

trainer.train()



In [None]:
# Save the trained model to a local folder
trainer.model.save_pretrained('model/10epoch_PTT_classifier_bert-base-mengzi_model')
# Upload the training result to the Hugging Face Hub
trainer.push_to_hub()

### Post-training inference check

In [None]:
# A sample post title to classify, tokenized into PyTorch tensors.
text = " Re: [情報] 全家APP刮刮樂又來了"
inputs = tokenizer(text, return_tensors="pt")

In [None]:
# Run the sample through the model on the CPU.
model = model.to("cpu")
# Switch to evaluation mode so dropout is disabled and the prediction is
# deterministic, whatever mode training left the model in.
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
# Display the full model output for the sample input.
model(**inputs)

In [None]:
# Take the index of the largest logit and map it to its label via the model config.
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]