In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os
# Work from the project folder on the mounted Drive so that the relative paths
# used later ("data/...", "medical_bert_ch/...", "./results", "./logs") resolve against it.
os.chdir('/content/gdrive/My Drive/Colab/AICUP')
# Display the folder contents as a quick sanity check.
os.listdir()

['results',
 'logs',
 'medical_bert_ch',
 'risk_0603.ipynb',
 'data',
 'risk_train.ipynb']

In [None]:
!pip install transformers
!pip install datasets

In [None]:
# Report the versions of the three core libraries used by this notebook.
import torch
import transformers
import datasets

print(f"PyTorch 的版本為: {torch.__version__}")
print(f"Hugging Face Transformers 的版本為: {transformers.__version__}")
print(f"Hugging Face Datasets 的版本為: {datasets.__version__}")

PyTorch 的版本為: 1.8.1+cu101
Hugging Face Transformers 的版本為: 4.6.1
Hugging Face Datasets 的版本為: 1.8.0


In [None]:
import unicodedata
import re
import tqdm
import pandas as pd
import numpy as np

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer
from transformers import BertForMultipleChoice, BertForSequenceClassification
from datasets import load_metric
from transformers import BertConfig,BertModel

from IPython.display import clear_output
from pathlib import Path

In [None]:
# Load the labelled training table and discard the article_id column in one chain.
df = pd.read_csv("data/risk_df_final_sp.csv").drop(columns=["article_id"])
df.head()

Unnamed: 0,label,text
0,1,这个月还好这个月还好这个月还好还好还可以有性行为有是跟固定伴侣对固定伴侣对你有固定伴侣啰本来...
1,1,你会吃prep你会戴套对戴套对戴套全程对如果说全程的话也是没有那如果没有戴套是他要求还是你要...
2,1,没有很相信你固炮但我觉得是好因为保护自己因为prep本来不是吃给对方看不是一个证明文档说我有...
3,1,你这个月还好还好那有固定伴侣没有你一直以来都没有固定伴侣什么你从一开始没有固定伴侣你说来这里...
4,1,还是比较少一点偶尔戴套意思是那吃prep之后更少比偶尔还要再少一点戴套对你会觉得吃prep是...


In [None]:
# Map full-width digit labels to ASCII digits. The result is assigned back to the
# column instead of calling replace(inplace=True) on the df["label"] selection:
# an in-place call on a chained selection does not reliably update `df`
# (and never does under pandas copy-on-write).
df["label"] = df["label"].replace({"０": "0", "１": "1"})
# Convert to a numeric dtype; errors="raise" surfaces any unexpected label value.
df["label"] = pd.to_numeric(df["label"], errors="raise")
df.label.value_counts()

0    495
1    378
Name: label, dtype: int64

In [None]:
# Pull labels and texts out of the frame as plain Python lists; every text is
# passed through Unicode NFKC normalization.
all_labels = df['label'].values.tolist()
all_texts = [
    unicodedata.normalize("NFKC", raw_text)
    for raw_text in df['text'].values.tolist()
]

In [None]:
random_seed = 42  # seed handed to TrainingArguments(seed=...) further down

In [None]:
# Alternative kept for reference: load the stock "bert-base-chinese" tokenizer instead.
#pre_token_ch = "bert-base-chinese"
#tokenizer = transformers.AutoTokenizer.from_pretrained(pre_token_ch)
# Active choice: build the tokenizer from the vocabulary file in the local medical_bert_ch folder.
tokenizer = BertTokenizer.from_pretrained("medical_bert_ch/vocab.txt")



In [None]:
# train_texts / val_texts / train_labels / val_labels were never defined anywhere in
# the notebook, so this cell (and the train/val RiskDataset cell) raised NameError on
# a fresh kernel. Create a seeded, stratified hold-out split here.
# NOTE(review): the 80/20 ratio is an assumption -- the original split cell is
# missing; adjust test_size if a different ratio was intended.
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    stratify=all_labels,
    random_state=random_seed,
)

# Tokenize both splits: truncate to at most 512 tokens and pad each split to its
# longest sequence.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, add_special_tokens = True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, add_special_tokens = True, max_length=512)

In [None]:
class RiskDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: mapping from field name (must include ``'input_ids'``) to a
            per-sample indexable sequence, e.g. the output of a tokenizer call.
        labels: per-sample indexable sequence of labels, or ``None`` for
            unlabeled data (e.g. at prediction time).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        # The number of samples is taken from the input_ids field.
        self.len = len(encodings['input_ids'])

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``; adds ``'labels'`` when labels exist."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Identity check: `!= None` is evaluated elementwise for numpy arrays / tensors
        # and makes the `if` raise "truth value ... is ambiguous".
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples in the dataset."""
        return self.len

In [None]:
# Wrap the tokenized train / validation splits together with their labels.
train_dataset = RiskDataset(encodings=train_encodings, labels=train_labels)
val_dataset = RiskDataset(encodings=val_encodings, labels=val_labels)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve, auc

def compute_metrics(pred):
    """Compute accuracy and ROC AUC for a two-class prediction result.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores, indexed here as columns 0 and 1).

    Returns:
        dict with the keys ``'accuracy'`` and ``'auc'``.
    """
    labels = pred.label_ids
    logits = pred.predictions
    preds = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, preds)
    # Score each sample by the logit margin (class 1 minus class 0). The margin is a
    # monotone function of softmax P(class 1), so the AUC matches the probability
    # used downstream; the raw class-1 logit alone does not rank samples the same way.
    auc1 = roc_auc_score(labels, logits[:, 1] - logits[:, 0])
    return {
        'accuracy': acc,
        'auc': auc1
    }

In [None]:
# Build the model configuration from the JSON file in the local medical_bert_ch folder.
config = BertConfig.from_json_file("medical_bert_ch/bert_config.json")
# Initialise a sequence-classification model from the local checkpoint file using that configuration.
model = BertForSequenceClassification.from_pretrained("medical_bert_ch/medical_bert_ch.bin", config=config)
# Alternative kept for reference: start from the checkpoint named by pre_token_ch instead.
#model_base = BertForSequenceClassification.from_pretrained(pre_token_ch)

Some weights of the model checkpoint at medical_bert_ch/medical_bert_ch.bin were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification

In [None]:
# Tokenize the complete labelled corpus (truncate to 512 tokens, pad to the longest
# sequence) and wrap it as a dataset.
all_encodings = tokenizer(
    all_texts,
    max_length=512,
    truncation=True,
    padding=True,
    add_special_tokens=True,
)
all_dataset = RiskDataset(encodings=all_encodings, labels=all_labels)

In [None]:
# Training hyper-parameters for the run on the full labelled set.
training_args_test = transformers.TrainingArguments(
    output_dir='./results',       
    num_train_epochs=4,           
    per_device_train_batch_size=8,
    # warmup and weight decay are left disabled (library defaults apply)
    #warmup_steps=500,              
    #weight_decay=0.01,          
    logging_dir='./logs',     
    logging_steps=20,
    seed=random_seed,
    # with per-device batch size 8 this gives 8 * 5 = 40 samples per optimizer step on one device
    gradient_accumulation_steps = 5,
    learning_rate = 3e-5
)

In [None]:
# Trainer over the complete labelled dataset. compute_metrics is supplied, but no
# eval_dataset is passed in this call.
trainer_all = transformers.Trainer(
    model=model,                     
    args=training_args_test,         
    train_dataset=all_dataset,         
    compute_metrics=compute_metrics  
)

# Force training to use exactly 1 GPU.
# NOTE(review): this writes the private attribute `_n_gpu` of the arguments object.
trainer_all.args._n_gpu=1

# Start model training.
trainer_all.train()

Step,Training Loss
20,0.6994
40,0.6866
60,0.6781
80,0.6286


TrainOutput(global_step=88, training_loss=0.6670994650233876, metrics={'train_runtime': 651.8869, 'train_samples_per_second': 0.135, 'total_flos': 1097084920356864.0, 'epoch': 4.0, 'init_mem_cpu_alloc_delta': 4096, 'init_mem_gpu_alloc_delta': 413439488, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 12288, 'train_mem_gpu_alloc_delta': 1292518912, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 6503093248})

In [None]:
# Load the test table; it is displayed below so its columns can be inspected.
df_test = pd.read_csv("data/risk_df_final_test_sp.csv")
df_test

Unnamed: 0.1,Unnamed: 0,article_id,label,text
0,0,1,,差不多我看眼睛什么时候能够检查年底是许国强病例是好下礼拜那我们也是照样照照常纪录好那个伤口让...
1,1,1,,波第一次是那个人不舒服是因为尿道感染没有错好谢我们下次再做我们只要吃药开处方行好你能睡好决不...
2,2,1,,间也不碰任何东西它会慢慢脱落直到有一天消失好我会写一份转诊表我给你准备数据交给那边治疗师参考...
3,3,1,,常顺便说让我们看看上次测试是在多久之前完成上个星期他会说想吃糖果不是好这次我必须回来订购评论...
4,4,2,,让我们先检查索引该死我可以检查好电脑断层以前没有拍过没有计算机断层扫描我没有拍出好照片我很热...
...,...,...,...,...
14839,15236,5312,,教不错快好药还要继续开好药不你这周不需要它好你现在每天都吃现在不算每天吃你现在吃怎么样ond...
14840,15237,5313,,那我来推荐好你住在木叶咿会不会比较好不已经有一段时间姿势会保持固定还是不昨晚我总是会你你可能...
14841,15238,5313,,都会很开心请问一整天都在做对没关系好对因为大部分c肝都是因为打针引起你hiv有验过有以前被关...
14842,15239,5313,,作还是不数字这次发烧没事咳嗽没事我们会有开对没关系你下次把你爸爸药带来我帮他看因为我建议他整...


In [None]:
# NFKC-normalize the test texts (same preprocessing as the training texts), tokenize
# them, and wrap them as a dataset without labels.
test_texts = [
    unicodedata.normalize("NFKC", raw_text)
    for raw_text in df_test['text'].values.tolist()
]
test_encodings = tokenizer(
    test_texts,
    max_length=512,
    truncation=True,
    padding=True,
    add_special_tokens=True,
)
test_dataset = RiskDataset(test_encodings, None)

In [None]:
# Run the trained model over the unlabeled test dataset.
pred = trainer_all.predict(test_dataset)
# Raw model outputs; the displayed slice shows two values per sample.
tmp_logits = pred.predictions
# Show only the first 30 rows.
tmp_logits[:30]

array([[ 0.15526383, -0.63787585],
       [ 0.00157782, -0.284387  ],
       [ 0.10406069, -0.5723288 ],
       [ 0.04434576, -0.6109875 ],
       [ 0.29104483, -0.7957777 ],
       [-0.01164882, -0.09937813],
       [-0.05516176, -0.26391235],
       [ 0.51964194, -1.05224   ],
       [-0.04977325, -0.20040031],
       [-0.09051165, -0.25522968],
       [ 0.01043136, -0.06676529],
       [-0.02005613, -0.38878185],
       [ 0.00222101, -0.18366702],
       [-0.20027657,  0.037548  ],
       [ 0.26939064, -0.8583077 ],
       [ 0.36725542, -0.8526306 ],
       [ 0.1619865 , -0.6425852 ],
       [ 0.17375733, -0.62338114],
       [ 0.19013801, -0.71973085],
       [-0.00710744, -0.26847333],
       [-0.05131865, -0.35384348],
       [ 0.05606695, -0.35146636],
       [ 0.09912207, -0.22625764],
       [ 0.04774552, -0.32937464],
       [ 0.19368683, -0.740471  ],
       [ 0.32996908, -1.0242265 ],
       [ 0.30535993, -0.7359393 ],
       [ 0.01169293, -0.19497918],
       [-0.06196823,

In [None]:
# Convert the numpy logits to a tensor and apply softmax across dim 1 (per row).
myts = torch.from_numpy(tmp_logits)
m = torch.nn.Softmax(dim=1)
# Keep column 1 of the softmax output as the per-row probability, back as a numpy array.
mynp1 = m(myts)[:,1].numpy()
mynp1[:30]

array([0.3114949 , 0.428992  , 0.3370676 , 0.3417887 , 0.2522171 ,
       0.47808173, 0.44800106, 0.17194825, 0.4624143 , 0.45891333,
       0.4807104 , 0.40884897, 0.45366132, 0.55917746, 0.2445861 ,
       0.2279565 , 0.3090484 , 0.31063798, 0.28702667, 0.435028  ,
       0.42494038, 0.39950374, 0.41936526, 0.40682164, 0.28208193,
       0.20518528, 0.2608994 , 0.44851512, 0.49492604, 0.19424188],
      dtype=float32)

In [None]:
# Strip the columns that are not needed for the submission and attach the per-row
# probability as a new column.
df_test1 = (
    df_test
    .drop(columns=["Unnamed: 0", "text", "label"])
    .assign(probability=mynp1)
)
df_test1.head()

Unnamed: 0,article_id,probability
0,1,0.311495
1,1,0.428992
2,1,0.337068
3,1,0.341789
4,2,0.252217


In [None]:
# Aggregate to one row per article_id by averaging the remaining columns of its rows.
df_test2 = df_test1.groupby('article_id').mean()
df_test2.head()

Unnamed: 0_level_0,probability
article_id,Unnamed: 1_level_1
1,0.354836
2,0.392767
3,0.364425
4,0.44478
5,0.419142


In [None]:
# Write the aggregated table to disk; index=True keeps the group index (article_id) as a column.
df_test2.to_csv("data/decision.csv", index = True)