In [1]:
!pip install transformers
import time
import torch
import math
import numpy
from transformers import BertTokenizer
from transformers import logging
from IPython.display import clear_output
from transformers import BertForMaskedLM
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from transformers import BertForSequenceClassification

PRETRAINED_MODEL_NAME = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)


Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 6.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 38.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 39.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 3.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 19.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [2]:
from google.colab import drive
drive.mount('/content/drive')
df_train = pd.read_csv('/content/drive/MyDrive/CS579_project/train.csv')

Mounted at /content/drive


In [3]:
from scipy.sparse.construct import rand

# delete row with long title
# Because long title will 
MAX_LENGTH = 150
df_train = df_train[~(df_train.title1_en.apply(lambda x : len(x)) > MAX_LENGTH)]
df_train = df_train[~(df_train.title2_en.apply(lambda x : len(x)) > MAX_LENGTH)]

# 250 thousands training datas are too large to spend a lot of time 
# So I select 70 percents of datas to train the model
SAMPLE_FRAC = 0.5
df_train = df_train.sample(frac=SAMPLE_FRAC, random_state=9527)

df_train = df_train.reset_index()
df_train = df_train.loc[:, ['title1_en', 'title2_en', 'label']]



# save processed training data to csv file
df_train.to_csv("train.csv", sep=",", index=False)



In [4]:
import random
import pandas

In [5]:
df_len = len(df_train)
print(df_len)
split = 0.7
inde = math.floor(df_len * split)

df_train_train = df_train.iloc[:inde, :] 
df_train_val = df_train.iloc[inde+1:, : ] 
print(len(df_train_train))
print(len(df_train_val))
df_train_train.to_csv("df_train_train.csv", sep=",", index=False)
df_train_val.to_csv("df_train_val.csv", sep=",", index=False)




121870
85309
36560


In [6]:
type(df_train_train)

pandas.core.frame.DataFrame

In [7]:
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.  
    
    device = torch.device('cuda')    


    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device('cpu')

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [8]:
class fake_news(Dataset):
    def __init__(self, mode, tokenizer):
        assert mode in ["df_train_train", "test","df_train_val"] 
        self.mode = mode
        # generate train or test csv file
        self.df = pd.read_csv(mode + ".csv")
        self.len = len(self.df)
        self.label_map = {'agreed': 0, 'disagreed': 1, 'unrelated': 2}
        self.tokenizer = tokenizer  

    # 定義回傳一筆訓練 / 測試數據的函式
    def __getitem__(self, idx):
        if self.mode == "test":
            title1_en, title2_en = self.df.iloc[idx, :2].values
            label_tensor = None
        else:
            title1_en, title2_en, label = self.df.iloc[idx, :].values
            # 將 label 文字也轉換成索引方便轉換成 tensor
            label_id = self.label_map[label]
            label_tensor = torch.tensor(label_id)

        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(title1_en)
        word_pieces += tokens_a + ["[SEP]"]
        len_a = len(word_pieces)

        # 第二個句子的 BERT tokens
        tokens_b = self.tokenizer.tokenize(title2_en)
        word_pieces += tokens_b + ["[SEP]"]
        len_b = len(word_pieces) - len_a

        # 將整個 token 序列轉換成索引序列
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        # 將第一句包含 [SEP] 的 token 位置設為 0，其他為 1 表示第二句
        segments_tensor = torch.tensor([0] * len_a + [1] * len_b,
                                       dtype=torch.long)     

        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        return self.len


In [9]:
# 初始化一個專門讀取訓練樣本的 Dataset，使用中文 BERT 斷詞
trainset = fake_news("df_train_train", tokenizer=tokenizer)
validset = fake_news("df_train_val", tokenizer=tokenizer)

sample_idx = 0

# 將原始文本拿出做比較
text_a, text_b, label = trainset.df.iloc[sample_idx].values

# 利用剛剛建立的 Dataset 取出轉換後的 id tensors
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# 將 tokens_tensor 還原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = "".join(tokens)





# 這個函式的輸入 `samples` 是一個 list，裡頭的每個 element 都是
# 剛剛定義的 `FakeNewsDataset` 回傳的一個樣本，每個樣本都包含 3 tensors：
# - tokens_tensor
# - segments_tensor
# - label_tensor
# 它會對前兩個 tensors 作 zero padding，並產生前面說明過的 masks_tensors

In [10]:
print(trainset[0])

(tensor([  101,  1302,  2187,  1165,  1106, 19722,  1103,  3738,  3705,  9630,
          117,  1103,  9230,  1910,  1148,  3349,  1106,  1138,  1292,  1160,
        21487,  1363,   106,   102,   153, 14517,  2861,  2053,  1976,  1106,
         1267,  1191,  1175,  1132,  1160,  4802,  1107,  1103,  1313,   117,
         1191,  1128,  1169,  3670,   170,  1415,  2971,  1104,  1948,   106,
          102]), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1]), tensor(2))


In [11]:
def collate_fn(samples):
    
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    

    # make sure each tensor has same length which is maximum(512)
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)

    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

In [12]:
print(len(trainset))

85309


In [13]:

trainloader = DataLoader(trainset, batch_size=64,collate_fn=collate_fn)
validloader = DataLoader(validset, batch_size=64,collate_fn=collate_fn)

In [14]:
print(trainloader.dataset[0])
print(len(trainloader))
print(validloader.dataset[0])
print(len(validloader))

(tensor([  101,  1302,  2187,  1165,  1106, 19722,  1103,  3738,  3705,  9630,
          117,  1103,  9230,  1910,  1148,  3349,  1106,  1138,  1292,  1160,
        21487,  1363,   106,   102,   153, 14517,  2861,  2053,  1976,  1106,
         1267,  1191,  1175,  1132,  1160,  4802,  1107,  1103,  1313,   117,
         1191,  1128,  1169,  3670,   170,  1415,  2971,  1104,  1948,   106,
          102]), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1]), tensor(2))
1333
(tensor([  101,  8040,  5773,  3991,   106,  1975,   107,  8388,   107,  1111,
        23673, 24118,  7972,   117,  1106,  1103,  1244,  1311,  7809,   131,
        10602,  1121,  1103,  1244,  1311,  2319,   106,   102, 20164,  7220,
         6851,  2593,  1106,  1646,  1433,  4449, 12287,   131,  6014,  2371,
          102]), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [15]:


PRETRAINED_MODEL_NAME = "bert-base-cased"
NUM_LABELS = 3

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

clear_output()


In [16]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
    i =0
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors)

            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
          
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()

            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))


    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions

In [17]:
# get the accurancy of training set
model = model.to(device)
print("device:", device)
_, acc = get_predictions(model, validloader, compute_acc=True)
print("classification acc:", acc)

start = time.time()

# activate training mode 
model.train()

# initialize optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

EPOCHS = 4
for epoch in range(EPOCHS):

    running_loss = 0.0
    for data in trainloader:
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]
        optimizer.zero_grad()

        # forward pass
        outputs = model(input_ids=tokens_tensors,
                        token_type_ids=segments_tensors,
                        attention_mask=masks_tensors,
                        labels=labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()

        # 紀錄當前 batch loss
        running_loss += loss.item()

    # 計算分類準確率
    _, acc = get_predictions(model, validloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

print("The time used to execute this is given below")

end = time.time()

print(end - start)

device: cuda
classification acc: 0.6464442013129102
[epoch 1] loss: 576.179, acc: 0.828
[epoch 2] loss: 424.305, acc: 0.833
[epoch 3] loss: 315.273, acc: 0.834
[epoch 4] loss: 230.502, acc: 0.833
The time used to execute this is given below
9172.241270780563


In [18]:
from google.colab import drive
drive.mount('/content/drive')
df_test = pd.read_csv('/content/drive/MyDrive/CS579_project/test.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
MAX_LENGTH = 150
df_test = df_test[~(df_test.title1_en.apply(lambda x : len(x)) > MAX_LENGTH)]
df_test = df_test[~(df_test.title2_en.apply(lambda x : len(x)) > MAX_LENGTH)]
df_test = df_test.reset_index()
df_test = df_test.loc[:, ['title1_en', 'title2_en','id']]
df_test.to_csv("test.csv", sep=",", index=False)


In [21]:


testset = fake_news("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=50, 
                        collate_fn=collate_fn)

predictions = get_predictions(model, testloader)

# transform the label to the words we can understand
index_map = {v: k for k, v in testset.label_map.items()}

# produce the result file
df = pd.DataFrame({"Category": predictions.tolist()})
df['Category'] = df.Category.apply(lambda x: index_map[x])
df_pred = pd.concat([testset.df.loc[:, ["id"]], 
                          df.loc[:, 'Category']], axis=1)
df_pred.to_csv('/content/drive/MyDrive/CS579_project/result.csv', index=False)
df_pred.head()

Unnamed: 0,id,Category
0,256442,unrelated
1,256443,unrelated
2,256444,unrelated
3,256445,unrelated
4,256446,unrelated
