In [1]:
import gc
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel

In [2]:
df = pd.read_csv("C:/Users/cheng/Desktop/vscode/AIintro/AI-FInal-Project-main/Dataset/Re_labeled_dataset.csv")
df.head()

Unnamed: 0,description,predicted_category
0,how's your sex life?by not sharing your real t...,"['ARTS', 'DIVORCE', 'ENTERTAINMENT', 'HEALTHY ..."
1,how to keep your child safe (and happy) online...,"['PARENTING', 'PARENTS']"
2,ground beef recipes that go beyond burgersthin...,"['FOOD & DRINK', 'TASTE']"
3,why there are no winners in the battle of brea...,"['PARENTING', 'PARENTS']"
4,the duchess of cambridge's black lace dress is...,"['ARTS & CULTURE', 'ENTERTAINMENT', 'MEDIA', '..."


In [3]:
#找出所有分類，並重新編排，若有該分類呈現1沒有為0

In [4]:

category_list = list(sorted(set(cate for cate_list in df['predicted_category'] for cate in eval(cate_list))))
for category in category_list:
    df[category] = df['predicted_category'].apply(lambda x: 1 if category in x else 0)
df.drop(columns = ['predicted_category'], inplace = True)
df.to_csv("C:/Users/cheng/Desktop/vscode/AIintro/AI-FInal-Project-main/Dataset/Re_labeled_dataset2.csv", index=False)
df = df.sample(frac=0.5, random_state=15)
df.head()

Unnamed: 0,description,ARTS,ARTS & CULTURE,BLACK VOICES,BUSINESS,COLLEGE,COMEDY,CRIME,CULTURE & ARTS,DIVORCE,...,TASTE,TECH,TRAVEL,U.S. NEWS,WEDDINGS,WEIRD NEWS,WELLNESS,WOMEN,WORLD NEWS,WORLDPOST
13648,"spock was my guyspock was my guy: calm, ration...",0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
66815,"isis is worse than genghis khan, says top iraq...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
54762,de-man cave or un-man cave? that is the questi...,1,1,1,1,1,1,1,1,1,...,1,0,1,1,1,1,1,1,1,1
43756,jennifer lawrence reportedly laughs off critic...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21234,selig counted money while baseball lost the ne...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
train_df, test_df= train_test_split(df, random_state = 15, test_size = 0.3)
train_df = train_df[train_df['description'].notna()]
test_df = test_df[test_df['description'].notna()]



In [6]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 2
LEARNING_RATE = 3e-5

In [7]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['description']
        self.max_len = max_len
        self.targets = self.df[category_list].values
    
    def __len__(self): return len(self.title)
    
    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index])
        }



In [9]:
train_df, valid_df = train_test_split(train_df, test_size=0.1, random_state=15)
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)
valid_df.reset_index(drop=True, inplace=True)
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(valid_df, tokenizer, MAX_LEN)




In [10]:
train_data_loader = torch.utils.data.DataLoader(train_dataset, 
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

valid_data_loader = torch.utils.data.DataLoader(valid_dataset, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)


In [11]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')



In [12]:
##建置模型

In [13]:
gc.collect()
torch.cuda.empty_cache()
class build_model(torch.nn.Module):
    def __init__(self):
        super(build_model, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, 41)
    
    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids, 
            attention_mask=attn_mask, 
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

model = build_model()
model.to(device)

build_model(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [14]:
loss_function = torch.nn.BCEWithLogitsLoss
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
valid_targets, valid_outputs = [], []

In [15]:

def train_model(n_epochs, training_loader, validation_loader, model, optimizer):
   
    # initialize tracker for minimum validation loss
    valid_loss_min = np.Inf

    for epoch in range(1, n_epochs + 1):
        gc.collect()
        torch.cuda.empty_cache()
        train_loss = 0
        valid_loss = 0

        model.train()
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        
        batch_idx = 1
        for data in tqdm(training_loader, desc=f"Epoch {epoch} Training"):
            batch_idx += 1
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)

            optimizer.zero_grad()
            loss = loss_function()(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss = train_loss + ((1 / batch_idx) * (loss.item() - train_loss))

        print('############# Epoch {}: Training End     #############'.format(epoch))

        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        
        model.eval()
        
        valid_targets = []
        valid_outputs = []
        with torch.no_grad():
            batch_idx = 0
            for data in tqdm(validation_loader, desc=f"Epoch {epoch} Validation"):
                batch_idx += 1
                ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_function()(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                valid_targets.extend(targets.cpu().detach().numpy().tolist())
                valid_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print('############# Epoch {}: Validation End     #############'.format(epoch))
        
        # calculate average losses
        train_loss = train_loss / len(training_loader)
        valid_loss = valid_loss / len(validation_loader)
        
        # print training/validation statistics 
        print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch, 
            train_loss,
            valid_loss
        ))

        print('############# Epoch {}  Done   #############\n'.format(epoch))

    return model

In [16]:
trained_model = train_model(EPOCHS, train_data_loader, valid_data_loader, model, optimizer)


############# Epoch 1: Training Start   #############


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Epoch 1 Training: 100%|██████████| 2063/2063 [1:00:55<00:00,  1.77s/it]


############# Epoch 1: Training End     #############
############# Epoch 1: Validation Start   #############


Epoch 1 Validation: 100%|██████████| 230/230 [02:44<00:00,  1.40it/s]


############# Epoch 1: Validation End     #############
Epoch: 1 	Avgerage Training Loss: 0.000129 	Average Validation Loss: 0.000964
############# Epoch 1  Done   #############

############# Epoch 2: Training Start   #############


Epoch 2 Training: 100%|██████████| 2063/2063 [1:28:31<00:00,  2.57s/it]


############# Epoch 2: Training End     #############
############# Epoch 2: Validation Start   #############


Epoch 2 Validation: 100%|██████████| 230/230 [05:53<00:00,  1.54s/it]

############# Epoch 2: Validation End     #############
Epoch: 2 	Avgerage Training Loss: 0.000100 	Average Validation Loss: 0.000904
############# Epoch 2  Done   #############






In [27]:
accuracy = 0
threshold = 0.5
#torch.save(trained_model.state_dict(), "C:/Users/cheng/Desktop/vscode/AIintro/AI-FInal-Project-main/Model/trained_model_state.pt")
for i in tqdm(range(len(test_df)), desc=f'Test:'):
    data = test_df['description'][i]
    true_list = set([cate for cate in category_list if test_df.loc[i][cate] == 1])
    encodings = tokenizer.encode_plus(
        data,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_token_type_ids=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    model.eval()
    with torch.no_grad():
        input_ids = encodings['input_ids'].to(device, dtype=torch.long)
        attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
        output = model(input_ids, attention_mask, token_type_ids)
        final_output = torch.sigmoid(output).cpu().detach().numpy()
        
        pred = train_df.columns[1:][np.where(final_output[0] > threshold)[0]]
        
        accuracy += len(true_list.intersection(pred)) / len(true_list.union(pred))
print('accurancy {}'.format(accuracy / len(test_df)))

Test::   1%|          | 136/15714 [00:13<26:39,  9.74it/s]


KeyboardInterrupt: 