In [1]:
from datasets import load_dataset
import numpy as np
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train'].to_pandas()
validation_dataset = dataset['validation'].to_pandas()
test_dataset = dataset['test'].to_pandas()
max_len=max(0,train_dataset["text"].apply(lambda x:len(x)).max())
max_len=max(max_len,validation_dataset["text"].apply(lambda x:len(x)).max())
max_len=max(max_len,test_dataset["text"].apply(lambda x:len(x)).max())
max_len+=5


In [2]:
import nltk

def prep_pretrained_embedding():
    def build_vocab(train_dataset):
        # Create set, unique words only
        vocab = set()
        train_dataset_pos = []
        
        # Loop thru each sentence in training dataset
        for sentence in train_dataset['text']:
            # Basic text processing
            
            # Case folding
            sentence = sentence.lower()
            
            # NLTK tokenizer does a good job at separating meaningful words + punctuations
            # Better than defining regex ourselves
            word_list = nltk.tokenize.word_tokenize(sentence)
            
            # # Further split words into separate words
            # # e.g., 'well-being' -> 'well', 'being'
            # # e.g., 'music/song' -> 'music', 'song'
            # split_word_list = []
            # for word in sentence_list:
            #     split_word_list.extend(word.replace('-', ' ').replace('/', ' ').split())
            
            # Dont remove all special characters, some are meaningful
            # Some words are surrounded by single/double quotes
            word_list = [word.strip("'\"") for word in word_list]
            
            # Add into set
            vocab.update(word_list)
            
            # Get pos tags
            # Also build POS tags
            pos_tags = nltk.pos_tag(word_list)
            train_dataset_pos.append(pos_tags)
            
        vocab.discard('')
        return vocab, train_dataset_pos

    vocab, train_dataset_pos = build_vocab(train_dataset)



    def load_glove_embeddings(path):
        glove_embeddings = {}
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float64')
                glove_embeddings[word] = vector
                
        return glove_embeddings

    glove_embeddings = load_glove_embeddings('glove.6B.50d.txt')
    vocab_word_to_index = {word: idx for idx, word in enumerate(vocab)}

    def create_embedding_matrix(word_to_index, glove_embeddings):
        # Initialize embedding matrix with zeros
        # 50d
        embedding_matrix = np.zeros((len(vocab)+2, 50), dtype='float64')
        
        # Loop thru each word in vocab
        for word, idx in word_to_index.items():
            # Check if word exists in glove embeddings
            if word in glove_embeddings:
                # Copy glove embedding to embedding matrix
                embedding_matrix[idx] = glove_embeddings[word]
                # If OOV, assign None first
                
        return embedding_matrix

    embedding_matrix = create_embedding_matrix(vocab_word_to_index, glove_embeddings)
    #handle <unk>
    embedding_matrix[-2]=[ 0.01513297,  0.2400952 , -0.13676383,  0.13166569, -0.28283166,
        0.10421129,  0.39747017,  0.07944959,  0.29670785,  0.05400998,
        0.48425894,  0.26516231, -0.48021244, -0.25129253, -0.24367068,
       -0.24188322,  0.47579495, -0.2097357 , -0.02568224, -0.31143999,
       -0.3196337 ,  0.44878632, -0.07379564,  0.32765833, -0.49052161,
       -0.33455611, -0.34772199, -0.05043562, -0.0898296 ,  0.04898804,
        0.4993778 ,  0.04359836,  0.40077601, -0.31343237,  0.24126281,
       -0.4907152 , -0.20372591, -0.32123346, -0.39554707,  0.37386547,
        0.44720326,  0.45492689, -0.16420979,  0.42844699,  0.15748723,
       -0.23547929, -0.33962153,  0.04243802, -0.03647524, -0.0042893 ]
    
    return vocab_word_to_index,embedding_matrix


In [3]:
import pickle

def prep_embedding(handle_oov=False):
    if handle_oov:
        with open('embedding_matrix.pkl', 'rb') as file:  
            embedding_matrix = pickle.load(file)
            embedding_matrix = np.concatenate((embedding_matrix, np.zeros((1, 50))), axis=0)
        with open('vocab_word_to_index.pkl', 'rb') as file:  
            vocab_word_to_index = pickle.load(file)
            del vocab_word_to_index['<UNK>']
    else:
        vocab_word_to_index,embedding_matrix= prep_pretrained_embedding()
        embedding_matrix[-1]=np.zeros(50)
    # print(embedding_matrix)
    # print(embedding_matrix.shape)
    # print(len(vocab_word_to_index))
    # print(vocab_word_to_index)
    return vocab_word_to_index,embedding_matrix

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,Dataset

device=torch.device('cuda')

class CustomedDataset(Dataset):
    def __init__(self,sentences,labels,vocab_word_to_index):
        self.features=torch.tensor([[vocab_word_to_index[word] if word in vocab_word_to_index else len(vocab_word_to_index) for word in sentence]+[len(vocab_word_to_index)+1]*(max_len-len(sentence)) for sentence in sentences]).to(device)
        self.labels=torch.tensor(labels).to(device)
    
    def __len__(self):
        return self.features.shape[0]
    
    def __getitem__(self,idx):
        return self.features[idx],self.labels[idx]

def prep_dataloader(train_dataset,validation_dataset,test_dataset,batch_size,vocab_word_to_index):
    train_dataloader=DataLoader(CustomedDataset(train_dataset["text"],train_dataset["label"],vocab_word_to_index),batch_size=batch_size,shuffle=True)
    validation_dataloader=DataLoader(CustomedDataset(validation_dataset["text"],validation_dataset["label"],vocab_word_to_index),batch_size=batch_size)
    test_dataloader=DataLoader(CustomedDataset(test_dataset["text"],test_dataset["label"],vocab_word_to_index),batch_size=batch_size)
    return train_dataloader,validation_dataloader,test_dataloader
    

In [5]:


class CNNTextClassifier(nn.Module):
    def __init__(self, embedding_matrix, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        embedding_matrix=torch.tensor(embedding_matrix,dtype=torch.float32)
        # print(embedding_matrix)
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        # self.embedding = embedding_matrix
        # print(self.embedding.shape)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, n_filters, (fs, embedding_matrix.shape[1])) for fs in filter_sizes]
        )
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.softmax=nn.Softmax(-1)

    def forward(self, sentences):
        # text = [batch size, sent len]
        # embedded=[[self.embedding[idx] for idx in sentence] for sentence in sentences]
        embedded = self.embedding(sentences)  # embedded = [batch size, sent len, emb dim]
        embedded = embedded.unsqueeze(1)  # embedded = [batch size, 1, sent len, emb dim]
        # print(embedded)
        # print(embedded.shape)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]  # conv_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]  # pooled_n = [batch size, n_filters]
        cat = self.dropout(torch.cat(pooled, dim=1))  # cat = [batch size, n_filters * len(filter_sizes)]
        
        return self.softmax(self.fc(cat))


In [6]:
class ResidualBlock(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc=nn.Linear(input_dim,output_dim)
        self.relu=nn.ReLU()

    def forward(self, x):
        return self.relu(x+self.fc(x))

class CNNTextResidualClassifier(nn.Module):
    def __init__(self, embedding_matrix, n_filters, filter_sizes, output_dim, dropout,num_hidden=256,res_block_num=3):
        super().__init__()
        embedding_matrix=torch.tensor(embedding_matrix,dtype=torch.float32)
        # print(embedding_matrix)
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        # self.embedding = embedding_matrix
        # print(self.embedding.shape)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, n_filters, (fs, embedding_matrix.shape[1])) for fs in filter_sizes]
        )
        self.fc = nn.Linear(len(filter_sizes) * n_filters, num_hidden)
        self.relu=nn.ReLU()
        self.res_block=nn.Sequential(*[ResidualBlock(num_hidden,num_hidden) for _ in range(res_block_num)])
        self.fc_out=nn.Linear(num_hidden,output_dim)
        self.dropout = nn.Dropout(dropout)
        self.softmax=nn.Softmax(-1)

    def forward(self, sentences):
        # text = [batch size, sent len]
        # embedded=[[self.embedding[idx] for idx in sentence] for sentence in sentences]
        embedded = self.embedding(sentences)  # embedded = [batch size, sent len, emb dim]
        embedded = embedded.unsqueeze(1)  # embedded = [batch size, 1, sent len, emb dim]
        # print(embedded)
        # print(embedded.shape)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]  # conv_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]  # pooled_n = [batch size, n_filters]
        cat = self.dropout(torch.cat(pooled, dim=1))  # cat = [batch size, n_filters * len(filter_sizes)]
        res_block_in=self.relu(self.fc(cat))
        res_block_out=self.res_block(res_block_in)
        return self.softmax(self.fc_out(res_block_out))

In [35]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # 创建一个大的位置编码矩阵
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        
        # 将位置编码作为常量注册到模型中，不需要梯度更新
        self.register_buffer('pe', pe)

    def forward(self, x):
        # print(x.shape)
        # print(self.pe.shape)
        x = x + self.pe[:,:x.shape[1], :].requires_grad_(False)
        return self.dropout(x)

def get_key_padding_mask(tokens,vocab_size):
    key_padding_mask=torch.zeros(tokens.size())
    key_padding_mask[tokens==vocab_size-1]=-torch.inf
    return key_padding_mask.to(device)

class TransformerModel(nn.Module):
    def __init__(self, embedding_matrix, max_len, dropout, num_hidden=64, num_resblock=3, nhead=5, num_encoder_layers=2):
        super(TransformerModel, self).__init__()
        embedding_matrix=torch.tensor(embedding_matrix,dtype=torch.float32)
        d_model=embedding_matrix.shape[1]
        self.embedding_src = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.positional_encoding = PositionalEncoding(d_model,max_len=max_len)  # 最大序列长度为1650
        encoder = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead,dim_feedforward=128)
        self.transformer = nn.TransformerEncoder(encoder,num_encoder_layers)
        self.fc1 = nn.Linear(d_model, num_hidden)
        self.res_blocks1=nn.Sequential(*[ResidualBlock(num_hidden,num_hidden) for _ in range(num_resblock)])
        self.fc2 = nn.Linear(num_hidden*max_len,num_hidden)
        self.res_blocks2=nn.Sequential(*[ResidualBlock(num_hidden,num_hidden) for _ in range(num_resblock)])
        self.fc3 = nn.Linear(num_hidden,2)
        self.softmax=nn.Softmax(-1)
        self.relu=nn.ReLU()
        self.dropout=nn.Dropout(dropout)

    def forward(self, src):
        src_key_padding_mask = get_key_padding_mask(src,self.embedding_src.num_embeddings)
        src_emb = self.positional_encoding(self.embedding_src(src)*torch.sqrt(torch.tensor(self.embedding_src.embedding_dim, dtype=torch.float32)))
        # Transformer forward with attention masks
        output = self.transformer(
            src_emb.permute(1, 0, 2), 
            src_key_padding_mask=src_key_padding_mask,
        ) #[token_num,batch_size,embedding_dim]
        # pooled_output=nn.MaxPool2d(,)
        output=self.dropout(self.relu(self.fc1(output.permute(1,0,2))))
        output=self.res_blocks1(output)
        output=self.dropout(self.relu(self.fc2(output.reshape(output.shape[0],-1))))
        output=self.res_blocks2(output)
        return self.softmax(self.fc3(output))
    

In [8]:


def train(model,optimizer,criterion,num_epoch,train_dataloader,validation_dataloader):
    from tqdm import tqdm
    model.to(device)
    for _ in range(num_epoch):
        acc_loss=0
        model.train()
        process_bar=tqdm(train_dataloader,desc=f"Epoch {_}/{num_epoch}",leave=True)
        for features,labels in process_bar:
            
            pred=model(features)
            # print(pred,pred.shape)
            # print(labels,labels.shape)
            optimizer.zero_grad()
            loss=criterion(pred,labels)
            loss.backward()
            optimizer.step()
            
            acc_loss+=loss.item()
            process_bar.set_postfix_str(f"Mean loss: {acc_loss/(process_bar.n+1)}")
        
        print("Train loss:",acc_loss/process_bar.n)
        
        acc_loss=0
        model.eval()
        with torch.no_grad():
            acc_loss=0
            process_bar=tqdm(validation_dataloader,desc="Validating",leave=True)
            for features,labels in process_bar:
                
                pred=model(features)
                
                loss=criterion(pred,labels)
                
                acc_loss+=loss.item()
                process_bar.set_postfix_str(f"Mean loss: {acc_loss/(process_bar.n+1)}")
                
            print("Validation loss:",acc_loss/process_bar.n)



In [37]:
def work_flow(model_type,handle_oov,params):
    vocab_word_to_index,embedding_matrix=prep_embedding(handle_oov)
    # print(embedding_matrix)
    train_dataloader,validation_dataloader,test_dataloader=prep_dataloader(train_dataset,validation_dataset,test_dataset,params["batch_size"],vocab_word_to_index)

    if model_type=="CNN":
        model = CNNTextClassifier(embedding_matrix, params["n_filters"], params["filter_sizes"], params["output_dim"], params["dropout"])
    if model_type=="CNN_res_block":
        model = CNNTextResidualClassifier(embedding_matrix, params["n_filters"], params["filter_sizes"], params["output_dim"], params["dropout"])
    if model_type=="transformer":
        model = TransformerModel(embedding_matrix,max_len,params["dropout"])

    criterion=nn.CrossEntropyLoss()
    optimizer=torch.optim.Adam(model.parameters(),lr=params["lr"])
    
    train(model,optimizer,criterion,params["num_epoch"],train_dataloader,validation_dataloader)
    
    model.eval()
    test_acc=0
    tot_samples=0
    with torch.no_grad():
        for features,labels in test_dataloader:
            pred_labels=model(features)
            # print(labels.shape,pred_labels.shape)
            test_acc+=(labels==pred_labels.argmax(dim=1)).sum().item()
            tot_samples+=labels.shape[0]
        print(f"Test acc is:{test_acc/tot_samples*100}%")

params={"batch_size":256,"n_filters":32,"filter_sizes":[1,2,3,5],"output_dim":2,"dropout":0.2,"lr":0.0002,"num_epoch":10}
work_flow("transformer",True,params)

Epoch 0/10: 100%|██████████| 34/34 [00:45<00:00,  1.35s/it, Mean loss: 0.6972135673550999]


Train loss: 0.6972135673550999


Validating: 100%|██████████| 5/5 [00:00<00:00,  6.71it/s, Mean loss: 0.691911256313324] 


Validation loss: 0.691911256313324


Epoch 1/10: 100%|██████████| 34/34 [00:48<00:00,  1.43s/it, Mean loss: 0.6935540648067698]


Train loss: 0.6935540648067698


Validating: 100%|██████████| 5/5 [00:01<00:00,  3.96it/s, Mean loss: 0.6913015604019165]


Validation loss: 0.6913015604019165


Epoch 2/10: 100%|██████████| 34/34 [00:45<00:00,  1.33s/it, Mean loss: 0.6931360258775598]


Train loss: 0.6931360258775598


Validating: 100%|██████████| 5/5 [00:00<00:00, 11.13it/s, Mean loss: 0.6892780542373658]


Validation loss: 0.6892780542373658


Epoch 3/10: 100%|██████████| 34/34 [00:27<00:00,  1.23it/s, Mean loss: 0.6908555206130532]


Train loss: 0.6908555206130532


Validating: 100%|██████████| 5/5 [00:00<00:00,  6.41it/s, Mean loss: 0.6819021940231323]


Validation loss: 0.6819021940231323


Epoch 4/10: 100%|██████████| 34/34 [00:38<00:00,  1.14s/it, Mean loss: 0.6854179589187398]


Train loss: 0.6854179589187398


Validating: 100%|██████████| 5/5 [00:00<00:00,  7.21it/s, Mean loss: 0.6750262379646301]


Validation loss: 0.6750262379646301


Epoch 5/10: 100%|██████████| 34/34 [00:34<00:00,  1.02s/it, Mean loss: 0.6845513003713944]


Train loss: 0.6845513003713944


Validating: 100%|██████████| 5/5 [00:00<00:00,  6.38it/s, Mean loss: 0.6825941324234008]


Validation loss: 0.6825941324234008


Epoch 6/10: 100%|██████████| 34/34 [00:41<00:00,  1.21s/it, Mean loss: 0.6761371160254759]


Train loss: 0.6761371160254759


Validating: 100%|██████████| 5/5 [00:00<00:00,  6.95it/s, Mean loss: 0.6957069039344788]


Validation loss: 0.6957069039344788


Epoch 7/10: 100%|██████████| 34/34 [00:47<00:00,  1.39s/it, Mean loss: 0.6745475407908944]


Train loss: 0.6745475407908944


Validating: 100%|██████████| 5/5 [00:00<00:00,  6.44it/s, Mean loss: 0.6814854383468628]


Validation loss: 0.6814854383468628


Epoch 8/10: 100%|██████████| 34/34 [00:34<00:00,  1.01s/it, Mean loss: 0.6640392489293042]


Train loss: 0.6640392489293042


Validating: 100%|██████████| 5/5 [00:00<00:00,  8.93it/s, Mean loss: 0.7158849596977234]


Validation loss: 0.7158849596977234


Epoch 9/10: 100%|██████████| 34/34 [00:26<00:00,  1.28it/s, Mean loss: 0.6571861365262199]


Train loss: 0.6571861365262199


Validating: 100%|██████████| 5/5 [00:00<00:00,  9.02it/s, Mean loss: 0.868243858218193] 


Validation loss: 0.6945950865745545
Test acc is:54.87804878048781%
