In [None]:
import random
import gc

from tqdm import tqdm
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
!pip install transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AdamW

gc.collect()

In [None]:
# Environment report: interpreter, PyTorch and CUDA compiler versions.
# NOTE(review): `!python` is resolved by the shell, so it may not be the same
# interpreter as the running kernel -- confirm if the exact version matters.
!python --version
print(" ")
# Version of the PyTorch build imported by this kernel.
print(torch.__version__)
print(" ")
# Version of the CUDA compiler found on the shell PATH.
!nvcc --version

In [None]:
# Seed every random number generator used in this notebook (Python's `random`,
# PyTorch and NumPy) with the same value so runs are repeatable.
SEED = 42
for seed_fn in (random.seed, torch.manual_seed, np.random.seed):
    seed_fn(SEED)

# Prepare data

In [None]:
# Make sure the `unzip` tool is installed, then extract test.tsv and train.tsv
# from the zipped archives under ../input/ into the current working directory.
# NOTE(review): the later loading cells read the .tsv files from
# /content/drive/MyDrive/..., not from the files extracted here -- confirm
# whether this cell is still needed in the current environment.
!apt-get install unzip
!unzip ../input/sentiment-analysis-on-movie-reviews/test.tsv.zip test.tsv
!unzip ../input/sentiment-analysis-on-movie-reviews/train.tsv.zip train.tsv

In [None]:
# Mount Google Drive at /content/drive so the dataset files stored there can be
# read by the loading cells. `google.colab` is only importable on Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
sample_submission = pd.read_csv('/content/drive/MyDrive/sentiment-analysis-on-movie-reviews/sampleSubmission.csv')

# Training data is tab-separated.
train_df = pd.read_csv('/content/drive/MyDrive/sentiment-analysis-on-movie-reviews/train.tsv', sep='\t')
print(train_df.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
train_df.info()
train_df.head()

In [None]:
# Test data is tab-separated, like the training file.
test_df = pd.read_csv('/content/drive/MyDrive/sentiment-analysis-on-movie-reviews/test.tsv', sep='\t')
print(test_df.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
test_df.info()
test_df.head()

# Text Processing

In [None]:
# Load the tokenizer of the 'bert-base-uncased' checkpoint. It is kept as a
# module-level name because MovieReviewsDataset uses `tokenizer` to encode text.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
class MovieReviewsDataset(Dataset):
    """Dataset of tokenized phrases, optionally paired with sentiment labels.

    All phrases are encoded once, at construction time, padded or truncated to
    ``max_len`` tokens.

    Args:
        df: DataFrame with a 'Phrase' column and, unless ``test_only`` is set,
            a 'Sentiment' column.
        max_len: Length every encoded sequence is padded/truncated to.
        test_only: When True no labels are read and items carry no label.
        text_tokenizer: Object exposing ``batch_encode_plus``. When omitted,
            the module-level ``tokenizer`` is used, which is what this class
            always did before the parameter existed.
    """

    def __init__(self, df, max_len, test_only=False, text_tokenizer=None):
        self.max_len = max_len
        self.test_only = test_only
        self.text = df['Phrase'].tolist()
        if not self.test_only:
            self.sentiments = df['Sentiment'].values

        # Prefer an injected tokenizer so the class does not depend on hidden
        # notebook state; fall back to the global one for existing callers.
        if text_tokenizer is None:
            text_tokenizer = tokenizer

        self.encode = text_tokenizer.batch_encode_plus(
            self.text,
            padding='max_length',
            max_length=self.max_len,
            truncation=True,
            return_attention_mask=True
        )

    def __getitem__(self, i):
        """Return ``(input_ids, attention_mask)`` plus the label unless test_only."""
        input_ids = torch.tensor(self.encode['input_ids'][i])
        attention_mask = torch.tensor(self.encode['attention_mask'][i])

        if self.test_only:
            return (input_ids, attention_mask)
        else:
            sentiments = self.sentiments[i]
            return (input_ids, attention_mask, sentiments)

    def __len__(self):
        """Number of phrases in the dataset."""
        return len(self.text)


In [None]:
max_len = 64
train_dataset = MovieReviewsDataset(train_df, max_len)
test_dataset = MovieReviewsDataset(test_df, max_len, test_only=True)

# 80/20 train/validation split. The validation size is taken as the remainder:
# truncating both 0.8*n and 0.2*n independently can sum to less than n, in
# which case random_split raises a ValueError.
n_train = int(len(train_dataset) * 0.8)
lengths = [n_train, len(train_dataset) - n_train]
train_dataset, valid_dataset = random_split(train_dataset, lengths=lengths, generator=torch.Generator().manual_seed(42))

# Only the training loader is shuffled; validation and test keep dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_dataloader = DataLoader(valid_dataset, batch_size=128)
test_dataloader = DataLoader(test_dataset, batch_size=128)

# Modeling

In [None]:
class Model(nn.Module):
    """Pretrained encoder followed by a single linear classification layer.

    Args:
        model_name: Hugging Face checkpoint to load for both the config and the
            weights. Defaults to 'bert-base-uncased'.
        num_classes: Number of logits produced by the classifier. Defaults to 5.
    """

    def __init__(self, model_name='bert-base-uncased', num_classes=5):
        super().__init__()
        # Load the pretrained encoder through the Hugging Face API.
        bert_base_config = AutoConfig.from_pretrained(model_name)
        self.bert_base = AutoModel.from_pretrained(model_name, config=bert_base_config)
        # The head maps the encoder's hidden size to one logit per class.
        self.classifier = nn.Linear(bert_base_config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classifier output for a batch of encoded sequences."""
        bert_base_output = self.bert_base(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output is the pooled representation.
        pooler_output = bert_base_output[1]  # [batch_size, hidden]
        out = self.classifier(pooler_output)
        return out

model = Model()
model.to(device)
# Use PyTorch's AdamW instead of the AdamW export from transformers, which is
# missing from recent transformers releases. eps and weight_decay are set
# explicitly to the defaults of the transformers implementation so the
# optimizer behaves as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
criteron = nn.CrossEntropyLoss()  # loss function

In [None]:

# Run a garbage-collection pass before the training cell below.
gc.collect()

In [None]:
# Model training
total_loss = []     # mean training loss of each epoch
total_val_acc = []  # validation accuracy of each epoch
for epoch in range(100):  # 100 epochs
    model.train()
    epoch_loss = []
    for input_ids, attention_mask, target in tqdm(train_dataloader):
        # Move the batch to the device the model lives on.
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        target = target.to(device)

        optimizer.zero_grad()  # clear gradients left over from the previous batch

        y_pred = model(input_ids, attention_mask)

        loss = criteron(y_pred, target)  # compute the loss
        loss.backward()
        optimizer.step()

        epoch_loss.append(loss.item())

    gc.collect()

    # Validation: no gradients are needed, so disable autograd to avoid
    # building graphs and holding activations in memory.
    model.eval()
    correct = 0
    seen = 0
    with torch.no_grad():
        for input_ids, attention_mask, target in tqdm(val_dataloader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            target = target.to(device)
            y_pred = model(input_ids, attention_mask).argmax(dim=-1)
            # Count hits over the whole split; averaging per-batch accuracies
            # would over-weight a smaller final batch.
            correct += (y_pred == target).sum().item()
            seen += target.size(0)

    el = sum(epoch_loss)/len(epoch_loss)
    total_loss.append(el)
    acc = correct / seen if seen else float('nan')
    total_val_acc.append(acc)
    print("Epoch:", epoch+1, "-- loss:", el, "-- acc:", acc)