In [None]:
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

device = "cuda" if torch.cuda.is_available() else "cpu"
device

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def read_dataset(path):
    df = pd.read_csv(path)
    df['text'] = df['text'].astype(str)
    df['text'] = df['text'].str.replace("\n", "").str.replace("\r", "")
    df = df.dropna()
    return df

df = read_dataset("weibo_senti_100k.csv")
df.head()

Unnamed: 0,label,text
0,1,﻿更博了，爆照了，帅的呀，就是越来越爱你！生快傻缺[爱你][爱你][爱你]
1,1,@张晓鹏jonathan 土耳其的事要认真对待[哈哈]，否则直接开除。@丁丁看世界 很是细心...
2,1,姑娘都羡慕你呢…还有招财猫高兴……//@爱在蔓延-JC:[哈哈]小学徒一枚，等着明天见您呢/...
3,1,美~~~~~[爱你]
4,1,梦想有多大，舞台就有多大![鼓掌]


In [4]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

In [None]:
from models.bert import BERTSentiment

In [None]:
MAX_LEN = 128
BATCH_SIZE = 4
EPOCHS = 3
LR = 2e-5

class SentimentDataset(Dataset):
    def __init__(self, path, tokenizer):
        df = pd.read_csv(path)
        df['text'] = df['text'].astype(str)
        # df['text'] = df['text'].str.replace("\n", "").str.replace("\r", "")
        df = df.dropna()
        self.data = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = str(self.data.loc[idx, "text"])
        label = int(self.data.loc[idx, "label"])

        encoding = self.tokenizer(
            text,
            max_length=MAX_LEN,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label)
        }

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
train_dataset = SentimentDataset("weibo_senti_100k.csv", tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

model = BERTSentiment(num_labels=3).to(device)
optimizer = AdamW(model.parameters(), lr=LR)

from train import train

if __name__ == "__main__":
    train(model, train_loader, optimizer, device, EPOCHS)

Epoch 1:   2%|▏         | 180/7500 [23:59<16:15:45,  8.00s/it, loss=0.0344] 


KeyboardInterrupt: 