In [11]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from torch.utils.data import DataLoader, Dataset
import torch


# Relative path of the pretrained checkpoint; the tokenizer and the model are
# both loaded from this same location.
model_name = './bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(model_name)
# num_labels=2 asks for a two-class classification head on top of the encoder.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of the model checkpoint at ./bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint 

In [12]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable, sized sequence of raw strings.
        labels: Indexable, sized sequence of class ids aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')`` whose result
            exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
            Defaults to 128, the value that used to be hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                'texts and labels must have the same length, got {} and {}'.format(
                    len(texts), len(labels)
                )
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` (both flattened
            to 1-D) and ``'labels'`` (a 0-D ``torch.long`` tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {
            # Flatten to 1-D so per-sample tensors can be stacked into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Force int64: classification losses expect long targets, and the
            # dtype inferred from the raw label (float, int32, bool) may differ.
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }


In [13]:
import pandas as pd

# Load the two-class Weibo sentiment corpus; `moods` maps each label id to the
# name printed in the per-class counts below.
pd_all = pd.read_csv('weibo_senti_100k.csv')
moods = {0: '负向', 1: '正向'}

# NOTE: a four-class alternative ('simplifyweibo_4_moods.csv') was tried here
# earlier; switching to it also requires a four-entry `moods` mapping.

print('微博数目（总体）：%d' % pd_all.shape[0])

for label, mood in moods.items():
    print('微博数目（{}): {}'.format(mood,  pd_all[pd_all.label==label].shape[0]))

# Seed the subsample so the training subset is identical on every run.
s = pd_all.sample(1000, random_state=42)
# Positional selection, as before: column 0 holds the label, column 1 the text.
labels = s.iloc[:, 0].tolist()
texts = s.iloc[:, 1].tolist()

dataset = CustomDataset(texts, labels, tokenizer)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)


微博数目（总体）：119988
微博数目（负向): 59995
微博数目（正向): 59993


In [14]:
# Fine-tuning a pretrained encoder needs a small step size. The previous 1e-3
# was 100x larger than the 1e-5 default that train() in this notebook uses for
# the same task; steps that large tend to wipe out the pretrained weights.
optimizer = AdamW(model.parameters(), lr=1e-5)
# Not used by the training loop in this notebook (it reads outputs.loss from
# the model); kept so the name stays available.
loss_fn = torch.nn.CrossEntropyLoss()


In [15]:
num_epochs = 5  # number of passes over the training data
for epoch in range(num_epochs):
    # from_pretrained() hands the model back in evaluation mode, so dropout
    # stays off unless training mode is enabled explicitly.
    model.train()
    for batch in dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        # Batch-local name: reusing `labels` here would overwrite the
        # module-level list of labels built in the data-loading cell.
        batch_labels = batch['labels']

        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(epoch)

    # Evaluation on a validation set can be added here, at the end of each epoch.


0
1
2
3


KeyboardInterrupt: 

In [None]:
# Write the current weights to disk, then read them straight back into the
# same model object.
checkpoint_path = 'bert_sentiment_model.pth'
torch.save(model.state_dict(), checkpoint_path)
# Load the model
restored_state = torch.load(checkpoint_path)
model.load_state_dict(restored_state)


<All keys matched successfully>

In [11]:
text_to_predict = "很好"
encoding = tokenizer(text_to_predict, return_tensors='pt', padding=True, truncation=True)
# Inference only: eval mode turns dropout off so the prediction is repeatable,
# and no_grad avoids building an autograd graph that is never used.
model.eval()
with torch.no_grad():
    output = model(**encoding)
# Index of the highest-scoring class for the single input.
predicted_label = torch.argmax(output.logits, dim=1).item()
print(predicted_label)


1


In [16]:
import torch

# Two example PyTorch tensors
tensor_a, tensor_b = torch.tensor([1, 2, 3]), torch.tensor([4, 5, 6])

# Turn each tensor into a plain Python list
list_a, list_b = (t.tolist() for t in (tensor_a, tensor_b))

# Join the two lists end to end
combined_list = [*list_a, *list_b]

print(combined_list)


[1, 2, 3, 4, 5, 6]


In [5]:
import os

import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AdamW
from transformers import BertTokenizer, BertForSequenceClassification


# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model_name = './bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(model_name)

# './bert-param' is only written by the save_pretrained() cell at the end of
# this notebook, so it is missing on a first run. Start from the base
# checkpoint with a fresh two-class head in that case.
finetuned_dir = './bert-param'
if os.path.isdir(finetuned_dir):
    model = BertForSequenceClassification.from_pretrained(finetuned_dir)
else:
    print('{} not found, starting from {}'.format(finetuned_dir, model_name))
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
# train() and test() move every batch to `device`; the model must live there too.
model = model.to(device)


class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable, sized sequence of raw strings.
        labels: Indexable, sized sequence of class ids aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')`` whose result
            exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
            Defaults to 128, the value that used to be hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                'texts and labels must have the same length, got {} and {}'.format(
                    len(texts), len(labels)
                )
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` (both flattened
            to 1-D) and ``'labels'`` (a 0-D ``torch.long`` tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {
            # Flatten to 1-D so per-sample tensors can be stacked into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Force int64: classification losses expect long targets, and the
            # dtype inferred from the raw label (float, int32, bool) may differ.
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }
    
    
def test(model, testloader):
    """Evaluate ``model`` on ``testloader`` and print accuracy plus a per-class report.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``.logits``; must be a ``torch.nn.Module``.
        testloader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        None. The accuracy and the classification report are printed.
    """
    # Remember the caller's mode: train() calls this between epochs and needs
    # the model handed back in whatever mode it was in.
    was_training = model.training
    model.eval()
    y = []
    y_pred = []
    try:
        # No gradients are needed for scoring, so skip building the autograd graph.
        with torch.no_grad():
            for batch in tqdm(testloader):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                # Labels are only needed for the metrics below, so they are not
                # sent through the model (that would compute an unused loss).
                output = model(input_ids, attention_mask=attention_mask)

                y_pred += torch.argmax(output.logits, dim=1).tolist()
                y += batch['labels'].tolist()
    finally:
        model.train(was_training)

    accuracy = accuracy_score(y, y_pred)
    print(f"Accuracy: {accuracy}")
    print(classification_report(y, y_pred))

        
        
    

def train(model, dataloader, testloader, num_epochs=10, lr=1e-5,
          checkpoint_path='bert_sentiment_model.pth'):
    """Fine-tune ``model`` and evaluate/checkpoint it after every epoch.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes ``.loss``.
        dataloader: Iterable of training batches, each a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        testloader: Batches passed to ``test()`` at the end of each epoch.
        num_epochs: Number of passes over ``dataloader``.
        lr: Learning rate handed to AdamW.
        checkpoint_path: File the state dict is written to after each epoch.
            Defaults to the path that used to be hard-coded.

    Returns:
        None.
    """
    optimizer = AdamW(model.parameters(), lr=lr)
    for epoch in range(num_epochs):
        # test() switches the model to eval mode, so training mode has to be
        # re-enabled at the start of every epoch. Setting it once before the
        # loop would not guarantee dropout is active after the first evaluation.
        model.train()
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Passing labels makes the model compute the loss itself.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print('epoch {}'.format(epoch+1))
        test(model, testloader)
        # Checkpoint every epoch so an interrupted run keeps its latest weights.
        torch.save(model.state_dict(), checkpoint_path)
            
        

    


import pandas as pd

# Load the two-class Weibo sentiment corpus; `moods` maps each label id to the
# name printed in the per-class counts below.
pd_all = pd.read_csv('./weibo_senti_100k.csv')
moods = {0: '负向', 1: '正向'}

# NOTE: a four-class alternative ('nlp/simplifyweibo_4_moods.csv') was tried
# here earlier; switching to it also requires a four-entry `moods` mapping.

print('微博数目（总体）：%d' % pd_all.shape[0])

for label, mood in moods.items():
    print('微博数目（{}): {}'.format(mood,  pd_all[pd_all.label==label].shape[0]))

# Seed the subsample as well: the seeded train_test_split below only gives a
# repeatable split if it receives the same rows on every run.
s = pd_all.sample(10000, random_state=42)
# Positional selection, as before: column 0 holds the label, column 1 the text.
labels = s.iloc[:, 0].tolist()
texts = s.iloc[:, 1].tolist()

X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

dataset = CustomDataset(X_train, y_train, tokenizer)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

test_dataset = CustomDataset(X_test, y_test, tokenizer)
# The metrics do not depend on batch order, so the test set is not shuffled.
testloader = DataLoader(test_dataset, batch_size=128, shuffle=False)

train(model, dataloader, testloader)




微博数目（总体）：119988
微博数目（负向): 59995
微博数目（正向): 59993


100%|██████████| 63/63 [25:01<00:00, 23.83s/it]


epoch 1


100%|██████████| 16/16 [02:09<00:00,  8.11s/it]


Accuracy: 0.974
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      1002
           1       0.98      0.97      0.97       998

    accuracy                           0.97      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.97      0.97      0.97      2000



  0%|          | 0/63 [00:06<?, ?it/s]


KeyboardInterrupt: 

In [6]:
# Save the model under ./bert-param, the same path the model-loading cell
# passes to BertForSequenceClassification.from_pretrained().
model.save_pretrained("./bert-param")