In [1]:
import os
import csv

import pandas as pd
from sklearn.utils import shuffle

In [19]:
def flat_accuracy(preds, labels):
    """Count the rows whose highest-scoring class equals the given label.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each
            row is the argmax along axis 1.
        labels: Array of integer labels; it is flattened before comparison.

    Returns:
        The number of matching positions. This is a count, not a ratio, so
        the caller has to divide by the number of examples itself.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.flatten()
    hits = predicted == expected
    return np.sum(hits)

In [2]:
def data_preprocess1(dataset_path,csv_path):
    """Flatten a dialogue-level CSV into one ``(utterance, label)`` row per turn.

    The first row of ``dataset_path`` is skipped as a header. In every other
    row, column 1 holds the dialogue text with utterances joined by
    ``__eou__`` and column 2 holds a label string whose i-th character is
    the label of the i-th utterance. A dialogue is written out only when the
    number of utterances equals the number of label characters; rows with
    fewer than three columns are ignored.

    Args:
        dataset_path: Path of the source CSV (read as UTF-8).
        csv_path: Path of the CSV to create or overwrite. It receives a
            ``Text,Labels`` header followed by the flattened rows.
    """
    # Both files are managed by `with` so the output is flushed and closed
    # even if a row raises part-way through.
    with open(csv_path, 'w', encoding='utf-8', newline='') as fcsv, \
            open(dataset_path, 'r', encoding='utf-8') as f:
        csv_writer = csv.writer(fcsv)
        # Build the header row.
        csv_writer.writerow(["Text","Labels"])
        reader = csv.reader(f)
        next(reader, None)  # skip the header row of the source file
        for line in reader:
            if len(line) < 3:
                # Blank or truncated row: there is no text/label pair to split.
                continue
            utterances = line[1].split("__eou__")
            labels = line[2]
            # Keep only dialogues with exactly one label character per utterance.
            if len(utterances) == len(labels):
                csv_writer.writerows(zip(utterances, labels))
                
            
def data_preprocess2(dataset_path, csv_path, test_csv_path=r"./test_1.csv"):
    """Split dialogues into labelled context turns and an unlabelled last turn.

    The first row of ``dataset_path`` is skipped as a header. In every other
    row, column 1 holds the dialogue text with utterances joined by
    ``__eou__`` and column 2 holds a label string with one character per
    utterance *except the last one*. For each dialogue:

    * every utterance but the last is written with its label to ``csv_path``;
    * the last utterance is written to ``test_csv_path`` with the placeholder
      label ``1``.

    When a row does not have exactly one more utterance than label
    characters, the most recent well-formed dialogue is written again in its
    place, so one held-out row is still emitted per input dialogue. The very
    first data row seeds that fallback even if it is itself malformed; in
    that case its utterances and labels are paired up to the shorter of the
    two. Rows with fewer than three columns are ignored.

    Args:
        dataset_path: Path of the source CSV (read as UTF-8).
        csv_path: Output CSV for the labelled turns (header ``Text,Labels``).
        test_csv_path: Output CSV for the held-out last turns (header
            ``ID,Last Label``). Defaults to the previously hard-coded
            ``./test_1.csv``.
    """
    # All three files are managed by `with` so the outputs are flushed and
    # closed even if a row raises part-way through.
    with open(csv_path, 'w', encoding='utf-8', newline='') as vcsv, \
            open(test_csv_path, 'w', encoding='utf-8', newline='') as tcsv, \
            open(dataset_path, 'r', encoding='utf-8') as f:
        csv_w1 = csv.writer(vcsv)
        csv_w2 = csv.writer(tcsv)
        csv_w1.writerow(["Text","Labels"])
        csv_w2.writerow(["ID","Last Label"])

        reader = csv.reader(f)
        next(reader, None)  # skip the header row of the source file

        # Most recent dialogue that may be replayed for a malformed row.
        # The original tried to pre-load this in a separate pass, but that
        # loop never advanced its counter and so never assigned anything.
        pre_utterances = None
        pre_label = None
        for line in reader:
            if len(line) < 3:
                # Blank or truncated row: there is no text/label pair to split.
                continue
            utterances = line[1].split("__eou__")
            label = line[2]
            if pre_utterances is None or len(label) + 1 == len(utterances):
                pre_utterances, pre_label = utterances, label
            # zip() stops at the shorter sequence, so a malformed seed row
            # cannot index past the end of its label string.
            csv_w1.writerows(zip(pre_utterances[:-1], pre_label))
            csv_w2.writerow([pre_utterances[-1], 1])
                
            

In [3]:
from transformers import BertTokenizer, BertConfig, AdamW, BertForSequenceClassification
from transformers import RobertaTokenizer,  RobertaForSequenceClassification
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import csv
import random
from transformers import get_linear_schedule_with_warmup
import torch
import os
import torch.nn as nn
# Mini-batch size used by the DataLoaders built in step1.
batch_size = 16
# Learning rate handed to the AdamW optimizer in step1.
learning_rate = 2e-5
# Local directory from which step1 loads the BERT tokenizer, config and weights.
bert_path = "./bert-base-chinese"

def set_seed(seed=7):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and current
    CUDA device), exports ``PYTHONHASHSEED`` into ``os.environ`` and turns
    on cuDNN's deterministic mode.

    Args:
        seed: Integer seed applied to all generators (default 7).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

def read_data(path, tokenizers, max_len):
    """Encode a ``text,label`` CSV as fixed-length model inputs.

    The first row of the file is skipped as a header. For every other row,
    column 0 is the text and column 1 is parsed as an integer label. The
    literal ``'[CLS]'`` is prepended to the text, which is then tokenized and
    mapped to ids with ``tokenizers``. Each sequence is right-padded or
    truncated to exactly ``max_len`` entries:

    * real tokens get attention mask 1 and token type 0;
    * padding gets id 0, attention mask 0 and token type 1.

    Two counters are printed once the file has been read; one is bumped just
    before and one just after the length assertion, so they are equal
    whenever the function returns.

    Args:
        path: CSV file to read (UTF-8).
        tokenizers: Object providing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
        max_len: Target sequence length.

    Returns:
        A tuple ``(dataset, count)`` where ``dataset`` is a ``TensorDataset``
        of ``(input_ids, attention_mask, token_type_ids, label_ids)`` long
        tensors and ``count`` is the number of encoded rows.
    """
    all_ids, all_masks, all_types, all_labels = [], [], [], []
    attempted = 0
    accepted = 0
    with open(path, 'r', encoding='utf-8') as handle:
        for row_no, row in enumerate(csv.reader(handle)):
            if row_no == 0:
                # header row
                continue
            label = int(row[1])
            tokens = tokenizers.tokenize('[CLS]' + row[0])
            ids = tokenizers.convert_tokens_to_ids(tokens)
            n_tokens = len(ids)
            types = [0] * n_tokens
            masks = [1] * n_tokens
            pad = max_len - n_tokens
            if pad > 0:
                # Too short: extend on the right.
                types = types + [1] * pad
                masks = masks + [0] * pad
                ids = ids + [0] * pad
            else:
                # Long enough: cut on the right.
                types, masks, ids = types[:max_len], masks[:max_len], ids[:max_len]
            attempted += 1
            assert len(ids) == len(masks) == len(types) == max_len
            accepted += 1
            all_masks.append(masks)
            all_types.append(types)
            all_ids.append(ids)
            all_labels.append(label)
    print(accepted,attempted)
    input_ids = torch.tensor(all_ids, dtype=torch.long)
    attention_mask = torch.tensor(all_masks, dtype=torch.long)
    token_type_ids = torch.tensor(all_types, dtype=torch.long)
    label_ids = torch.tensor(all_labels, dtype=torch.long)
    dataset = TensorDataset(input_ids, attention_mask, token_type_ids, label_ids)
    return dataset, len(input_ids)

In [20]:
def step1(path, epoch, max_len):
    """Fine-tune a BERT classifier and write predictions for the test split.

    Reads ``train.csv``, ``valid.csv`` and ``test_1.csv`` from ``path``,
    fine-tunes a 7-class ``BertForSequenceClassification`` loaded from the
    module-level ``bert_path`` for ``epoch`` epochs, restores the weights of
    the epoch with the highest validation accuracy, predicts a label for
    every test row and writes the result to ``./test_2.csv``.

    Uses the module-level ``batch_size`` and ``learning_rate``.

    Args:
        path: Directory prefix. File names are appended by plain string
            concatenation, so it must end with a path separator.
        epoch: Number of passes over the training set.
        max_len: Sequence length every example is padded/truncated to.
    """
    set_seed(32)
    tokenizers = BertTokenizer.from_pretrained(bert_path, cache_dir=None)
    test_csv = path + 'test_1.csv'
    train_data, _ = read_data(path + 'train.csv', tokenizers, max_len)
    dev_data, dev_len = read_data(path + 'valid.csv', tokenizers, max_len)
    test_data, _ = read_data(test_csv, tokenizers, max_len)
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    dev_dataloader = DataLoader(dev_data, batch_size=batch_size, shuffle=False)
    # Must NOT be shuffled: predictions are matched to test rows by position.
    test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

    config = BertConfig.from_pretrained(bert_path, num_labels=7)
    model = BertForSequenceClassification.from_pretrained(bert_path, config=config)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_dataloader) * epoch
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    best_res = 0.0
    # Snapshot of the best weights. Keeping a reference to `model` would not
    # work: it is the same object that later epochs keep training.
    best_state = None
    for i in range(epoch):
        model.train()
        total_loss = 0.0
        for batch in train_dataloader:
            model.zero_grad()
            input_ids, attention_mask, token_type_ids, label_ids = (t.to(device) for t in batch)
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            labels=label_ids)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        model.eval()
        total_val_loss = 0.0
        total_correct = 0
        with torch.no_grad():
            for batch in dev_dataloader:
                input_ids, attention_mask, token_type_ids, label_ids = (t.to(device) for t in batch)
                outputs = model(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids,
                                labels=label_ids)
                total_val_loss += outputs.loss.item()
                # flat_accuracy returns the number of correct rows in the batch.
                total_correct += flat_accuracy(outputs.logits.cpu().numpy(),
                                               label_ids.cpu().numpy())

        avg_train_loss = total_loss / max(len(train_dataloader), 1)
        avg_val_loss = total_val_loss / max(len(dev_dataloader), 1)
        avg_val_accuracy = total_correct / max(dev_len, 1)
        print(f"epoch {i + 1}/{epoch}: train_loss={avg_train_loss:.4f} "
              f"val_loss={avg_val_loss:.4f} val_acc={avg_val_accuracy:.4f}")

        if best_state is None or avg_val_accuracy > best_res:
            best_res = avg_val_accuracy
            # Copy to CPU so the snapshot does not double GPU memory use.
            best_state = {name: tensor.detach().to('cpu', copy=True)
                          for name, tensor in model.state_dict().items()}

    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()

    # The labels stored in the test file are not fed to the model, so no
    # test loss or accuracy is computed; only the predicted class is kept.
    add_data = []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, token_type_ids = (t.to(device) for t in batch[:3])
            logits = model(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids).logits
            add_data.extend(np.argmax(logits.cpu().numpy(), axis=1).tolist())

    # Attach the predictions to the very file they were computed from so the
    # row counts agree, and overwrite the output instead of appending so that
    # re-running does not stack results from several runs in one file.
    former = pd.read_csv(test_csv)
    former['Last Label'] = add_data
    former.to_csv(r'./test_2.csv', index=True)

In [None]:
# Run configuration for the fine-tuning job.
path = "./"        # directory prefix step1 prepends to train.csv / valid.csv / test_1.csv
max_len = 128      # sequence length passed to step1
epoch = 5          # number of training epochs passed to step1
train_org = "./train_data.csv"
train_up = "./train.csv"
valid_org = "./test_data_new.csv"
valid_up = "./valid.csv"
# One-off preprocessing; enable to regenerate the flattened CSV files.
#data_preprocess1(train_org,train_up)
#data_preprocess2(valid_org,valid_up)
# step1's signature is (path, epoch, max_len); pass by keyword so the two
# integers cannot be swapped.
step1(path, epoch=epoch, max_len=max_len)

36819 36819
3340 3340
1000 1000


Some weights of the model checkpoint at ./bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint 

In [73]:
import pandas as pd
import numpy as np
data = pd.read_csv(r'./train.csv')   # load the CSV into a DataFrame
print(data.columns)  # show the column index
# Set the label of the first row. Writing through .loc updates the frame
# itself instead of going through a Series pulled out of it.
data.loc[0, 'Labels'] = 3
data1 = data['Labels']  # the 'Labels' column, kept under its original name
# Save back to the same file, replacing its contents. Appending here
# (mode='a') would add a second header line and a second full copy of every
# row to the file that was just read.
data.to_csv(r"./train.csv", index=False)
print(data)


Index(['Text', 'Labels'], dtype='object')
                                                 Text Labels
0                                  我就奇怪了  为啥你能拍得这么美呢       3
1                                   因为我做什么都认真，都诚心诚意！       2
2                     好你诚心诚意！我谦虚低调！咱都是优秀品格的人再赞一个  干杯       2
3                                       嗯嗯，咱俩都是最可爱的人！      2
4      是这是人家自己的事就算我能见到她也不会说你们分手吧什么的可是我真心不喜欢冯绍峰这个理由够吗       5
...                                               ...    ...
73634                                         那你想想就好了      6
73635                                      脸好小啊啊啊啊羡慕       2
73636                    很难，我都胖成麻瓜那样了，因为脸不长肉，谁都给我硬塞饭       3
73637                            你这太好看了，我脸也胖身上也胖呜呜呜呜       3
73638            匀称点好！我腰以下疯长肉，腰以皮包骨。难受，我有两张都是老u的衣服哈哈哈      1

[73639 rows x 2 columns]


In [72]:
# Overwrite rather than append: `data` is loaded from this same file, so
# appending would write a second header line and duplicate every row.
data.to_csv(r"./train.csv", index=False)

In [None]:
# NOTE(review): scratch cell with no execution count. In the cells visible
# here, `csv_writer` and `label` exist only as locals inside functions, not
# at notebook scope, and `str` refers to the built-in type rather than a
# text value, so this line cannot run as written.
csv_writer.writerow([str, label])

In [11]:
# Scratch check: what str.split returns, and what indexing a string yields.
a = "aabaaaba"
print(a.split('b'), a[2], sep='\n')

['aa', 'aaa', 'a']
b


In [23]:
# Scratch check: the loop variable still holds its last value after the loop.
for i in range(10):
    pass
x = i
print(x)

9


In [9]:
# Scratch check: number of pieces obtained when splitting `a` on 'b'.
print(len(a.split(sep='b')))

3
