In [82]:
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms,datasets
import os,PIL,pathlib,warnings
warnings.filterwarnings("ignore")

In [83]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [84]:
import pandas as pd
train_data = pd.read_csv('train.csv',sep='\t',header=None)
train_data.head()

Unnamed: 0,0,1
0,还有双鸭山到淮阴的汽车票吗13号的,Travel-Query
1,从这里怎么回家,Travel-Query
2,随便播放一首专辑阁楼里的佛里的歌,Music-Play
3,给看一下墓王之王嘛,FilmTele-Play
4,我想看挑战两把s686打突变团竞的游戏视频,Video-Play


In [85]:
# 构造数据迭代器

In [86]:
def custom_data_iter(texts,labels):
    for x,y in zip(texts,labels):
        yield x,y
x = train_data.iloc[:,0]
y = train_data.iloc[:,1]

In [87]:
from gensim.models.word2vec import Word2Vec
import numpy as np 

In [88]:
w2v = Word2Vec(vector_size=128,min_count=3)
w2v.build_vocab(x)
w2v.train(x,total_examples=w2v.corpus_count,epochs=20)

(2733792, 3663560)

In [89]:
from torch.utils import data

In [90]:
def average_vec(text):
    vec = np.zeros(128).reshape((1,128))
    for word in text:
        try:
            vec += w2v.wv[word].reshape((1,128))
        except KeyError:
            continue
    return vec

In [91]:
x_vec = np.concatenate([average_vec(z) for z in x])
w2v.save('w2v_model.pkl')

In [92]:
train_iter = custom_data_iter(x_vec,y)

In [93]:
len(x),len(x_vec)

(12100, 12100)

In [94]:
label_name = list(set(train_data.iloc[:,1]))

In [95]:
label_name

['Travel-Query',
 'Radio-Listen',
 'Audio-Play',
 'HomeAppliance-Control',
 'Music-Play',
 'Video-Play',
 'Calendar-Query',
 'FilmTele-Play',
 'Weather-Query',
 'TVProgram-Play',
 'Other',
 'Alarm-Update']

In [96]:
text_pipline = lambda x:average_vec(x)
label_pipline = lambda x:label_name.index(x)

In [97]:
text_pipline("全民制作人大家好")

array([[-9.01413970e-02, -1.64767406e+00,  8.76959436e-01,
         5.05374588e-01,  2.90221609e+00,  7.03509612e+00,
         2.07671969e+00,  5.33390149e-01, -5.75281238e+00,
         2.72822993e+00, -1.21613883e+00,  2.40359181e+00,
         1.24026239e+00, -2.98222235e+00, -4.15345851e+00,
         4.80561630e+00, -4.62051204e+00, -8.30729213e-02,
         4.58210759e-01,  1.42162828e+00, -1.16396460e+00,
        -3.75022659e+00,  4.05132301e-01, -8.35293084e-02,
        -1.99540162e+00, -1.08039442e-01,  2.51429046e+00,
         1.07862586e+00, -1.27408116e+00,  5.88607416e-01,
         4.01399051e+00,  2.36083227e+00, -4.06493633e+00,
         1.19494113e+00,  1.57056513e+00,  3.98427185e+00,
        -8.12975377e-01,  4.63803515e+00, -3.09757201e+00,
        -3.16654840e+00,  1.68812885e+00, -1.63820309e+00,
        -2.55383961e-02, -2.27370448e-02,  3.69118452e+00,
         2.23280789e+00,  9.95020710e-01,  7.63795832e+00,
        -2.81443581e-01, -2.81667975e+00, -2.48193783e+0

In [98]:
label_pipline("Travel-Query")

0

In [99]:
y2label = [label_pipline(_label) for _label in y]

In [100]:
from torch.utils import data

In [101]:
class Dataset(data.Dataset):
    def __init__(self,x_vec,y2label):
        super().__init__()
        self.x = x_vec
        self.y = y2label
    def __len__(self):
        return len(self.x)
    def __getitem__(self,index):
        return_x = torch.tensor(self.x[index],dtype=torch.float32)
        return_y = torch.tensor(self.y[index],dtype=torch.int64)
        return return_x.to(device),return_y.to(device)

In [102]:
from torch import nn

class TextClassficationModel(nn.Module):
    
    def __init__(self,num_class):
        super().__init__()
        self.fc = nn.Linear(128,num_class)
    
    def forward(self,text):
        return self.fc(text)

In [103]:
num_class = len(label_name)
vocab_size = 1e5
model = TextClassficationModel(num_class).to(device)

In [104]:
import time
def train(dataloader):
    model.train() # 切换为训练模式
    total_acc,train_loss, total_count = 0, 0, 0
    log_interval = 50
    start_time = time.time()
    for idx, (text,label) in enumerate(dataloader):
        predicted_label = model(text)

        optimizer.zero_grad() # grad属性归零
        loss = criterion(predicted_label, label) # 计算网络输出和真实值之间的差距
        loss.backward() # 反向传播
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) # 梯度裁剪
        optimizer.step() # 每一步自动更新

        # 记录acc与loss
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        train_loss += loss.item()
        total_count += label.size(0)

        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            #print('| epoch {:1d} | {:4f}/{:4f} batches ''| train_acc {:4.3f} train_loss {:4.5f}'.format(epoch, idx,total_acc/total_count, train_loss/total_count))
            total_acc, train_loss, total_count = 0, 0, 0
            start_time = time.time()
def evaluate(dataloader):
    model.eval() # 切换为测试模式
    total_acc,train_loss, total_count = 0, 0, 0
    with torch.no_grad():
        for idx, (text,label) in enumerate(dataloader):
            predicted_label = model(text)

            loss = criterion(predicted_label, label) # 计算loss值
            # 记录测试数据
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            train_loss += loss.item()
            total_count += label.size(0)

    return total_acc/total_count, train_loss/total_count

In [109]:
from torch.utils.data.dataset import random_split
# 超参数
EPOCHS = 10 # epoch
LR = 5 # 学习率
BATCH_SIZE = 64 # batch size for training
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

train_dataset = Dataset(x_vec,y2label)
spilt_train,split_valid =  random_split(train_dataset,[int(len(train_dataset)*0.8),len(train_dataset)-int(len(train_dataset)*0.8)])
train_data_loader = data.DataLoader(spilt_train, batch_size=10, shuffle=True)
valid_data_loader = data.DataLoader(split_valid, batch_size=10, shuffle=True)

In [106]:
# 训练循环
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)  # 训练集的大小
    num_batches = len(dataloader)   # 批次数目, (size/batch_size，向上取整)

    train_loss, train_acc = 0, 0  # 初始化训练损失和正确率
    
    for X, y in dataloader:
        X, y = X.to(device), y.to(device)
        
        # 计算预测误差
        pred = model(X)          # 网络输出
        loss = loss_fn(pred, y)  # 计算网络输出和真实值之间的差距，targets为真实值，计算二者差值即为损失
        
        # 反向传播
        optimizer.zero_grad()  # grad属性归零
        loss.backward()        # 反向传播
        optimizer.step()       # 每一步自动更新
        
        # 记录acc与loss
        train_acc  += (pred.argmax(1) == y).type(torch.float).sum().item()
        train_loss += loss.item()
            
    train_acc  /= size
    train_loss /= num_batches

    return train_acc, train_loss

In [110]:
def test (dataloader, model, loss_fn):
    size        = len(dataloader.dataset)  # 测试集的大小
    num_batches = len(dataloader)          # 批次数目, (size/batch_size，向上取整)
    test_loss, test_acc = 0, 0
    
    # 当不进行训练时，停止梯度更新，节省计算内存消耗
    with torch.no_grad():
        for sent, target in dataloader:
            sent, target = sent.to(device), target.to(device)
            
            # 计算loss
            target_pred = model(sent)
            loss        = loss_fn(target_pred, target)
            
            test_loss += loss.item()
            test_acc  += (target_pred.argmax(1) == target).type(torch.float).sum().item()

    test_acc  /= size
    test_loss /= num_batches

    return test_acc, test_loss

In [111]:
import copy

optimizer  = torch.optim.Adam(model.parameters(), lr= 1e-4)
loss_fn    = nn.CrossEntropyLoss() # 创建损失函数

epochs     = 10

train_loss = []
train_acc  = []
test_loss  = []
test_acc   = []

best_acc = 0    # 设置一个最佳准确率，作为最佳模型的判别指标

for epoch in range(epochs):
    
    model.train()
    epoch_train_acc, epoch_train_loss = train(train_data_loader, model, loss_fn, optimizer)
    
    model.eval()
    epoch_test_acc, epoch_test_loss = test(valid_data_loader, model, loss_fn)
    
    # 保存最佳模型到 best_model
    if epoch_test_acc > best_acc:
        best_acc   = epoch_test_acc
        best_model = copy.deepcopy(model)
    
    train_acc.append(epoch_train_acc)
    train_loss.append(epoch_train_loss)
    test_acc.append(epoch_test_acc)
    test_loss.append(epoch_test_loss)
    
    # 获取当前的学习率
    lr = optimizer.state_dict()['param_groups'][0]['lr']
    
    template = ('Epoch:{:2d}, Train_acc:{:.1f}%, Train_loss:{:.3f}, Test_acc:{:.1f}%, Test_loss:{:.3f}, Lr:{:.2E}')
    print(template.format(epoch+1, epoch_train_acc*100, epoch_train_loss, 
                          epoch_test_acc*100, epoch_test_loss, lr))
    
# 保存最佳模型到文件中
# PATH = './best_model.pth'  # 保存的参数文件名
# torch.save(model.state_dict(), PATH)

print('Done')

Epoch: 1, Train_acc:89.6%, Train_loss:0.883, Test_acc:88.8%, Test_loss:0.931, Lr:1.00E-04
Epoch: 2, Train_acc:89.7%, Train_loss:0.851, Test_acc:88.9%, Test_loss:0.910, Lr:1.00E-04
Epoch: 3, Train_acc:89.7%, Train_loss:0.822, Test_acc:88.8%, Test_loss:0.897, Lr:1.00E-04
Epoch: 4, Train_acc:89.8%, Train_loss:0.796, Test_acc:88.8%, Test_loss:0.883, Lr:1.00E-04
Epoch: 5, Train_acc:89.9%, Train_loss:0.770, Test_acc:88.8%, Test_loss:0.870, Lr:1.00E-04
Epoch: 6, Train_acc:90.0%, Train_loss:0.747, Test_acc:88.8%, Test_loss:0.848, Lr:1.00E-04
Epoch: 7, Train_acc:90.0%, Train_loss:0.726, Test_acc:88.8%, Test_loss:0.843, Lr:1.00E-04
Epoch: 8, Train_acc:90.1%, Train_loss:0.707, Test_acc:88.9%, Test_loss:0.827, Lr:1.00E-04
Epoch: 9, Train_acc:90.2%, Train_loss:0.688, Test_acc:88.8%, Test_loss:0.815, Lr:1.00E-04
Epoch:10, Train_acc:90.2%, Train_loss:0.671, Test_acc:88.8%, Test_loss:0.813, Lr:1.00E-04
Done
