In [1]:
import pandas as pd

#### 数据预处理

In [2]:
# 将str型类别转变为int型类别
mapping = {'Business':0, 'Sci/Tech':1, 'World':2, 'Sports':3}

data_label = pd.read_table('data/train_labels.txt', sep='\t')
data_label.columns = ['label']
data_label['label'] = data_label['label'].map(mapping)

data_label.to_csv("data/train_labels2.csv")

data_label['label'].value_counts()

3    30000
2    30000
1    30000
0    29999
Name: label, dtype: int64

In [3]:
data_text = pd.read_table("data/train_texts.txt", sep="\t")
data_text.columns = ['text']
data_text.head()

Unnamed: 0,text
0,Carlyle Looks Toward Commercial Aerospace (Reu...
1,Oil and Economy Cloud Stocks' Outlook (Reuters...
2,Iraq Halts Oil Exports from Main Southern Pipe...
3,Oil prices soar to all-time record posing new...
4,Stocks End Up But Near Year Lows (Reuters) Re...


In [4]:
#将文字特征与类别标签合并
data_train = pd.concat([data_text, data_label], axis=1, ignore_index=True)
data_train.columns = ['text','label']
data_train.head()

Unnamed: 0,text,label
0,Carlyle Looks Toward Commercial Aerospace (Reu...,0
1,Oil and Economy Cloud Stocks' Outlook (Reuters...,0
2,Iraq Halts Oil Exports from Main Southern Pipe...,0
3,Oil prices soar to all-time record posing new...,0
4,Stocks End Up But Near Year Lows (Reuters) Re...,0


In [5]:
data_train.to_csv("data/data_train.csv")

In [6]:
# 相同方法处理test数据集
data_label_test = pd.read_table("data/test_labels.txt", sep="\t")
data_label_test.columns = ['label']
data_label_test['label'] = data_label_test['label'].map(mapping)

In [7]:
data_text = pd.read_table("data/test_texts.txt", sep="\t")
data_text.columns = ['text']
data_text.head()

Unnamed: 0,text
0,The Race is On: Second Private Team Sets Launc...
1,Ky. Company Wins Grant to Study Peptides (AP) ...
2,Prediction Unit Helps Forecast Wildfires (AP) ...
3,Calif. Aims to Limit Farm-Related Smog (AP) AP...
4,Open Letter Against British Copyright Indoctri...


In [8]:
data_test = pd.concat([data_text, data_label_test], axis=1, ignore_index=True)
data_test.columns = ['text','label']
data_test.head()

Unnamed: 0,text,label
0,The Race is On: Second Private Team Sets Launc...,1
1,Ky. Company Wins Grant to Study Peptides (AP) ...,1
2,Prediction Unit Helps Forecast Wildfires (AP) ...,1
3,Calif. Aims to Limit Farm-Related Smog (AP) AP...,1
4,Open Letter Against British Copyright Indoctri...,1


In [9]:
data_test.to_csv("data/data_test.csv")

#### 用fastnlp进一步处理数据

In [10]:
from fastNLP import DataSet, Instance, Vocabulary

In [11]:
data_train = pd.read_csv("data/data_train.csv")
data_test = pd.read_csv("data/data_test.csv")

##### 将dataframe转为dataset格式

In [12]:
dataset_train = DataSet(data_train.to_dict(orient='list'))
dataset_test = DataSet(data_test.to_dict(orient='list'))

In [13]:
# 将文本的大写转为小写，确保标签为int格式
dataset_train.apply(lambda x: x['text'].lower(), new_field_name='text')
dataset_train.apply(lambda x: int(x['label']), new_field_name='label')

dataset_test.apply(lambda x: x['text'].lower(), new_field_name='text')
dataset_test.apply(lambda x: int(x['label']), new_field_name='label')

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 0,
 2,
 1,
 3,
 3,
 2,
 1,
 2,
 3,
 2,
 3,
 2,
 1,
 0,
 1,
 2,
 2,
 0,
 0,
 3,
 3,
 3,
 1,
 2,
 1,
 2,
 2,
 3,
 2,
 1,
 1,
 1,
 2,
 1,
 3,
 2,
 3,
 2,
 2,
 2,
 3,
 0,
 0,
 2,
 2,
 0,
 2,
 2,
 1,
 2,
 0,
 1,
 0,
 3,
 3,
 3,
 0,
 2,
 0,
 3,
 0,
 1,
 0,
 2,
 3,
 2,
 3,
 2,
 1,
 0,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 0,
 3,
 2,
 3,
 1,
 2,
 1,
 0,
 3,
 3,
 3,
 2,
 2,
 2,
 3,
 2,
 3,
 3,
 2,
 3,
 3,
 2,
 3,
 2,
 3,
 2,
 2,
 2,
 3,
 0,
 3,
 3,
 3,
 2,
 3,
 2,
 2,
 2,
 0,
 3,
 3,
 2,
 3,
 0,
 1,
 2,
 2,
 0,
 0,
 0,
 3,
 2,
 1,
 1,
 0,
 1,
 2,
 2,
 1,
 3,
 1,
 3,
 0,
 3,
 3,
 1,
 0,
 2,
 1,
 2,
 3,
 1,
 1,
 2,
 2,
 2,
 0,
 0,
 2,
 3,
 0,
 3,
 1,
 1,
 1,
 2,
 3,
 3,
 3,
 1,
 3,
 1,
 3,
 2,
 3,
 3,
 3,
 1,
 0,
 0,
 1,
 3,
 3,
 2,
 0,
 3,
 0,
 0,
 2,
 3,
 0,
 2,
 3,
 3,
 0,
 1,
 0,
 3,
 0,
 3,
 2,
 0,
 1,
 3,
 3,
 1,
 0,
 0,
 1,
 1,
 1,
 2,
 0,


In [14]:
# 将text分词
def cut_word(instance):
    return instance['text'].split()

dataset_train.apply(cut_word, new_field_name='splited_words')
dataset_test.apply(cut_word, new_field_name='splited_words')

[['the',
  'race',
  'is',
  'on:',
  'second',
  'private',
  'team',
  'sets',
  'launch',
  'date',
  'for',
  'human',
  'spaceflight',
  '(space.com)',
  'space.com',
  '-',
  'toronto',
  'canada',
  '--',
  'a',
  'second\\team',
  'of',
  'rocketeers',
  'competing',
  'for',
  'the',
  '#36;10',
  'million',
  'ansari',
  'x',
  'prize',
  'a',
  'contest',
  'for\\privately',
  'funded',
  'suborbital',
  'space',
  'flight',
  'has',
  'officially',
  'announced',
  'the',
  'first\\launch',
  'date',
  'for',
  'its',
  'manned',
  'rocket.'],
 ['ky.',
  'company',
  'wins',
  'grant',
  'to',
  'study',
  'peptides',
  '(ap)',
  'ap',
  '-',
  'a',
  'company',
  'founded',
  'by',
  'a',
  'chemistry',
  'researcher',
  'at',
  'the',
  'university',
  'of',
  'louisville',
  'won',
  'a',
  'grant',
  'to',
  'develop',
  'a',
  'method',
  'of',
  'producing',
  'better',
  'peptides',
  'which',
  'are',
  'short',
  'chains',
  'of',
  'amino',
  'acids',
  'the',
  '

In [15]:
# 统计分词后的每句话长度，得到最大长度，以此来padding
dataset_train.apply(lambda x: len(x['splited_words']), new_field_name='seq_len')
dataset_test.apply(lambda x: len(x['splited_words']), new_field_name='seq_len')

[48,
 45,
 55,
 34,
 136,
 103,
 113,
 17,
 33,
 48,
 42,
 43,
 23,
 18,
 21,
 41,
 52,
 45,
 24,
 24,
 18,
 17,
 67,
 40,
 25,
 31,
 67,
 69,
 69,
 35,
 32,
 44,
 52,
 55,
 64,
 24,
 40,
 44,
 46,
 38,
 41,
 49,
 47,
 51,
 31,
 37,
 50,
 42,
 33,
 47,
 48,
 40,
 26,
 47,
 28,
 41,
 43,
 30,
 46,
 38,
 33,
 64,
 32,
 43,
 25,
 41,
 47,
 33,
 41,
 37,
 59,
 26,
 20,
 46,
 42,
 43,
 26,
 46,
 21,
 35,
 23,
 42,
 56,
 43,
 42,
 51,
 46,
 72,
 63,
 38,
 21,
 37,
 33,
 30,
 39,
 37,
 50,
 48,
 48,
 46,
 44,
 44,
 15,
 51,
 36,
 63,
 28,
 44,
 27,
 49,
 53,
 32,
 48,
 35,
 31,
 35,
 85,
 46,
 41,
 50,
 37,
 38,
 48,
 66,
 29,
 27,
 32,
 45,
 34,
 41,
 43,
 40,
 17,
 36,
 59,
 26,
 50,
 43,
 26,
 34,
 27,
 44,
 41,
 36,
 24,
 37,
 39,
 42,
 45,
 26,
 48,
 31,
 33,
 49,
 22,
 50,
 45,
 40,
 42,
 44,
 41,
 47,
 42,
 42,
 46,
 39,
 41,
 37,
 34,
 94,
 31,
 46,
 55,
 52,
 59,
 39,
 25,
 47,
 25,
 46,
 25,
 21,
 35,
 53,
 42,
 35,
 30,
 31,
 21,
 55,
 50,
 42,
 23,
 36,
 33,
 55,
 35,
 36,
 33,
 7

In [16]:
import numpy as np
print(np.quantile(dataset_train['seq_len'], 0.95))
print(np.quantile(dataset_test['seq_len'], 0.95))
print(np.quantile(dataset_train['seq_len'], 0.99))
print(np.quantile(dataset_test['seq_len'], 0.99))

53.0
52.0
70.0
69.0


#### 99%的文本长度在70左右，所以取max_seq_len=70

In [17]:
#max_seq_len_train = max(dataset_train['seq_len'])
#max_seq_len_test = max(dataset_test['seq_len'])

#max_seq_len = max_seq_len_train if max_seq_len_train > max_seq_len_test else max_seq_len_test
#print(max_seq_len)

In [18]:
# 建立word idx词典
vocab = Vocabulary(min_freq=2)

In [19]:
vocab.from_dataset(dataset_train,
                   dataset_test,
                   field_name='splited_words'
                  )

vocab.index_dataset(dataset_train, field_name='splited_words', new_field_name='idx_words')
vocab.index_dataset(dataset_test, field_name='splited_words', new_field_name='idx_words')

Vocabulary(['carlyle', 'looks', 'toward', 'commercial', 'aerospace']...)

In [20]:
# padding补齐句子长度
max_seq_len = 70

def padding_seq_len(instance):
    if len(instance['idx_words']) <= max_seq_len:
        idx_words = instance['idx_words'] + [0]*(max_seq_len-len(instance['idx_words']))
    else:
        idx_words = instance['idx_words'][:max_seq_len]
    return idx_words

In [21]:
dataset_train.apply(padding_seq_len, new_field_name='idx_words')
dataset_test.apply(padding_seq_len, new_field_name='idx_words')

[[2,
  509,
  17,
  29708,
  109,
  892,
  104,
  1053,
  376,
  1829,
  9,
  705,
  11711,
  3668,
  3650,
  10,
  723,
  595,
  44,
  4,
  1,
  5,
  30025,
  3975,
  9,
  2,
  9001,
  76,
  6100,
  2008,
  1209,
  4,
  3753,
  1,
  8538,
  15474,
  198,
  823,
  20,
  2330,
  117,
  2,
  1,
  1829,
  9,
  16,
  4425,
  20298,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [11747,
  53,
  369,
  4249,
  3,
  578,
  36094,
  42,
  45,
  10,
  4,
  53,
  7658,
  19,
  4,
  10868,
  5143,
  15,
  2,
  600,
  5,
  4172,
  215,
  4,
  4249,
  3,
  1308,
  4,
  7386,
  5,
  4313,
  634,
  36094,
  85,
  35,
  906,
  7962,
  5,
  41226,
  26569,
  2,
  1099,
  4403,
  5,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [9979,
  638,
  1970,
  715,
  32916,
  42,
  45,
  10,
  789,
  3315,
  4868,
  77,
  1051,
  31325,
  1723,
  25,
  2199,
  13,

In [22]:
# 设置网络的输入与标签
dataset_train.set_input('idx_words')
dataset_train.set_target('label')
dataset_test.set_input('idx_words')
dataset_test.set_target('label')

+------------+-----------------+-------+-------------------+---------+-----------------+
| Unnamed: 0 | text            | label | splited_words     | seq_len | idx_words       |
+------------+-----------------+-------+-------------------+---------+-----------------+
| 0          | the race is ... | 1     | ['the', 'race'... | 48      | [2, 509, 17,... |
| 1          | ky. company ... | 1     | ['ky.', 'compa... | 45      | [11747, 53, ... |
| 2          | prediction u... | 1     | ['prediction',... | 55      | [9979, 638, ... |
| 3          | calif. aims ... | 1     | ['calif.', 'ai... | 34      | [943, 1599, ... |
| 4          | open letter ... | 1     | ['open', 'lett... | 136     | [147, 3659, ... |
| 5          | loosing the ... | 1     | ['loosing', 't... | 103     | [1, 2, 277, ... |
| 6          | foafkey: foa... | 1     | ['foafkey:', '... | 113     | [1, 36853, 4... |
| 7          | e-mail scam ... | 1     | ['e-mail', 'sc... | 17      | [717, 4797, ... |
| 8          | card f

In [23]:
from fastNLP.embeddings import StaticEmbedding

In [24]:
embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-840b-300d')

  0%|          | 98.3k/2.18G [00:00<50:49, 714kB/s]

http://212.129.155.247/embedding/glove.840B.300d.zip not found in cache, downloading to /var/folders/3t/rkcyv7gj2zzc07y_vbtsz8lm0000gn/T/tmp1xc1ob5j


100%|██████████| 2.18G/2.18G [02:05<00:00, 17.4MB/s]


Finish download from http://212.129.155.247/embedding/glove.840B.300d.zip
Copy file to /Users/zhoujianyao/.fastNLP/embedding/glove.840B.300d
Found 46384 out of 71264 words in the pre-training embedding.




In [25]:
dataset_train.rename_field('idx_words', 'idx_words_seq')
dataset_train.rename_field('label', 'label_seq')
dataset_test.rename_field('idx_words', 'idx_words_seq')
dataset_test.rename_field('label', 'label_seq')

+------------+-----------------+-------------------+---------+-------------------+-----------+
| Unnamed: 0 | text            | splited_words     | seq_len | idx_words_seq     | label_seq |
+------------+-----------------+-------------------+---------+-------------------+-----------+
| 0          | the race is ... | ['the', 'race'... | 48      | [2, 509, 17, 2... | 1         |
| 1          | ky. company ... | ['ky.', 'compa... | 45      | [11747, 53, 36... | 1         |
| 2          | prediction u... | ['prediction',... | 55      | [9979, 638, 19... | 1         |
| 3          | calif. aims ... | ['calif.', 'ai... | 34      | [943, 1599, 3,... | 1         |
| 4          | open letter ... | ['open', 'lett... | 136     | [147, 3659, 50... | 1         |
| 5          | loosing the ... | ['loosing', 't... | 103     | [1, 2, 277, 8,... | 1         |
| 6          | foafkey: foa... | ['foafkey:', '... | 113     | [1, 36853, 427... | 1         |
| 7          | e-mail scam ... | ['e-mail', 'sc...

In [26]:
dataset_train.set_input('idx_words_seq')
dataset_train.set_target('label_seq')
dataset_test.set_input('idx_words_seq')
dataset_test.set_target('label_seq')

+------------+-----------------+-------------------+---------+-------------------+-----------+
| Unnamed: 0 | text            | splited_words     | seq_len | idx_words_seq     | label_seq |
+------------+-----------------+-------------------+---------+-------------------+-----------+
| 0          | the race is ... | ['the', 'race'... | 48      | [2, 509, 17, 2... | 1         |
| 1          | ky. company ... | ['ky.', 'compa... | 45      | [11747, 53, 36... | 1         |
| 2          | prediction u... | ['prediction',... | 55      | [9979, 638, 19... | 1         |
| 3          | calif. aims ... | ['calif.', 'ai... | 34      | [943, 1599, 3,... | 1         |
| 4          | open letter ... | ['open', 'lett... | 136     | [147, 3659, 50... | 1         |
| 5          | loosing the ... | ['loosing', 't... | 103     | [1, 2, 277, 8,... | 1         |
| 6          | foafkey: foa... | ['foafkey:', '... | 113     | [1, 36853, 427... | 1         |
| 7          | e-mail scam ... | ['e-mail', 'sc...

In [27]:
import torch
from torch import nn

In [28]:
class DPCNN(nn.Module):
    def __init__(self, vocab_len, embed_size, max_seq_len, num_classes, pre_embed=None):
        super(DPCNN, self).__init__()
        self.vocab_len = vocab_len
        self.embed_size = embed_size
        self.max_seq_len = max_seq_len
        self.num_classes=num_classes
        self.channel_size = 250
        
        self.embedding = nn.Embedding(self.vocab_len, self.embed_size)
        if pre_embed is not None:
            pretrained_weight = np.array(pre_embed)
            self.embedding.weight.data.copy_(torch.from_numpy(pretrained_weight))
        else:
            nn.init.xavier_normal_(self.embedding.weight.data)
            
        self.embedding.weight.requires_grad = True
        # region embedding
        self.region_embedding = nn.Sequential(
            nn.Conv1d(self.embed_size, self.channel_size, kernel_size=3, padding=1),
            nn.BatchNorm1d(num_features=self.channel_size),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        # conv blocks，两次卷积channel维持不变
        self.conv_block = nn.Sequential(
            nn.BatchNorm1d(num_features=self.channel_size),
            nn.ReLU(),
            nn.Conv1d(self.channel_size, self.channel_size, kernel_size=3, padding=1),
            nn.BatchNorm1d(num_features=self.channel_size),
            nn.ReLU(),
            nn.Conv1d(self.channel_size, self.channel_size, kernel_size=3, padding=1),
        )
        # 金字塔卷积resnet blocks, 缩减一半维度
        self.maxpool = nn.Sequential(
            nn.ConstantPad1d(padding=(0, 1), value=0),  # 在“下面”填充一个0，为了maxpool凑整数
            nn.MaxPool1d(kernel_size=3, stride=2)
        )
        self.conv = nn.Sequential(
            nn.BatchNorm1d(num_features=self.channel_size),
            nn.ReLU(),
            nn.Conv1d(self.channel_size, self.channel_size, kernel_size=3, padding=1),
            nn.BatchNorm1d(num_features=self.channel_size),
            nn.ReLU(),
            nn.Conv1d(self.channel_size, self.channel_size, kernel_size=3, padding=1),
        )
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(self.channel_size, self.num_classes)
         )
        
    def ResnetBlock(self, x):
        x_shortcut = self.maxpool(x)
        x = self.conv(x_shortcut)
        x = x + x_shortcut
        return x
    
    def forward(self, idx_words_seq):
        x = self.embedding(idx_words_seq) # [batch_size, seq_len, embed_size]
        x = x.permute(0, 2, 1) # [batch_size, embed_size, seq_len] 70
        x = self.region_embedding(x) # [batch_size, channel_size, seq_len] 70
        x = self.conv_block(x) # [batch_size, channel_size, seq_len] 70
        
        # 最后seq_len会缩减到1
        while x.size()[2] > 1:
            x = self.ResnetBlock(x)

        # x = x.permute(0, 2, 1) # [batch_size, 1, channel_size]
        # x = x.contiguous().view(x.size(0), -1)
        x = x.squeeze(2)
        
        output = self.fc(x)
        return {'output': output}
    
    def predict(self, idx_words_seq):
        output = self(idx_words_seq)
        _, predict = output['output'].max(dim=1)
        return {'predict': predict}

In [34]:
model = DPCNN(vocab_len=embed.num_embedding,
              embed_size=300,
              max_seq_len=max_seq_len,
              num_classes=4,
              pre_embed=embed.embedding.state_dict()['weight'])

In [35]:
save_path = 'model/'

from fastNLP import Trainer, Tester
from copy import deepcopy
from fastNLP.core.losses import CrossEntropyLoss
from fastNLP.core.metrics import AccuracyMetric
from fastNLP.core.optimizer import Adam

In [36]:
loss = CrossEntropyLoss(pred="output", target="label_seq")
metric = AccuracyMetric(pred="predict", target="label_seq")

trainer = Trainer(model=model,
                  train_data=dataset_train,
                  dev_data=dataset_test,
                  loss=loss,
                  metrics=metric,
                  save_path=save_path,
                  batch_size=64,
                  n_epochs=5,
                  optimizer=Adam(lr=0.001, weight_decay=0.0001)
                 )

trainer.train()

input fields after batch(if batch size is 2):
	idx_words_seq: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 70]) 
target fields after batch(if batch size is 2):
	label_seq: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 

training epochs started 2020-06-07-22-02-04-187285


HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=9375), HTML(value='')), layout=Layout(display…

HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=119), HTML(value='')), layout=Layout(display=…

Evaluate data in 16.33 seconds!
Evaluation on dev at Epoch 1/5. Step:1875/9375: 
AccuracyMetric: acc=0.909725



  "type " + obj.__name__ + ". It won't be checked "


HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=119), HTML(value='')), layout=Layout(display=…

Evaluate data in 16.31 seconds!
Evaluation on dev at Epoch 2/5. Step:3750/9375: 
AccuracyMetric: acc=0.906567



HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=119), HTML(value='')), layout=Layout(display=…

Evaluate data in 20.54 seconds!
Evaluation on dev at Epoch 3/5. Step:5625/9375: 
AccuracyMetric: acc=0.918542



HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=119), HTML(value='')), layout=Layout(display=…

Evaluate data in 16.17 seconds!
Evaluation on dev at Epoch 4/5. Step:7500/9375: 
AccuracyMetric: acc=0.905514



HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=119), HTML(value='')), layout=Layout(display=…

Evaluate data in 16.41 seconds!
Evaluation on dev at Epoch 5/5. Step:9375/9375: 
AccuracyMetric: acc=0.91012

Reloaded the best model.

In Epoch:3/Step:5625, got best dev performance:
AccuracyMetric: acc=0.918542


{'best_eval': {'AccuracyMetric': {'acc': 0.918542}},
 'best_epoch': 3,
 'best_step': 5625,
 'seconds': 6969.22}

In [48]:
load_model = torch.load(save_path+'best_DPCNN_acc_2020-06-07-22-02-04-187285')

In [49]:
tester = Tester(data=dataset_test,
                model=load_model,
                metrics=AccuracyMetric(pred="predict", target="label_seq"),
                batch_size=4)

tester.test()

HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=1900), HTML(value='')), layout=Layout(display…

Evaluate data in 27.52 seconds!
[tester] 
AccuracyMetric: acc=0.918542


{'AccuracyMetric': {'acc': 0.918542}}