In [193]:
import pandas as pd
import numpy as np
import torch
import time
import random
import os

## 查看数据集分布

In [194]:
pd.read_csv("data/train_one_label.csv").head(2)

Unnamed: 0,id,comment_text,toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0


In [195]:
pd.read_csv("data/test.csv").head(2)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...


## 使用torchtext构建数据集

In [196]:
from torchtext import data
from torchtext.vocab import Vectors
from torch.nn import init
from tqdm import tqdm

In [197]:
tokenize = lambda x: x.split()
TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, fix_length=200)
LABEL = data.Field(sequential=False, use_vocab=False)

In [201]:
train_path = 'data/train_one_label.csv'
valid_path = "data/valid_one_label.csv"
test_path = "data/test.csv"

# 定义Dataset
class MyDataset(data.Dataset):
    name = 'Grand Dataset'

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    def __init__(self, path, text_field, label_field, test=False, aug=False, **kwargs):
        fields = [("id", None), # we won't be needing the id, so we pass in None as the field
                 ("comment_text", text_field), ("toxic", label_field)]
        
        examples = []
        csv_data = pd.read_csv(path)
        print('read data from {}'.format(path))

        if test:
            # 如果为测试集，则不加载label
            for text in tqdm(csv_data['comment_text']):
                examples.append(data.Example.fromlist([None, text, None], fields))
        else:
            for text, label in tqdm(zip(csv_data['comment_text'], csv_data['toxic'])):
                if aug:
                    # do augmentation
                    rate = random.random()
                    if rate > 0.5:
                        text = self.dropout(text)
                    else:
                        text = self.shuffle(text)
                # Example: Defines a single training or test example.Stores each column of the example as an attribute.
                examples.append(data.Example.fromlist([None, text, label], fields))
        # 之前是一些预处理操作，此处调用super调用父类构造方法，产生标准Dataset
        # super(MyDataset, self).__init__(examples, fields, **kwargs)
        super(MyDataset, self).__init__(examples, fields)

    def shuffle(self, text):
        text = np.random.permutation(text.strip().split())
        return ' '.join(text)

    def dropout(self, text, p=0.5):
        # random delete some text
        text = text.strip().split()
        len_ = len(text)
        indexs = np.random.choice(len_, int(len_ * p))
        for i in indexs:
            text[i] = ''
        return ' '.join(text)



    # 若不想自定义继承自Dataset的类MyDataset，也可直接使用torchtext.data.Dataset来构建数据集
    # 完整示例如下
    train_examples = []
    test_examples = []
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True)
    LABEL = data.Field(sequential=False, use_vocab=False)
    train_fields = [("id", None), # we won't be needing the id, so we pass in None as the field
                     ("comment_text", TEXT), ("toxic", LABEL)]
    test_fileds= [("id", None), # we won't be needing the id, so we pass in None as the field
                     ("comment_text", TEXT), ("toxic", None)]
    train_data = pd.read_csv('data/train_one_label.csv')
    test_data = pd.read_csv("data/test.csv")
    for text in tqdm(train_data['comment_text']):
        train_examples.append(data.Example.fromlist([None, text, None], train_fields))
    for text in tqdm(test_data['comment_text']):
        test_examples.append(data.Example.fromlist([None, text, None], test_fields))
    # 直接使用Dataset来构造数据集
    train = data.Dataset(train_examples, train_fields)
    test = data.Dataset(test_examples, test_fields)
    


### 构建数据集

In [202]:
train = MyDataset(train_path, text_field=TEXT, label_field=LABEL, test=False, aug=1)
valid = MyDataset(valid_path, text_field=TEXT, label_field=LABEL, test=False, aug=1)
# 因为test没有label,需要指定label_field为None
test = MyDataset(test_path, text_field=TEXT, label_field=None, test=True, aug=1)

25it [00:00, 7457.87it/s]
25it [00:00, 8979.84it/s]
100%|██████████| 33/33 [00:00<00:00, 31307.86it/s]

read data from data/train_one_label.csv
read data from data/valid_one_label.csv
read data from data/test.csv





In [204]:
print(train[0].__dict__.keys())
print(test[0].__dict__.keys())

dict_keys(['comment_text', 'toxic'])
dict_keys(['comment_text'])


In [205]:
print(train[0].comment_text)

['explanation', 'the', 'edits', 'made', 'under', 'my', 'hardcore', 'metallica', 'fan', 'were', 'vandalisms,', 'just', 'on', 'gas', 'after', 'at', 'york', 'dolls', 'fac.', 'and', "don't", 'remove', 'template', 'from', 'the', 'since', "i'm"]


### 构建词表，最简单的方式

In [206]:
TEXT.build_vocab(train)

###  通过预训练的词向量来构建词表的方式示例，以glove.6B.300d词向量为例
    cache = 'mycache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    vectors = Vectors(name='/Users/wyw/Documents/vectors/glove/glove.6B.300d.txt', cache=cache)
    # 指定 Vector 缺失值的初始化方式，没有命中的token的初始化方式
    vectors.unk_init = init.xavier_uniform_ 
    TEXT.build_vocab(train, min_freq=5, vectors=vectors)
    # 查看词表元素
    TEXT.vocab.vectors
    

In [207]:
TEXT.vocab.freqs.most_common(10)

[('the', 62),
 ('to', 33),
 ('you', 27),
 ('of', 24),
 ('a', 24),
 ('and', 21),
 ('is', 19),
 ('if', 18),
 ('that', 17),
 ('on', 16)]

### 构建数据集迭代器

In [208]:
from torchtext.data import Iterator, BucketIterator
train_iter, val_iter = BucketIterator.splits(
        (train, valid), # we pass in the datasets we want the iterator to draw data from
        batch_sizes=(8, 8),
        device=-1, # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.comment_text), # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False,
        repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)
test_iter = Iterator(test, batch_size=8, device=-1, sort=False, sort_within_batch=False, repeat=False)

# 使用torchtext构建的数据集实现LSTM
- 因数据集太小，无法收敛，只作为demo熟悉torchtext和pytorch之间的用法

In [209]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [217]:
weight_matrix = TEXT.vocab.vectors

In [214]:
class LSTM(nn.Module):

    def __init__(self):
        super(LSTM, self).__init__()
        self.word_embeddings = nn.Embedding(len(TEXT.vocab), 300)  # embedding之后的shape: torch.Size([200, 8, 300])
        # 若使用预训练的词向量，需在此处指定预训练的权重
        # embedding.weight.data.copy_(weight_matrix)
        self.lstm = nn.LSTM(input_size=300, hidden_size=128, num_layers=1)  # torch.Size([200, 8, 128])
        self.decoder = nn.Linear(128, 2)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out = self.lstm(embeds)[0]  # lstm_out:200x8x128
        # 取最后一个时间步
        final = lstm_out[-1]  # 8*128
        y = self.decoder(final)  # 8*2 
        return y


In [215]:
model = LSTM()
model.train()
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01)
loss_funtion = F.cross_entropy

In [216]:
for epoch, batch in enumerate(train_iter):
    optimizer.zero_grad()
    start = time.time()
    # text = batch.text.permute(1, 0)
    predicted = model(batch.comment_text)

    loss = loss_funtion(predicted, batch.toxic)
    loss.backward()
    # utils.clip_gradient(optimizer, opt.grad_clip)
    optimizer.step()
    print(loss)

tensor(0.7022, grad_fn=<NllLossBackward>)
tensor(0.0197, grad_fn=<NllLossBackward>)
tensor(0.0046, grad_fn=<NllLossBackward>)
tensor(0.0702, grad_fn=<NllLossBackward>)
