https://github.com/DSKSD/DeepNLP-models-Pytorch

https://nbviewer.jupyter.org/github/DSKSD/DeepNLP-models-Pytorch/blob/master/notebooks/08.CNN-for-Text-Classification.ipynb

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import nltk
import re
from copy import deepcopy
flatten = lambda l: [word for sentence in l for word in sentence.split()]
random.seed(1024)

In [2]:
USE_CUDA = torch.cuda.is_available()
gpus = [0]
torch.cuda.set_device(gpus[0])

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [3]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [4]:
def pad_to_batch(batch):
    x,y = zip(*batch)
    max_x = max([s.size(1) for s in x])
    x_p = []
    for i in range(len(batch)):
        if x[i].size(1) < max_x:
            x_p.append(torch.cat([x[i], Variable(LongTensor([word2index['<PAD>']] * (max_x - x[i].size(1)))).view(1, -1)], 1))
        else:
            x_p.append(x[i])
    return torch.cat(x_p), torch.cat(y).view(-1)

In [5]:
def prepare_sequence(seq, to_index):
    idxs = list(map(lambda w: to_index[w] if to_index.get(w) is not None else to_index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

# Data processing

In [6]:
import pandas as pd
from nltk.stem import WordNetLemmatizer

In [7]:
train_data = pd.read_csv("train.csv")
print(f'{train_data.shape} train data shape')
train_data.head(5)

(19579, 3) train data shape


Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [8]:
tokenizer = nltk.RegexpTokenizer(r'\w+')
stopwords = nltk.corpus.stopwords.words('english')
train_features = []
train_labels = []

In [9]:
for sentence_index in range(train_data.shape[0]):
    sentence = train_data.text[sentence_index]
    author = train_data.author[sentence_index]
    
    # tokenize
    sentence_words = tokenizer.tokenize(sentence)
    
    # remove stopwords, covert to lowercase
    sentence_words = [word.lower() for word in sentence_words if word.lower() not in stopwords]
    
    # remove numbers
    words = [re.sub('[0-9]+', '', token) for token in sentence_words]
    
    # lemmatizing
    lemmatizer = WordNetLemmatizer()
    sentence_words = [lemmatizer.lemmatize(word) for word in sentence_words]
    
    train_features.append(' '.join(sentence_words))
    train_labels.append(author)
    
print(f'extracted {len(train_features)} training data features and labels')

extracted 19579 training data features and labels


In [10]:
X = train_features
y = train_labels

In [11]:
vocab = list(set(flatten(X)))

In [12]:
len(vocab)

22040

In [13]:
len(set(y))

3

In [14]:
word2index={'<PAD>': 0, '<UNK>': 1}

for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
        
index2word = {v:k for k, v in word2index.items()}

target2index = {}

for cl in set(y):
    if target2index.get(cl) is None:
        target2index[cl] = len(target2index)

index2target = {v:k for k, v in target2index.items()}

In [15]:
X_p, y_p = [], []
c = 0
for pair in zip(X,y):
    X_p.append(prepare_sequence(pair[0], word2index).view(1, -1))
    y_p.append(Variable(LongTensor([target2index[pair[1]]])).view(1, -1))
    c += 1
    if c % 1000 == 0:
        print(c / len(X) * 100)
    
data_p = list(zip(X_p, y_p))
random.shuffle(data_p)

train_data = data_p[: int(len(data_p) * 0.9)]
test_data = data_p[int(len(data_p) * 0.9):]

5.107513151846366
10.215026303692731
15.322539455539097
20.430052607385463
25.53756575923183
30.645078911078194
35.75259206292456
40.860105214770925
45.967618366617295
51.07513151846366
56.182644670310026
61.29015782215639
66.39767097400275
71.50518412584913
76.61269727769549
81.72021042954185
86.82772358138823
91.93523673323459
97.04274988508097


### Load Pretrained word vector

In [16]:
import gensim
import gensim.downloader as api

In [17]:
model = api.load("glove-wiki-gigaword-300")

In [18]:
len(model.index2word)

400000

In [19]:
pretrained = []

for key in word2index.keys():
    try:
        pretrained.append(model[word2index[key]])
    except:
        pretrained.append(np.random.randn(300))
        
pretrained_vectors = np.vstack(pretrained)

# Model

In [20]:
class  CNNClassifier(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, output_size, kernel_dim=100, kernel_sizes=(3, 4, 5), dropout=0.5):
        super(CNNClassifier,self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([nn.Conv2d(1, kernel_dim, (K, embedding_dim)) for K in kernel_sizes])

        # kernel_size = (K,D) 
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * kernel_dim, output_size)
    
    
    def init_weights(self, pretrained_word_vectors, is_static=False):
        self.embedding.weight = nn.Parameter(torch.from_numpy(pretrained_word_vectors).float())
        if is_static:
            self.embedding.weight.requires_grad = False


    def forward(self, inputs, is_training=False):
        inputs = self.embedding(inputs).unsqueeze(1) # (B,1,T,D)
        inputs = [F.relu(conv(inputs)).squeeze(3) for conv in self.convs] #[(N,Co,W), ...]*len(Ks)
        inputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in inputs] #[(N,Co), ...]*len(Ks)

        concated = torch.cat(inputs, 1)

        if is_training:
            concated = self.dropout(concated) # (N,len(Ks)*Co)
        out = self.fc(concated) 
        return F.log_softmax(out,1)

# Train

In [21]:
EPOCH = 5
BATCH_SIZE = 50
KERNEL_SIZES = [3,4,5]
KERNEL_DIM = 100
LR = 0.001

In [22]:
model = CNNClassifier(len(word2index), 300, len(target2index), KERNEL_DIM, KERNEL_SIZES)
model.init_weights(pretrained_vectors) # initialize embedding matrix using pretrained vectors

if USE_CUDA:
    model = model.cuda()
    
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

In [67]:
ay = lambda l: len(l[0].size()) == 2

In [69]:
goodsie_train_data = list(filter(ay, train_data))

In [70]:
for epoch in range(EPOCH):
    losses = []
    for i,batch in enumerate(getBatch(BATCH_SIZE, goodsie_train_data)):
        inputs,targets = pad_to_batch(batch)
        
        model.zero_grad()
        preds = model(inputs, True)
        
        loss = loss_function(preds, targets)
        losses.append(loss.data.tolist())
        loss.backward()
        
        #for param in model.parameters():
        #    param.grad.data.clamp_(-3, 3)
        
        optimizer.step()
        
        if i % 100 == 0:
            print("[%d/%d] mean_loss : %0.2f" %(epoch, EPOCH, np.mean(losses)))
            losses = []

[0/5] mean_loss : 1.00
[0/5] mean_loss : 0.99
[0/5] mean_loss : 0.96
[0/5] mean_loss : 0.92
[1/5] mean_loss : 0.84
[1/5] mean_loss : 0.89
[1/5] mean_loss : 0.90
[1/5] mean_loss : 0.89
[2/5] mean_loss : 1.02
[2/5] mean_loss : 0.85
[2/5] mean_loss : 0.85
[2/5] mean_loss : 0.83
[3/5] mean_loss : 0.72
[3/5] mean_loss : 0.81
[3/5] mean_loss : 0.81
[3/5] mean_loss : 0.80
[4/5] mean_loss : 0.58
[4/5] mean_loss : 0.79
[4/5] mean_loss : 0.81
[4/5] mean_loss : 0.80


## Test data
Labels for the testing data were not given, so apart from the loss, we will validate the model on the training data. We will use all data samples which have more than 4 words.

In [116]:
test_data = []
for l in train_data:
    if len(l[0][0].size()) == 1:
        if l[0][0].size(0) > 4:
            test_data.append(l)

In [117]:
accuracy = 0
for test in test_data:
    pred = model(test[0]).max(1)[1]
    pred = pred.data.tolist()[0]
    target = test[1].data.tolist()[0][0]
    if pred == target:
        accuracy += 1

print(accuracy/len(test_data) * 100)

72.68489568529361
