# textCNN

In [13]:
import os
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import  torch.nn.functional as F
import pandas as pd
import sys
import random
import re
from tqdm import tqdm
import time
#Set to use GPU to process data, so that the processing speed is faster

tqdm.pandas(desc="progress-bar")
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

DATA_ROOT = "Datasets"
print(torch.__version__, device)

1.0.1 cuda


  from pandas import Panel


# Max-over-time pooling

In [14]:

class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()
    def forward(self, x):
         # x shape: (batch_size, channel, seq_len)
        return F.max_pool1d(x, kernel_size=x.shape[2]) # shape: (batch_size, channel, 1)

In [15]:
df=pd.read_csv("ori_data.csv")

In [16]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,Sentence ID,HIT ID,Formality,Informativeness,Implicature,Length in Words,Length in Characters,F-score,I-score,Lexical Density,Sentence,Label
0,0,0,3D1TUISJWHZ3X38ICFUZ8CHJC9KIUW,5.8,6.4,4.2,33,201,93.939394,7.242424,66.666667,10In High Bay 4 of the Vehicle Assembly Buildi...,1
1,1,1,3GKAWYFRAOS9XNK03FUU79E7CNKDPT,5.2,5.8,2.6,45,266,85.555556,5.2,62.222222,12The oxygen vent arm and hood removed from th...,1
2,2,2,3VMHWJRYHUFB4G0NGCZ1PM3VQZ8XFB,5.2,5.6,4.8,29,186,93.103448,3.896552,65.517241,"13In the Rotation, Processing and Surge Facili...",1


In [17]:
##Clean the data to remove noise.
def tokenize(strs):
    strs = re.sub(r'[0-9]', ' ', strs)
    strs = strs.strip().lower()
    return strs

In [18]:
####If data cleaning is not needed, comment the next line, and in the next cell, change newSentimentText to SentimentText###########
#df['newSentimentText'] = df['SentimentText'].progress_map(tokenize)

In [19]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,Sentence ID,HIT ID,Formality,Informativeness,Implicature,Length in Words,Length in Characters,F-score,I-score,Lexical Density,Sentence,Label
0,0,0,3D1TUISJWHZ3X38ICFUZ8CHJC9KIUW,5.8,6.4,4.2,33,201,93.939394,7.242424,66.666667,10In High Bay 4 of the Vehicle Assembly Buildi...,1
1,1,1,3GKAWYFRAOS9XNK03FUU79E7CNKDPT,5.2,5.8,2.6,45,266,85.555556,5.2,62.222222,12The oxygen vent arm and hood removed from th...,1
2,2,2,3VMHWJRYHUFB4G0NGCZ1PM3VQZ8XFB,5.2,5.6,4.8,29,186,93.103448,3.896552,65.517241,"13In the Rotation, Processing and Surge Facili...",1


In [20]:
haha=[]
for i in range(len(df)):
    haha.append([df.loc[i].Sentence,df.loc[i].Label])

In [21]:
len(haha)

7036

In [22]:
###Get the data and process the data randomly. 0.8 is used for training and 0.2 is used for testing
random.shuffle(haha)
s=int(len(haha)*0.8)

In [23]:
s

5628

In [24]:
train_data=haha[:s]
test_data=haha[s:]

# Split words to generate data dictionary

In [25]:
import torchtext
import collections
from collections import namedtuple

#Word segmentation function
def get_tokenized_imdb(data):
    '''
    @params:
         data: a list of data, each element in the list is a [text string, 0/1 tag] two-tuple
     @return:
         A list of the text after the word segmentation, each element in the list is the word sequence after segmentation
    ''' 
    
    def tokenizer(text):
        return [tok.lower() for tok in text.split(' ')]
    return [tokenizer(review) for review, _ in data]

#Generate dataset dictionary
def get_vocab_imdb(data):
    '''
   @params:
         data: Same as above
    @return: Dictionary on the dataset, examples of Vocab (freqs, stoi, itos)
    '''
    tokenized_data = get_tokenized_imdb(data)
    #Take out all the tokens in the segmented text and construct a token counter
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    #Pass the constructed counter to the dictionary's constructor and set the minimum word frequency to 5, 
    #so that the dictionary will only contain at least 5 occurrences
    #The real column of the dictionary contains three dictionary variables (freqs (Counter that saves the statistical word frequency), 
    #stoi, itos (stoi, itos together form the bidirectional index of the dictionary)
    return torchtext.vocab.Vocab(counter, min_freq=5)

vocab = get_vocab_imdb(train_data) 

# The text is converted from string form to word subscript sequence form, and the text is processed to make the text length constant

In [26]:

def preprocess_imdb(data, vocab):
    '''
    @params:
        data: Same as above, the original read data
        vocab: Dictionary generated on the training set
    @return:
        features: a sequence of word subscripts, an integer tensor of shape (n, max_l)
        labels: emotional labels, a 0/1 integer tensor with shape (n,)
    '''
    max_l = 500  # Cut each comment or add 0 to make the length 500

    def pad(x):
        #If the length is larger than 500, it will be truncated, if it is less than 500, it will be filled with 0
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data = get_tokenized_imdb(data)
    #After the word segmentation, enumerate each text sequence words,
    #first convert the word to the subscript of the word according to the index of the word, and then normalize the length. If it is less than 500, add 0
    #This gets an integer tensor of shape n * 500, whose content is the subscript sequence of text words
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
   #Similarly quantize the label to obtain a 0/1 integer tensor of shape (n), where n is the number of samples
    labels = torch.tensor([score for _, score in data])
    return features, labels

# Create data iterator


In [27]:
train_set = Data.TensorDataset(*preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test_data, vocab))


batch_size = 64
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
print('#batches:', len(train_iter))

X torch.Size([64, 500]) y torch.Size([64])
#batches: 88


# textCNNmodel

In [28]:
#Contains two word embedding layers. One participates in training, one does not participate in training
class TextCNN(nn.Module):
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # Embedding layer not involved in training
        self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        # The timing maximum pooling layer has no weight, so it can share an instance
        self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()  # Create multiple one-dimensional convolutional layers
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels = 2*embed_size, 
                                        out_channels = c, 
                                        kernel_size = k))

    def forward(self, inputs):
        # Connect the output of the two embedding layers whose shapes are 
        #(batch size, word count, word vector dimension) according to word vector
        embeddings = torch.cat((
            self.embedding(inputs), 
            self.constant_embedding(inputs)), dim=2) # (batch, seq_len, 2*embed_size)
        embeddings = embeddings.permute(0, 2, 1)
         # For each one-dimensional convolutional layer, after the time sequence maximum pooling, 
         # a shape of (batch size, channel size, 1) will be obtained
         # Tensor. Use the flatten function to remove the last dimension, and then connect on the channel dimension
        encoding = torch.cat([self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        # Use the fully connected layer to get the output after applying the discard method
        outputs = self.decoder(self.dropout(encoding))
        return outputs

In [29]:
embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)

# Load pre-trained word vectors

In [30]:
def load_pretrained_embedding(words, pretrained_vocab):
   ###Extract the word vector corresponding to words from the pre-trained vocab###
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0]) # Initialized to 0
    oov_count = 0 # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed


def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y) 
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))
        
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # If no device is specified, net device is used
        device = list(net.parameters())[0].device 
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval() # Evaluation mode, this will turn off dropout
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train() # Change back to training mode
            else: 
                if('is_training' in net.__code__.co_varnames): 
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() 
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() 
            n += y.shape[0]
    return acc_sum / n

# Training word vector

In [31]:
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, "glove"))
net.embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False

There are 417 oov words.
There are 417 oov words.


# Test Model

In [32]:
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.4813, train acc 0.769, test acc 0.795, time 4.1 sec
epoch 2, loss 0.1867, train acc 0.844, test acc 0.842, time 2.8 sec
epoch 3, loss 0.1043, train acc 0.872, test acc 0.861, time 2.8 sec
epoch 4, loss 0.0645, train acc 0.897, test acc 0.847, time 2.7 sec
epoch 5, loss 0.0413, train acc 0.918, test acc 0.853, time 2.7 sec


In [33]:
def predict_sentiment(net, vocab, sentence):
    device = list(net.parameters())[0].device
    sentence = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1)
    return 'informal' if label.item() == 1 else 'formal'

In [34]:
predict_sentiment(net, vocab, ['but', 'was', 'well', 'below', 'analyst', 'forecasts'])

'informal'

In [35]:
predict_sentiment(net, vocab, ['i', 'am', 'very', 'happy', 'today', 'and', 'i', 'dont','why'])

'informal'