<a href="https://colab.research.google.com/github/anuraagkansara/SecureNLP/blob/master/code/Bi_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!git clone https://github.com/anuraagkansara/SecureNLP.git
!pip install glove_python

fatal: destination path 'SecureNLP' already exists and is not an empty directory.


In [0]:
import torch
import torch.nn as nn
from torch import optim
import time, random
import os
from tqdm import tqdm
#from lstm import LSTMSentiment
from bilstm import BiLSTMSentiment
from torchtext import data
import numpy as np
import argparse
import codecs


torch.set_num_threads(8)
torch.manual_seed(1)
random.seed(1)

In [4]:
!ls
# %tb
# args = argparse.ArgumentParser()
# args.add_argument('--m', dest='model', default='lstm', help='specify the mode to use (default: lstm)')
# args = args.parse_args()

bilstm.py  dev.tsv  __pycache__  sample_data  SecureNLP  test.tsv  train.tsv


In [0]:
EPOCHS = 8        # 20
USE_GPU = torch.cuda.is_available()
EMBEDDING_DIM = 300
HIDDEN_DIM = 150

BATCH_SIZE = 5
timestamp = str(int(time.time()))
best_dev_acc = 0.0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
def get_accuracy(truth, pred):
    assert len(truth) == len(pred)
    right = 0
    for i in range(len(truth)):
        if truth[i] == pred[i]:
            right += 1.0
    return right / len(truth)


def train_epoch_progress(model, train_iter, loss_function, optimizer, text_field, label_field, epoch):
    model.cuda(device)
    loss_function.cuda(device)
    model.train()
    avg_loss = 0.0
    truth_res = []
    pred_res = []
    count = 0
    for batch in tqdm(train_iter, desc='Train epoch '+str(epoch+1)):
        sent, label = batch.text, batch.label
        sent = sent.to(device)
        label = label.to(device)
        label.data.sub_(1)
        truth_res += list(label.data)
        model.batch_size = len(label.data)
        model.hidden = model.init_hidden()
        pred = model(sent)
        pred_label = pred.data.max(1)[1]
        pred_res += [x for x in pred_label]
        model.zero_grad()
        loss = loss_function(pred, label)
        avg_loss += loss.data
        count += 1
        loss.backward()
        optimizer.step()
    avg_loss /= len(train_iter)
    acc = get_accuracy(truth_res, pred_res)
    return avg_loss, acc


def train_epoch(model, train_iter, loss_function, optimizer):
    model.to(device)
    model.train()
    avg_loss = 0.0
    truth_res = []
    pred_res = []
    count = 0
    for batch in train_iter:
        sent, label = batch.text, batch.label
        sent = sent.to(device)
        label = label.to(device)
        label.data.sub_(1)
        truth_res += list(label.data)
        model.batch_size = len(label.data)
        model.hidden = model.init_hidden()
        pred = model(sent)
        pred_label = pred.cpu().data.max(1)[1].numpy()
        pred_res += [x for x in pred_label]
        model.zero_grad()
        loss = loss_function(pred, label)
        avg_loss += loss.data
        count += 1
        loss.backward()
        optimizer.step()
    avg_loss /= len(train_iter)
    acc = get_accuracy(truth_res, pred_res)
    return avg_loss, acc


def evaluate(model, data, loss_function, name):
    model.cuda(device)
    loss_function.cuda(device)
    model.eval()
    avg_loss = 0.0
    truth_res = []
    pred_res = []
    for batch in data:
        sent, label = batch.text, batch.label
        sent = sent.to(device)
        label = label.to(device)
        label.data.sub_(1)
        truth_res += list(label.data)
        model.batch_size = len(label.data)
        model.hidden = model.init_hidden()
        pred = model(sent)
        pred_label = pred.data.max(1)[1]
        pred_res += [x for x in pred_label]
        loss = loss_function(pred, label)
        avg_loss += loss.data
    avg_loss /= len(data)
    acc = get_accuracy(truth_res, pred_res)
    print(name + ': loss %.2f acc %.1f' % (avg_loss, acc*100))
    return acc,pred_res,truth_res

In [0]:

def load_bin_vec(fname, vocab):
    """
    Loads 300x1 word vecs from Google (Mikolov) word2vec
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in range(vocab_size):
            word = []
            while True:
                ch = f.read(1).decode('latin-1')
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            if word in vocab:
               word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)
    return word_vecs

In [0]:
def load_sst(text_field, label_field, batch_size):
    train, dev, test = data.TabularDataset.splits(path='/content/', train='train.tsv',
                                                  validation='dev.tsv', test='test.tsv', format='tsv',
                                                  fields=[('text', text_field), ('label', label_field)]
                                                  )
    text_field.build_vocab(train, dev, test)
    label_field.build_vocab(train, dev, test)
    # train_iter, dev_iter, test_iter = data.BucketIterator.splits((train, dev, test),
    #             batch_sizes=(batch_size, len(dev), len(test)), sort_key=lambda x: len(x.text), repeat=False, device=-1)
    # for GPU run
    train_iter, dev_iter, test_iter = data.BucketIterator.splits((train, dev, test),
                batch_sizes=(batch_size, len(dev), len(test)), sort_key=lambda x: len(x.text), repeat=False, device=None)
    return train_iter, dev_iter, test_iter

In [0]:
# devoce = torch.device('cuda')
text_field = data.Field(lower=True)
label_field = data.Field(sequential=False)
train_iter, dev_iter, test_iter = load_sst(text_field, label_field, BATCH_SIZE)

In [0]:
 model = BiLSTMSentiment(embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, vocab_size=len(text_field.vocab), label_size=len(label_field.vocab)-1,\
                          use_gpu=USE_GPU, batch_size=BATCH_SIZE)

In [11]:
if USE_GPU:
    model = model.cuda()
    print("Inside GPU")

Inside GPU


In [0]:
# print('Load word embeddings...')
# # # glove
# # text_field.vocab.load_vectors('glove.6B.100d')

# # word2vector
# word_to_idx = text_field.vocab.stoi
# print(type(word_to_idx))
#pretrained_embeddings = np.random.uniform(-0.25, 0.25, (len(text_field.vocab), 300))
# # pretrained_embeddings[0] = 0
# # word2vec = load_bin_vec('GoogleNews-vectors-negative300.bin', word_to_idx)
# # for word, vector in word2vec.items():
# #     pretrained_embeddings[word_to_idx[word]-1] = vector

In [13]:
!pip install glove_python
from glove import Corpus, Glove



In [0]:
def smote_read_sentences(path, sentence_separator):
    filenames = os.listdir(path)
    result = []
    for filename in filenames:
        with open(path + filename) as file:
            lines = file.read()
        lines = lines.split(sentence_separator)[:-1]
        for line in lines:
            l = []
            for x in line.splitlines():
                w = x.split(' ')
                if w[0] == "''":
                    w[0] = '"'
                w[0] = w[0].lower()
                if len(w) == 2:
                    l.append(np.array(w))
            l = np.array(l)
            result.append(l)
    return result

In [0]:
train = smote_read_sentences("SecureNLP/train/tokenized/", " \n")

In [0]:
embed = [list(x[:, 0]) for x in train]

In [17]:
corpus = Corpus() 
corpus.fit(embed, window=10)
glove = Glove(no_components=300, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [0]:
len(glove.word_vectors[glove.dictionary['malware']])
#emb = glove.word_vectors[glove.dictionary[word]]
pretrained_embeddings = np.random.uniform(-0.25, 0.25, (len(text_field.vocab), 300))
pretrained_embeddings[0]=0
for word,index in glove.dictionary.items():
  pretrained_embeddings[index]=glove.word_vectors[glove.dictionary[word]]

In [0]:
# from google.colab import drive
# drive.mount('/content/drive')

In [20]:
model.embeddings.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
# model.embeddings.weight.data = text_field.vocab.vectors
# model.embeddings.embed.weight.requires_grad = False
best_model = model
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_function = nn.NLLLoss()
next(best_model.parameters()).is_cuda

True

In [21]:
print('Training...')
out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
print("Writing to {}\n".format(out_dir))
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

Training...
Writing to /content/runs/1580971258



In [24]:
for epoch in range(EPOCHS):
    print(next(model.parameters()).is_cuda)
    avg_loss, acc = train_epoch_progress(model, train_iter, loss_function, optimizer, text_field, label_field, epoch)
    tqdm.write('Train: loss %.2f acc %.1f' % (avg_loss, acc*100))
    dev_acc,pred_dev,truth_dev = evaluate(model, dev_iter, loss_function, 'Dev')
    if dev_acc > best_dev_acc:
        if best_dev_acc > 0:
            os.system('rm '+ out_dir + '/best_model' + '.pth')
        best_dev_acc = dev_acc
        best_model = model
        torch.save(best_model.state_dict(), out_dir + '/best_model' + '.pth')
        # evaluate on test with the best dev performance model
test_acc,pred_test, truth_test = evaluate(best_model, test_iter, loss_function, 'Test')

  log_probs = F.log_softmax(y)
Train epoch 1:   1%|          | 14/1885 [00:00<00:13, 138.05it/s]

True


Train epoch 1: 100%|██████████| 1885/1885 [00:12<00:00, 155.99it/s]


Train: loss 0.01 acc 99.7


Train epoch 2:   1%|          | 17/1885 [00:00<00:11, 164.93it/s]

Dev: loss 1.01 acc 83.6
True


Train epoch 2: 100%|██████████| 1885/1885 [00:11<00:00, 157.16it/s]


Train: loss 0.00 acc 99.9


Train epoch 3:   1%|          | 16/1885 [00:00<00:12, 152.24it/s]

Dev: loss 1.00 acc 81.5
True


Train epoch 3: 100%|██████████| 1885/1885 [00:11<00:00, 157.41it/s]


Train: loss 0.00 acc 99.9


Train epoch 4:   1%|          | 17/1885 [00:00<00:11, 165.10it/s]

Dev: loss 0.61 acc 87.5
True


Train epoch 4: 100%|██████████| 1885/1885 [00:11<00:00, 157.56it/s]


Train: loss 0.01 acc 99.8


Train epoch 5:   1%|          | 16/1885 [00:00<00:11, 155.84it/s]

Dev: loss 0.94 acc 81.9
True


Train epoch 5: 100%|██████████| 1885/1885 [00:12<00:00, 155.97it/s]


Train: loss 0.00 acc 99.9


Train epoch 6:   1%|          | 16/1885 [00:00<00:11, 156.51it/s]

Dev: loss 1.17 acc 81.8
True


Train epoch 6: 100%|██████████| 1885/1885 [00:12<00:00, 156.45it/s]


Train: loss 0.00 acc 99.9


Train epoch 7:   1%|          | 13/1885 [00:00<00:14, 125.61it/s]

Dev: loss 0.71 acc 80.1
True


Train epoch 7: 100%|██████████| 1885/1885 [00:12<00:00, 153.64it/s]


Train: loss 0.00 acc 99.9


Train epoch 8:   1%|          | 15/1885 [00:00<00:13, 141.02it/s]

Dev: loss 0.99 acc 84.5
True


Train epoch 8: 100%|██████████| 1885/1885 [00:11<00:00, 157.19it/s]


Train: loss 0.00 acc 100.0
Dev: loss 1.28 acc 82.5
Test: loss 2.12 acc 71.7


In [25]:
print(torch.__version__)


1.4.0


In [26]:
print(int(pred_test[0].item()))

1


In [0]:
test=list()
for obj in truth_test:
  test.append(int(obj.item()))
  

In [0]:
file2 = codecs.open("task1.txt","w","utf-8")
pred=list()
for obj in pred_test:
  #sent=sent.encode('utf-8')
  file2.write(str(int(obj.item())))
  pred.append(int(obj.item()))
  file2.write("\n")
file2.close()

In [34]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(test, pred))


              precision    recall  f1-score   support

           0       0.91      0.74      0.82       528
           1       0.28      0.59      0.38        90

    accuracy                           0.72       618
   macro avg       0.60      0.66      0.60       618
weighted avg       0.82      0.72      0.75       618

