In [1]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import nltk
nltk.download("all")
nltk.download('punkt')
import matplotlib.pyplot as plt
import torch
%matplotlib inline

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

In [2]:
from google.colab import drive

#get datasets
drive.mount('/content/drive', force_remount=True)

# change the foldername as the project folder location
FOLDERNAME ='/content/drive/MyDrive/CS329P/Project'

# assert FOLDERNAME is not None, "[!] Enter the foldername."

%cd $FOLDERNAME

Mounted at /content/drive
/content/drive/MyDrive/CS329P/Project


In [3]:
def load_text(path, type):
    """
    Load text data, convert all into lowercase text and save to a list.
    """
    if type == 1:

        with open(path, 'rb') as f:
            texts = []
            for line in f:
                texts.append(line.decode(errors='ignore').lower().strip())
    if type == 2:
        with open(path, 'rb') as f:
            texts = []
            for line in f:
                texts.append(int(line.decode(errors='ignore').lower().strip()))  
    return texts

# Load files
data_text = load_text('data_cnn/asset_index_data.txt', 1)
labels = load_text('data_cnn/asset_index_label.txt',2)
# convert into np array
texts = np.array(data_text)
labels = np.array(labels)

In [4]:
print(texts, labels)

["like many cities in albania, berat comprises an old fortified city filled with churches and mosques painted with grandiose wealth of visible murals and frescos. conversion to islam of the local urban population in berat had increased during this time and part of the newcomer population were also muslim converts who had islamic names and christian surnames.. factors such as tax exemptions for muslim urban craftsmen in exchange for military service drove many of the incoming rural first generation muslim population to berat.. in the modern period, a romani community numbering 200-300 lives in berat and its outskirts whereas others in a few nearby villages, at times living in difficult economic circumstances with some seasonally migrating to greece for work. economy   by the 18th century the economy and society of berat was closely connected to the city's craft guilds partly related to various tax exemptions that existed since the late middle ages. like many cities in albania, berat com

Let's use pretrained word vectors(FastText Word Vectors, Mikolov et al., 2017) to train the tokens

In [5]:
%%time
URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
FILE = "fastText"

if os.path.isdir(FILE):
    print("fastText exists.")
else:
    !wget -P $FILE $URL
    !unzip $FILE/crawl-300d-2M.vec.zip -d $FILE



fastText exists.
CPU times: user 1.09 ms, sys: 17 µs, total: 1.11 ms
Wall time: 730 µs


In [6]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


# Tokenizing

Not sure whether we need this process or not - Cara already made some codes for this

In [7]:

from tqdm import tqdm_notebook
from collections import defaultdict
from nltk.tokenize import word_tokenize
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler, SequentialSampler)
from sklearn.model_selection import train_test_split


def tokenize(texts):
    """
    1. Tokenize
    2. build vocabulary
    3. save maximum sentence length 
    Output:
        tokenized 
        word2idx (Dict): Vocabulary built from the corpus
        max_len 
    """
    max_len = 0
    tokenized_texts = []
    word2idx = {}

    # Add <pad> and <unk> tokens to the vocabulary
    word2idx['<pad>'] = 0
    word2idx['<unk>'] = 1

    # Building our vocab from the corpus starting from index 2
    idx = 2
    for sent in texts:
        tokenized_sent = word_tokenize(sent)

        # Add `tokenized_sent` to `tokenized_texts`
        tokenized_texts.append(tokenized_sent)

        # Add new token to `word2idx`
        for token in tokenized_sent:
            if token not in word2idx:
                word2idx[token] = idx
                idx += 1

        # Update `max_len`
        max_len = max(max_len, len(tokenized_sent))

    return tokenized_texts, word2idx, max_len

def encode(tokenized_texts, word2idx, max_len):
    """
    First, we need to pad each sentence to the number of maximum sentence length
    Next, encode tokens to the index 

    Output: Input for the model - array of token indexes in the vocabulary, (N, max_len)
    """

    input_ids = []
    for tokenized_sent in tokenized_texts:
        # Pad sentences to max_len
        tokenized_sent += ['<pad>'] * (max_len - len(tokenized_sent))

        # Encode tokens to input_ids
        input_id = [word2idx.get(token) for token in tokenized_sent]
        input_ids.append(input_id)
    
    return np.array(input_ids)

def load_pretrained_vectors(word2idx, fname):
    """
    Input:
        word2idx (Dict): Vocabulary built from the corpus
        fname: Path to pretrained vector file

    Output:
        embeddings (np.array): Embedding matrix with shape (N, d) where N is
            the size of word2idx and d is embedding dimension
    """

    print("Loading pretrained vectors...")
    fin = open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())


    embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))
    embeddings[word2idx['<pad>']] = np.zeros((d,))

    # Load pretrained vectors
    count = 0
    for line in tqdm_notebook(fin):
        tokens = line.rstrip().split(' ')
        word = tokens[0]
        if word in word2idx:
            count += 1
            embeddings[word2idx[word]] = np.array(tokens[1:], dtype=np.float32)

    print(f"There are {count} / {len(word2idx)} pretrained vectors found.")

    return embeddings

In [8]:
import nltk
nltk.download('punkt')

################ Tokenize
tokenized_texts, word2idx, max_len = tokenize(texts)
input_ids = encode(tokenized_texts, word2idx, max_len)

# Pretrained vectors
embeddings = load_pretrained_vectors(word2idx, "fastText/crawl-300d-2M.vec")
embeddings = torch.tensor(embeddings)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Loading pretrained vectors...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

There are 15131 / 20776 pretrained vectors found.


In [9]:

def data_loader(train_inputs, val_inputs, train_labels, val_labels,
                batch_size=50):

    train_inputs, val_inputs, train_labels, val_labels =\
    tuple(torch.tensor(data) for data in
          [train_inputs, val_inputs, train_labels, val_labels])

    ############ batch
    batch_size = 50

    # DataLoader for training data
    train_data = TensorDataset(train_inputs, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # DataLoader for validation data
    val_data = TensorDataset(val_inputs, val_labels)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    return train_dataloader, val_dataloader

In [10]:
# Train Test Split
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_ids, labels, test_size=0.1, random_state=42)

# Load data to PyTorch DataLoader
train_dataloader, val_dataloader = data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size=50)

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import time

# Sample configuration:
filter_sizes = [2, 3, 4]
num_filters = [2, 2, 2]


# 1D CNN
class CNN_NLP(nn.Module):
    def __init__(self,
                 pretrained_embedding=None,
                 freeze_embedding=False,
                 vocab_size=None,
                 embed_dim=300,
                 filter_sizes=[3, 4, 5],
                 num_filters=[100, 100, 100],
                 num_classes=2,
                 dropout=0.5):
        """
            pretrained_embedding: (vocab_size, embed_dim)

            When pretrained word embeddings are not used
                vocab_size,  embed_dim
            n_classes: for now, we simplified the class as two (idx = 1 or idx = 2, because higher index has small # of dataset)
            dropout rate
        """

        super(CNN_NLP, self).__init__()
        
        
        # Embedding layer when we use pretrained model 
        if pretrained_embedding is not None:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding,
                                                          freeze=freeze_embedding)
        else:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=self.embed_dim,
                                          padding_idx=0,
                                          max_norm=5.0)
        # Conv Network
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                      out_channels=num_filters[i],
                      kernel_size=filter_sizes[i])
            for i in range(len(filter_sizes))
        ])
        # Fully-connected layer and Dropout
        self.fc = nn.Linear(np.sum(num_filters), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):


        #  (b, max_len, embed_dim)
        embed_frominput = self.embedding(input_ids).float()

        #  (b, embed_dim, max_len)
        embed_reshaped = embed_frominput.permute(0, 2, 1)

        # CNN & ReLU: (b, num_filters[i], L_out)
        embed_conv_list = [F.relu(conv1d(embed_reshaped)) for conv1d in self.conv1d_list]

        # Max pool: (b, num_filters[i], 1)
        embed_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in embed_conv_list]
        
        # FC- (b, sum(num_filters))
        embed_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in embed_pool_list],
                         dim=1)
        
        # (b, n_classes)
        output = self.fc(self.dropout(embed_fc))

        return output

In [12]:
def initialize_model(pretrained_embedding=None, freeze_embedding=False,
                    vocab_size=None,  embed_dim=300,
                    filter_sizes=[3, 4, 5],
                    num_filters=[100, 100, 100],
                    num_classes=2,
                    dropout=0.5,
                    learning_rate=0.01):

    assert (len(filter_sizes) == len(num_filters)), "filter_sizes and num_filters are not the same length!"

    cnn_model = CNN_NLP(pretrained_embedding=pretrained_embedding,
                        freeze_embedding=freeze_embedding,
                        vocab_size=vocab_size,
                        embed_dim=embed_dim,
                        filter_sizes=filter_sizes,
                        num_filters=num_filters,
                        num_classes=2,
                        dropout=0.5)
    
    cnn_model.to(device)

    # since it is adam, maybe we need to decrease lr into 0.0001?
    optimizer = optim.Adam(cnn_model.parameters(),
                               lr=learning_rate)

    return cnn_model, optimizer


loss_fn = nn.CrossEntropyLoss()

# for reproducability, set random seed
def set_seed(seed_value=32):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)


def train(model, optimizer, train_dataloader, val_dataloader=None, epochs=10):
    """Train the CNN model."""
    
    # Tracking best validation accuracy
    best_accuracy = 0

    print("Start training!\n")

    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {\
    'Val Acc':^9} | {'Elapsed':^9}")
    print("-"*60)

    for epoch_i in range(epochs):


        ################## Training ##############

        t0_epoch = time.time()
        total_loss = 0

        # training mode
        model.train()

        for step, batch in enumerate(train_dataloader):
            # Load batch to GPU
            tr_input_ids, tr_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()
            pred_grad = model(tr_input_ids)

            loss = loss_fn(pred_grad, tr_labels)
            total_loss += loss.item()

            loss.backward()

            optimizer.step()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        ################## Evaluation #######################

        if val_dataloader is not None:

            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # best accuracy
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy

            time_elapsed = time.time() - t0_epoch
            print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.6f} | {\
            val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            
    print("\n")
    print(f"Training complete! Best accuracy: {best_accuracy:.2f}%.")

def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's
    performance on our validation set.
    """

    model.eval()

    val_accuracy = []
    val_loss = []

    for batch in val_dataloader:
        ev_input_ids, ev_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(ev_input_ids)

        loss = loss_fn(logits, ev_labels)
        val_loss.append(loss.item())

        preds = torch.argmax(logits, dim=1).flatten()

        # Accuracy rate from mean
        accuracy = (preds == ev_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [13]:
# are randomly initialized word vectors
cnn_rand, optimizer = initialize_model(vocab_size=len(word2idx),
                                      embed_dim=300,
                                      learning_rate=0.25,
                                      dropout=0.5)
train(cnn_rand, optimizer, train_dataloader, val_dataloader, epochs=20)

Start training!

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |  91.785452   | 172.627165 |   55.71   |   3.26   
   2    |  339.568625  | 254.412391 |   80.00   |   2.92   
   3    |  255.369074  | 302.818338 |   75.14   |   2.91   
   4    |  321.280829  | 498.296321 |   78.86   |   2.91   
   5    |  330.805550  | 213.505090 |   77.43   |   2.91   
   6    |  276.886445  | 413.625556 |   74.00   |   2.91   
   7    |  227.664535  | 294.670962 |   63.14   |   2.92   
   8    |  208.666763  | 291.935737 |   60.00   |   2.91   
   9    |  209.746330  | 505.166526 |   66.57   |   2.91   
  10    |  207.079414  | 422.051998 |   74.57   |   2.91   
  11    |  171.305536  | 389.187895 |   80.86   |   2.92   
  12    |  120.284912  | 345.973269 |   78.29   |   2.91   
  13    |  85.493789   | 290.588449 |   72.00   |   2.92   
  14    |  56.585863   | 314.023468 |   77.14   |   2.91   
  15    |  65.184303  

In [14]:
# freezed pretrained word vectors during training
set_seed(28)
cnn_static, optimizer = initialize_model(pretrained_embedding=embeddings,
                                        freeze_embedding=True,
                                        learning_rate=0.25,
                                        dropout=0.5)
train(cnn_static, optimizer, train_dataloader, val_dataloader, epochs=20)

Start training!

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |  25.289047   | 16.177812  |   80.00   |   2.23   
   2    |  17.198512   | 13.794619  |   71.71   |   2.23   
   3    |  15.973351   | 13.783783  |   74.57   |   2.23   
   4    |  16.085446   | 15.099487  |   68.57   |   2.23   
   5    |  19.292009   | 13.183061  |   77.43   |   2.23   
   6    |  14.438902   | 15.779487  |   75.43   |   2.23   
   7    |  11.495174   | 19.167115  |   72.29   |   2.23   
   8    |  19.777563   | 12.532807  |   74.29   |   2.23   
   9    |   9.793668   | 16.077964  |   76.86   |   2.23   
  10    |   5.951348   | 10.447345  |   75.71   |   2.23   
  11    |   5.084112   | 14.033642  |   77.71   |   2.23   
  12    |   4.019488   | 19.637546  |   78.86   |   2.23   
  13    |   4.493736   | 15.647094  |   73.71   |   2.23   
  14    |   3.936853   | 14.716355  |   78.00   |   2.23   
  15    |   2.415392  

In [15]:
# fine-tuned pretrained word vectors during training 
set_seed(42)
cnn_non_static, optimizer = initialize_model(pretrained_embedding=embeddings,
                                            freeze_embedding=False,
                                            learning_rate=0.25,
                                            dropout=0.5)
train(cnn_non_static, optimizer, train_dataloader, val_dataloader, epochs=20)


Start training!

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |  491.236342  | 1335.325431 |   79.14   |   3.04   
   2    | 1804.609296  | 3654.072562 |   79.14   |   3.04   
   3    | 3864.533686  | 3404.061663 |   66.86   |   3.04   
   4    | 1908.577060  | 6753.296193 |   79.43   |   3.03   
   5    | 1572.432608  | 5696.997907 |   78.57   |   3.04   
   6    | 1606.129302  | 7033.569196 |   74.00   |   3.04   
   7    | 1152.709840  | 8630.073661 |   68.86   |   3.04   
   8    |  807.369789  | 8493.697998 |   74.86   |   3.03   
   9    |  454.371077  | 17741.472726 |   78.86   |   3.04   
  10    |  345.248887  | 14051.659681 |   78.00   |   3.04   
  11    |  272.562099  | 14266.424953 |   78.57   |   3.04   
  12    |  221.431018  | 15682.693045 |   78.00   |   3.03   
  13    | 1041.820661  | 38128.037319 |   76.00   |   3.04   
  14    | 1395.069425  | 38855.609515 |   78.29   |   3.04   
  

In [16]:
def predict(text, model=cnn_non_static.to("cpu"), max_len=62):
    """Predict probability based on the trained model."""

    # Tokenize, pad and encode text
    tokens = word_tokenize(text.lower())
    padded_tokens = tokens + ['<pad>'] * (max_len - len(tokens))
    input_id = [word2idx.get(token, word2idx['<unk>']) for token in padded_tokens]

    # Convert to PyTorch tensors
    input_id = torch.tensor(input_id).unsqueeze(dim=0)

    # Compute logits
    predicted = model.forward(input_id)

    probs = F.softmax(predicted, dim=1).squeeze(dim=0)

    print(f"This sentence has asset index as {probs[1] }")

In [17]:
predict("The lower dam will regulate outflows from the 횉etin main dam and also produce hydroelectric power with a 112 MW capacity via two 56 MW Kaplan turbines.  .")

This sentence has asset index as 0.0
