__Loading modules and objects required__

In [1]:
from sklearn.model_selection import train_test_split
from text_utils import (
    TextDataset,
    TextTokenizer, 
    TextProcessor
)
from torch.utils.data import DataLoader
from utils import (
    META_PATH, 
    SEED, 
    TEST_SIZE,
    load_pickle,
    dump_pickle,
    set_seed
)


__Initializing text processor and obtaining IMDB text corpus__

In [2]:
processor = TextProcessor(META_PATH)
corpus, labels = processor.get_corpus()
train_corpus, test_corpus, y_train, y_test = train_test_split(corpus,
                                                              labels,
                                                              test_size=TEST_SIZE,
                                                              random_state=SEED)
 

  0%|          | 0/50000 [00:00<?, ?it/s]

__Initializing text tokenizer and tokenizing text__

In [3]:
tokenizer = TextTokenizer(pad_method="center")
train_tokenized = tokenizer.fit_transform(train_corpus)
test_tokenized = tokenizer.transform(test_corpus)


  0%|          | 0/40000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [4]:
dump_pickle(test_corpus, "test_corpus.pkl")
dump_pickle(test_tokenized, "test_tokenized.pkl")
dump_pickle(tokenizer, "tokenizer.pkl")

__Printing several examples of tokenizing and detokenizing text__

In [5]:
test_corpus = load_pickle("test_corpus.pkl")
test_tokenized = load_pickle("test_tokenized.pkl")
tokenizer = load_pickle("tokenizer.pkl")

In [6]:
for sent_orig, sent_tok in zip(test_corpus[:3], test_tokenized[:3]):
    print(" ".join(sent_orig))
    print(sent_tok)
    print(" ".join(tokenizer.get_text_from_vec(sent_tok)))
    print("\n")

what a refreshing change from the pg movies that have teen girls jumping in and out of bed young high school boys counting how many girls they can hook up with kids drinking doing drugs etc . etc . etc . carl hiaasen has written so many books that are enjoyable but hardly classic literature . but he has finally written something that middle school kids want to read . and this movie sends a message to kids that maybe they can make a difference that maybe their voices can be heard . filmed in south florida the scenery is beautiful and natural and real . who cares if its predictable and a little corny . so was free willy and look how well that did . this is a good family movie . . . . . . . . . . a rare breed .
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

__Construct CNN__

STEPS

(0) dataloader

(i) backbone

(ii) train loop

In [7]:
meta = processor.meta
meta_train = meta[meta["train"]==1]
meta_test = meta[meta["train"]==0]

train_set = TextDataset(meta_train,
                        processor,
                        tokenizer)

set_seed(SEED)
train_ldr = DataLoader(train_set,
                       16,
                       shuffle=True,
                       num_workers=0)

for txt, lbl in train_ldr:
    print(txt[0].size())
    print(lbl)
    break

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.meta["class"] = self.meta["class"].map(cls_map)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

torch.Size([1, 500])
tensor([0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0])


dataset and dataloader created

next steps

(i) backbone of cnn

(ii) train loop

(iii) quant module


In [9]:
txt.size()

torch.Size([16, 1, 500])

In [11]:
import torch.nn as nn

In [41]:
nn.Embedding??

In [45]:
len(tokenizer.tok2id)

141604

In [53]:
len(tokenizer.id2tok)

9544

In [59]:
txt.unique().size()

torch.Size([951])

In [61]:
txt.long().unique().size()

torch.Size([951])

In [65]:
txt.long().max()

tensor(42036)

In [69]:
len(tokenizer.tok2id)

141604

In [165]:
EMB_DIM = 10
VOCAB_SIZE = len(tokenizer.tok2id)+1
MAX_LEN = 500
POOL_SIZE = 10


inp_lin = []

after_emb = nn.Embedding(VOCAB_SIZE, EMB_DIM)(txt.long())
for i in range(2,6):    
    after_conv = nn.Conv1d(EMB_DIM, 1, i)(after_emb.view(16,MAX_LEN,EMB_DIM).permute(0,2,1))
    after_pool = nn.AvgPool1d(POOL_SIZE)(after_conv)
    inp_lin.append(after_pool)    
    print(i, after_conv.size(), after_pool.size())
    assert after_conv.size()[-1] == MAX_LEN + 1 - i
    assert after_pool.size()[-1] == (MAX_LEN + 1 - i) // POOL_SIZE

2 torch.Size([16, 1, 499]) torch.Size([16, 1, 49])
3 torch.Size([16, 1, 498]) torch.Size([16, 1, 49])
4 torch.Size([16, 1, 497]) torch.Size([16, 1, 49])
5 torch.Size([16, 1, 496]) torch.Size([16, 1, 49])


In [None]:
after_conv_size = 

In [159]:
len(inp_lin)

4

In [160]:
torch.cat(inp_lin, dim=-1).size()

torch.Size([16, 1, 196])

In [161]:
[inp.size() for inp in inp_lin]

[torch.Size([16, 1, 49]),
 torch.Size([16, 1, 49]),
 torch.Size([16, 1, 49]),
 torch.Size([16, 1, 49])]

In [146]:
torch.[inp.size() for inp in inp_lin]

[torch.Size([16, 1, 62]),
 torch.Size([16, 1, 41]),
 torch.Size([16, 1, 31]),
 torch.Size([16, 1, 25])]

In [166]:
sum((1 for i in range(100)))

100

In [182]:
class CustConv_(nn.Conv1d):
    
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size):
        super(CustConv_, self).__init__(in_channels,
                                        out_channels,
                                        kernel_size)
        

In [191]:
class CNN_Text(nn.Module):
    
    def __init__(self,
                 num_classes,
                 max_len,
                 vocab_size,
                 ksize_min,
                 ksize_max,
                 in_channels,
                 out_channels,
                 pool_size,
                 custom_layers={}):
        super(CNN_Text, self).__init__()
        assert ksize_min > 1 and \
                ksize_max > ksize_min and \
                ksize_max < max_len, \
                "kernel size must exceed 1 and be less than maximum length"
        assert isinstance(custom_layers, dict), "custom layers must be a dict"
        self.max_len = max_len
        self.ksize_min = ksize_min
        self.ksize_max = ksize_max
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.pool_size = pool_size
        self.vocab_size = vocab_size
        self.in_linear = sum(((self.max_len + 1 - ksize) // self.pool_size 
                              for ksize in range(self.ksize_min, self.ksize_max+1)))
        self.out_linear = 1 if num_classes == 2 else num_classes
        self.funcs = {
            "avgpool": nn.AvgPool1d,
            "emb": nn.Embedding,
            "conv": nn.Conv1d,
            "lin": nn.Linear,
            "act": nn.Sigmoid
        }
        
        if custom_layers:
            for layer_name, layer in custom_layers.items():
                if self.funcs.get(layer_name):
                    self.funcs[layer_name] = layer
        
        self.layers = {
            "avgpool": self.funcs["avgpool"](self.pool_size),
            "emb": self.funcs["emb"](self.vocab_size+1,
                                self.in_channels),
            "convs": [self.funcs["conv"](self.in_channels,
                                self.out_channels,
                                ksize) for ksize in \
                                range(self.ksize_min, self.ksize_max+1)],
            "lin": self.funcs["lin"](self.in_linear, 
                             self.out_linear),
            "act": self.funcs["act"]()
        }
        
    def forward(self, x):
        # TO DO: calculations
        return x
        
    

In [192]:
test_cnn = CNN_Text(num_classes=2,
         max_len=MAX_LEN,
         vocab_size=VOCAB_SIZE,
         ksize_min=2,
         ksize_max=6,
         in_channels=10,
         out_channels=1,
         pool_size=POOL_SIZE,
         custom_layers={"conv": CustConv_})

In [190]:
test_cnn.layers

{'avgpool': AvgPool1d(kernel_size=(10,), stride=(10,), padding=(0,)),
 'emb': Embedding(141606, 10),
 'convs': [CustConv_(10, 1, kernel_size=(2,), stride=(1,)),
  CustConv_(10, 1, kernel_size=(3,), stride=(1,)),
  CustConv_(10, 1, kernel_size=(4,), stride=(1,)),
  CustConv_(10, 1, kernel_size=(5,), stride=(1,)),
  CustConv_(10, 1, kernel_size=(6,), stride=(1,))],
 'lin': Linear(in_features=245, out_features=1, bias=True),
 'act': Sigmoid()}

In [173]:
test_cnn.layers["emb"]

Embedding(141606, 10)