In [1]:
import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell

def find_notebook(fullname, path=None):
    """Locate the ``.ipynb`` file backing a (possibly dotted) module name.

    Only the last component of ``fullname`` is used as the file name, so
    ``"foo.bar"`` is looked up as ``bar.ipynb`` inside each directory of
    ``path``. When that file does not exist, underscores in the *file name*
    are replaced by spaces so that ``import Notebook_Name`` can find
    ``"Notebook Name.ipynb"``.

    Args:
        fullname: Fully qualified module name, e.g. ``"pkg.notebook"``.
        path: Directories to search. A falsy value means "current
            directory only".

    Returns:
        The path of the first matching notebook, or ``None`` when no
        candidate file exists.
    """
    name = fullname.rsplit('.', 1)[-1]
    if not path:
        path = ['']
    # Build both candidates from the bare module name: rewriting the joined
    # path would also turn underscores in the directory part into spaces.
    candidates = (name + ".ipynb", name.replace("_", " ") + ".ipynb")
    for d in path:
        for filename in candidates:
            nb_path = os.path.join(d, filename)
            if os.path.isfile(nb_path):
                return nb_path
    return None
        
class NotebookLoader(object):
    """Module loader that executes a Jupyter notebook's code cells as a module."""

    def __init__(self, path=None):
        # The IPython shell supplies the input transformer used to turn cell
        # source (magics, shell escapes, ...) into plain Python.
        self.shell = InteractiveShell.instance()
        # Directories handed to find_notebook when resolving a module name.
        self.path = path

    def load_module(self, fullname):
        """Import the notebook that matches ``fullname`` and return the module.

        Every code cell is transformed by the IPython input transformer and
        executed in the new module's namespace. While the cells run, the
        shell's ``user_ns`` points at that namespace so that magics touching
        the user namespace act on the module instead.

        Args:
            fullname: Fully qualified module name being imported.

        Returns:
            The populated module, which is also stored in ``sys.modules``.

        Raises:
            ImportError: If no notebook file can be found for ``fullname``.
        """
        path = find_notebook(fullname, self.path)
        if path is None:
            # Fail with an import error instead of handing None to io.open.
            raise ImportError("No notebook found for module %r" % fullname)

        print ("importing Jupyter notebook from %s" % path)

        # load the notebook object (second argument: notebook format version 4)
        with io.open(path, 'r', encoding='utf-8') as f:
            nb = read(f, 4)

        # create the module and add it to sys.modules
        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython
        sys.modules[fullname] = mod

        # extra work to ensure that magics that would affect the user_ns
        # actually affect the notebook module's ns
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__

        try:
            for cell in nb.cells:
                if cell.cell_type == 'code':
                    # transform the input to executable Python
                    code = self.shell.input_transformer_manager.transform_cell(cell.source)
                    # run the code in the module
                    exec(code, mod.__dict__)
        except BaseException:
            # A loader must not leave a half-initialised module behind:
            # otherwise the next `import` silently returns the broken one.
            if sys.modules.get(fullname) is mod:
                del sys.modules[fullname]
            raise
        finally:
            self.shell.user_ns = save_user_ns
        return mod
    
class NotebookFinder(object):
    """Meta-path finder that resolves module names to Jupyter notebooks.

    One ``NotebookLoader`` is cached per distinct search path.

    NOTE(review): ``find_module``/``load_module`` is the legacy import
    protocol; recent Python versions (3.12+, per the importlib docs) only
    call ``find_spec`` on meta-path finders, so this hook may be ignored
    there - confirm for the target interpreter.
    """

    def __init__(self):
        # Cache of loaders, keyed by the joined search path (None for "no path").
        self.loaders = {}

    def find_module(self, fullname, path=None):
        """Return a loader for ``fullname`` or ``None`` if no notebook matches.

        Args:
            fullname: Fully qualified module name being imported.
            path: Optional list of directories to search.
        """
        nb_path = find_notebook(fullname, path)
        if not nb_path:
            return None

        # Lists aren't hashable, so the cache is keyed by the joined path.
        # Every falsy path (None, [] ...) shares the None key; previously an
        # empty list was used as the key itself and raised TypeError.
        key = os.path.sep.join(path) if path else None

        if key not in self.loaders:
            self.loaders[key] = NotebookLoader(path)
        return self.loaders[key]
    
# Register the notebook finder exactly once. Re-running this cell rebinds
# NotebookFinder to a new class object, so stale finders are matched by class
# name and dropped first; otherwise sys.meta_path accumulates duplicates.
sys.meta_path[:] = [
    finder for finder in sys.meta_path
    if type(finder).__name__ != NotebookFinder.__name__
]
sys.meta_path.append(NotebookFinder())

In [2]:
import numpy as np
import torch
import torch.nn as nn
from torch import optim
from torch.autograd import Variable

sys.path.append("../Preprocessor")
import format_module

import rnn
import naivebayesian
import cnn

importing Jupyter notebook from rnn.ipynb


In [3]:
import pickle

def save_object(obj, filename):
    """Pickle ``obj`` into ``filename`` with the highest available protocol.

    The file is opened in binary write mode, so an existing file is replaced.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_object(filename):
    """Read ``filename`` in binary mode and return the unpickled object.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(filename, mode='rb') as source:
        restored = pickle.load(source)
    return restored

In [4]:
class classifierModule(nn.Module):
    """Runs an RNN, a CNN and a naive-Bayes model on the same batch of reviews.

    Work in progress: the three sub-model outputs are computed and returned
    side by side; they are not yet fused into a single prediction.
    """

    def __init__(self, input_size, batch_size):
        """Build the three sub-models.

        Args:
            input_size: Width of one context row; passed to the RNN and CNN
                sub-models and used as the last dimension of padded batches.
            batch_size: Chunk size used by ``resize_input``.
        """
        super(classifierModule, self).__init__()
        self.batch_size = batch_size
        self.input_size = input_size

        self.rnn_model = rnn.RNN_model(input_size)
        self.rnn_out_size = rnn.hidden_size

        self.nb_model = naivebayesian.NaiveBayesianDB()

        self.cnn_model = cnn.ConvNet(input_size)

        # TODO: combine the sub-model outputs into one classifier head.

    def encoder(self, formattedList):
        """Zero-pad the ``context`` of every item into one batch tensor.

        Args:
            formattedList: Sequence of objects with a ``context`` attribute.
                Each context is used like an array (``len``, ``.size``,
                ``.shape``) - presumably a numpy array of shape
                (seq_len, input_size); confirm against the preprocessor.

        Returns:
            The result of ``sort_batch``: the padded contexts, the items and
            their lengths, all reordered by descending length.
        """
        contexts = [item.context for item in formattedList]
        lengths = torch.LongTensor([len(context) for context in contexts])
        # Use a plain int for the numpy shape (torch.max may return a tensor)
        # and keep at least one time step so that a batch made only of empty
        # contexts still matches the nominal length of 1 assigned below.
        max_len = max(int(torch.max(lengths)), 1)

        data = np.zeros((len(contexts), max_len, self.input_size))

        for i, context in enumerate(contexts):
            if context.size == 0:
                # An empty context stays all-zero but gets a nominal length
                # of 1 so that no sequence in the batch has length 0.
                lengths[i] = 1
            else:
                data[i, :context.shape[0], :] = context

        return self.sort_batch(torch.FloatTensor(data), formattedList, lengths)

    def sort_batch(self, context, formatted, seq_len):
        """Reorder a padded batch by descending sequence length.

        Args:
            context: Padded batch tensor whose first dimension is the batch.
            formatted: The items the rows of ``context`` were built from.
            seq_len: LongTensor holding one length per row.

        Returns:
            ``(Variable(sorted_context), sorted_formatted, sorted_seq_len)``.
        """
        sorted_seq_len, sorted_idx = seq_len.sort(0, descending = True)

        sorted_context = context[sorted_idx]
        # tolist() yields plain ints, which are valid list indices regardless
        # of how the installed torch iterates over a LongTensor.
        sorted_formatted = [formatted[i] for i in sorted_idx.tolist()]

        # Debug trace: context length of every item in its new position.
        for f in sorted_formatted:
            print(len(f.context))

        return Variable(sorted_context), sorted_formatted, sorted_seq_len

    def resize_input(self, input):
        """Split ``input`` into consecutive chunks of at most ``batch_size`` items."""
        return [input[i:i+self.batch_size]
                for i in range(0, len(input), self.batch_size)]

    def forward(self, formatted_list, hidden=None):
        """Run the three sub-models on one batch.

        Args:
            formatted_list: Batch of items accepted by ``encoder``.
            hidden: Currently unused.

        Returns:
            ``(rnn_out, cnn_out, nb_out)``. The batch is length-sorted by
            ``encoder`` first, so the outputs follow that sorted order rather
            than the order of ``formatted_list``.
        """
        # Encode the batch that was passed in (not a notebook-level global).
        context, formatted, lengths = self.encoder(formatted_list)

        rnn_out = self.rnn_model(context, lengths)
        cnn_out = self.cnn_model(context)
        nb_out = self.nb_model.naive_bayes_FRlist(formatted)

        print("rnn_out : ", rnn_out, "\n\n\n")
        print("cnn_out : ", cnn_out, "\n\n\n")
        print("nb_out : ", nb_out, "\n\n\n")

        return rnn_out, cnn_out, nb_out

In [5]:
# Load the pickled reviews stored under ../Preprocessor.
formatted_list = load_object("../Preprocessor/save_formatted_review.pkl")
# Smoke-test instance: input_size=100, batch_size=10.
test_classifier = classifierModule(100, 10)

# Hand the full review list to the naive-Bayes sub-model before running the
# batches (presumably this populates its statistics - confirm in naivebayesian).
test_classifier.nb_model.add_FRlist(formatted_list)

# Split the reviews into batches and push each one through the module.
batch_list = test_classifier.resize_input(formatted_list)
for bl in batch_list:
    test_classifier(bl)

# Show the tensor type and shape of every registered parameter.
for param in test_classifier.parameters():
     print(type(param.data), param.size())

137
120
52
42
34
32
27
25
20
9
torch.Size([1370, 1])
torch.Size([10, 137])
[[ 0.          1.          0.         ...,  0.          0.          0.        ]
 [ 0.          1.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          1.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.          1.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.71666667]
 [ 0.          1.          0.         ...,  0.          0.          0.        ]]

 0.0000  1.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.0000  1.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.0000  0.0000  1.0000  ...   0.0000  0.0000  0.0000
          ...             ⋱             ...          
 0.0000  1.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.7167
 0.0000  1.0000  0.0000  ...   0.0000  0.0000  0.0000
[torch.DoubleTensor of size 137x60]

torch.Size([10, 60])


# Training

### Hyperparameters

1. learning_rate
2. input_size
3. rnn_output_size, cnn_output_size
4. batch_size
5. optimizer
6. loss function
7. n_epochs

In [7]:
# Training hyperparameters (see the list in the markdown section above).
learning_rate = 0.001
input_size = 100  # word2vec k size
batch_size = 100
n_epochs = 100

# Model, loss and optimiser; train_net reads these (and n_epochs) as globals.
model = classifierModule(input_size, batch_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [10]:
# With p = outputs[:, 1] and s = sensitivity:
#   target == 0 is counted as correct when p <= 1 - s
#   target == 1 is counted as correct when p >  1 - s
# i.e. a sample is correct when the thresholded prediction equals its target.
def get_accuracy(outputs, targets, sensitivity):
    """Count the samples whose thresholded prediction equals the target.

    A sample is predicted positive when ``outputs[:, 1] + sensitivity - 1``
    is strictly greater than 0 - the same expression ``get_prediction``
    rounds up - so a score exactly on the threshold is predicted negative.

    Args:
        outputs: Tensor/Variable of shape (batch, 2); column 1 is the score
            of the positive class.
        targets: Tensor/Variable of 0/1 class indices, one per row.
        sensitivity: Shifts the decision threshold to ``1 - sensitivity``.

    Returns:
        The number of correctly classified samples as an ``int`` (a count,
        not a ratio; callers divide by the sample count themselves).
    """
    margin = outputs.data[:, 1] + sensitivity - 1
    predicted_positive = margin > 0
    actual_positive = targets.data > 0.5
    return int((predicted_positive == actual_positive).sum())
    
def get_targets(batch):
    """Build the class-index target tensor for a batch.

    Each item contributes 1 when its ``label`` attribute is truthy and 0
    otherwise.

    Args:
        batch: Iterable of objects exposing a ``label`` attribute.

    Returns:
        A ``Variable`` wrapping a LongTensor with one entry per item. The
        integer dtype is set explicitly so that an empty batch does not
        silently become a float tensor.
    """
    targets = [1 if formatted.label else 0 for formatted in batch]
    return Variable(torch.LongTensor(targets), requires_grad = False)

def get_prediction(outputs, sensitivity):
    """Threshold the positive-class score into a 0/1 prediction per row.

    A row is predicted 1 when ``outputs[:, 1] + sensitivity - 1`` is
    strictly positive, i.e. when the score exceeds ``1 - sensitivity``.

    Args:
        outputs: Tensor/Variable of shape (batch, 2); column 1 is the score
            of the positive class.
        sensitivity: Shifts the decision threshold to ``1 - sensitivity``.

    Returns:
        A float tensor with one value per row, each either 0 or 1.
    """
    margin = outputs.data[:, 1] + sensitivity - 1
    # ceil maps (-1, 0] to 0 and (0, 1] to 1; the clamp keeps margins outside
    # that range (e.g. exactly -1, or unbounded scores) inside {0, 1}.
    return torch.ceil(margin).clamp(0, 1)

In [16]:
# Manual check of get_prediction on hand-made two-column scores
# (column 1 is the value that gets thresholded).
# NOTE(review): the recorded output under this cell does not look like
# ceil(p + 0.2 - 1) for these rows - it may predate the current
# get_prediction definition; re-run to confirm.
b= [[1,0],[0.9, 0.1], [0.6, 0.4], [0.5, 0.5], [0.2, 0.8]]
a= Variable(torch.FloatTensor(b))
get_prediction(a, 0.2)


-0
-0
 0
 1
 1
[torch.FloatTensor of size 5]

In [9]:
def train_net(self, train_list, validation_list, sensitivity=0.5):
    """Train the notebook-level ``model`` and track accuracy per epoch.

    Uses the globals ``model``, ``criterion``, ``optimizer`` and ``n_epochs``
    together with ``get_targets`` / ``get_accuracy``.

    Args:
        self: Unused; kept so that existing positional calls keep working.
        train_list: Training items; split into batches by ``model.resize_input``.
        validation_list: Items evaluated as one batch after every epoch.
        sensitivity: Threshold shift forwarded to ``get_accuracy``.

    Returns:
        ``(tacc_list, vacc_list)``: training and validation accuracy for
        every completed epoch.

    Raises:
        ValueError: If ``train_list`` or ``validation_list`` is empty.

    NOTE(review): ``classifierModule.encoder`` length-sorts every batch,
    while the targets below are built in the original item order - confirm
    that outputs and targets line up before trusting the loss/accuracy.
    """
    if len(train_list) == 0 or len(validation_list) == 0:
        raise ValueError("train_list and validation_list must be non-empty")

    # Batch with the model that is being trained (and its batch size), not
    # with an unrelated instance from another cell.
    batch_list = model.resize_input(train_list)
    validation_targets = get_targets(validation_list)

    tacc_list = list()
    vacc_list = list()

    for epoch in range(n_epochs):
        model.train()
        correct = 0

        for bl in batch_list:
            outputs = model(bl)
            targets = get_targets(bl)

            optimizer.zero_grad()
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            correct += get_accuracy(outputs, targets, sensitivity)

        tacc_list.append(correct / len(train_list))

        # Evaluate without dropout/batch-norm updates and without building
        # an autograd graph for the validation pass.
        model.eval()
        with torch.no_grad():
            validation_outputs = model(validation_list)
        vacc_list.append(
            get_accuracy(validation_outputs, validation_targets, sensitivity)
            / len(validation_list))

        # loss.item() reads the scalar loss of the last batch of the epoch.
        print("epoch {}: loss.data[0] {:.3f}|  train acc {:.3f}|  validation acc {:.3f}" 
              .format(epoch, loss.item(), tacc_list[-1], vacc_list[-1]))

        # Compares the mean accuracy of the five epochs before the current one.
        # NOTE(review): this stops when training accuracy is *below*
        # validation accuracy, which is not the usual overfitting signal -
        # confirm the intended direction of the comparison.
        if epoch > 5 and np.mean(np.array(tacc_list[-6:-1])) < np.mean(np.array(vacc_list[-6:-1])):
            print("Seems like m1 starts to overfit, aborting training")
            break

    print("Finished Training")
    return tacc_list, vacc_list

In [37]:
# Scratch demo: order the letters in `a` by the matching keys in `c`,
# then flip the resulting list in place.
a = ["ㄱ", "ㄴ", "ㄷ", "ㄹ", "ㅁ"]
b = [4, 3, 2, 1, 0]
c = [4, 2, 1, 0, 3]

d = [pair[1] for pair in sorted(zip(c, a))]
print(d)
d.reverse()
print(d)

['ㄹ', 'ㄷ', 'ㄴ', 'ㅁ', 'ㄱ']
['ㄱ', 'ㅁ', 'ㄴ', 'ㄷ', 'ㄹ']


In [10]:
# Scratch demo of the chunking used by resize_input: slices of width 3,
# with a shorter final slice when the length is not a multiple of 3.
a = list(range(1, 11))
a_ = []
for i in range(0, len(a), 3):
    a_ += [a[i:i + 3]]
print(a_)

[[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]]
