In [1]:
import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell

def find_notebook(fullname, path=None):
    """find a notebook, given its fully qualified name and an optional path

    This turns "foo.bar" into "foo/bar.ipynb"
    and tries turning "Foo_Bar" into "Foo Bar" if Foo_Bar
    does not exist.

    Only the notebook's own file name has its underscores replaced by
    spaces; the directory part of the candidate path is left untouched, so
    notebooks living in directories such as "my_dir" are still found.

    Args:
        fullname: dotted module name; only the last component is used as
            the notebook's base name.
        path: optional iterable of directories to search. When empty or
            None the current working directory ('') is searched.

    Returns:
        The path of the first matching ".ipynb" file, or None if no
        candidate exists in any of the directories.
    """
    name = fullname.rsplit('.', 1)[-1]

    # Exact name first, then the "underscores as spaces" spelling.
    filenames = [name + ".ipynb"]
    spaced = name.replace("_", " ") + ".ipynb"
    if spaced != filenames[0]:
        # let import Notebook_Name find "Notebook Name.ipynb"
        filenames.append(spaced)

    for d in (path or ['']):
        for filename in filenames:
            nb_path = os.path.join(d, filename)
            if os.path.isfile(nb_path):
                return nb_path
    return None
        
class NotebookLoader(object):
    """Module Loader for Jupyter Notebooks

    Executes every code cell of a notebook inside a fresh module object so
    that the notebook can be used through a normal ``import`` statement.
    """
    def __init__(self, path=None):
        # The shared IPython shell is used to translate IPython syntax
        # (magics, shell escapes) into plain Python before execution.
        self.shell = InteractiveShell.instance()
        self.path = path

    def load_module(self, fullname):
        """import a notebook as a module

        Args:
            fullname: dotted module name being imported.

        Returns:
            The populated module object (also stored in ``sys.modules``).

        Raises:
            ImportError: if no notebook file matches ``fullname``.
            Any exception raised by one of the notebook's code cells is
            propagated after ``sys.modules`` has been restored.
        """
        path = find_notebook(fullname, self.path)
        if path is None:
            # Fail as an import error rather than as an obscure TypeError
            # from io.open(None).
            raise ImportError("no Jupyter notebook found for module %r" % fullname)

        print ("importing Jupyter notebook from %s" % path)

        # load the notebook object
        with io.open(path, 'r', encoding='utf-8') as f:
            nb = read(f, 4)

        # create the module and add it to sys.modules
        previous = sys.modules.get(fullname)
        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython
        sys.modules[fullname] = mod

        # extra work to ensure that magics that would affect the user_ns
        # actually affect the notebook module's ns
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__

        try:
            for cell in nb.cells:
                if cell.cell_type == 'code':
                    # transform the input to executable Python
                    code = self.shell.input_transformer_manager.transform_cell(cell.source)
                    # run the code in the module
                    exec(code, mod.__dict__)
        except BaseException:
            # Do not leave a half-initialised module importable: put back
            # whatever was registered under this name before we started.
            if previous is None:
                sys.modules.pop(fullname, None)
            else:
                sys.modules[fullname] = previous
            raise
        finally:
            self.shell.user_ns = save_user_ns
        return mod
    
class NotebookFinder(object):
    """Meta-path finder that hands out a NotebookLoader for importable notebooks"""
    def __init__(self):
        # One cached loader per distinct search path.
        self.loaders = {}

    def find_module(self, fullname, path=None):
        """Return a loader for ``fullname`` if a matching notebook exists.

        Returns None (implicitly) when no notebook is found, so that the
        import machinery moves on to the next finder.
        """
        if not find_notebook(fullname, path):
            return

        # lists aren't hashable, so a non-empty path is flattened to a string
        cache_key = os.path.sep.join(path) if path else path

        if cache_key in self.loaders:
            return self.loaders[cache_key]
        loader = self.loaders[cache_key] = NotebookLoader(path)
        return loader
    
# Register the finder at the end of sys.meta_path so that `import name`
# can fall back to executing name.ipynb. Re-running this cell appends
# another finder instance each time.
sys.meta_path.append(NotebookFinder())

In [2]:
# Shell escape: installs the pydblite package (no version is pinned).
# NOTE(review): presumably required by one of the project modules imported below — confirm.
!pip install pydblite



In [3]:
import torch
import torch.nn as nn
from torch.autograd import Variable   
from torch import optim
import numpy as np
import time

# Make the sibling Preprocessor directory importable. `sys` comes from the
# first cell's imports; re-running this cell appends the path again.
sys.path.append("../Preprocessor")
import format_module
# NOTE(review): this binding is overwritten a few lines below before it is
# used anywhere in this cell — looks redundant, confirm.
reviewDB = format_module.FormattedReview.reviewDB

# Project sub-models. The recorded output shows `rnn` being loaded from
# rnn.ipynb through the notebook import hook; the others are presumably
# resolved the same way or as regular modules — verify.
import rnn
import naivebayesian
import cnn
import conclude
import mlp

# Point FormattedReview at the review database stored under pkl/train_context.
reviewDB = format_module.ReviewDB("../Preprocessor/pkl/train_context")
format_module.FormattedReview.setDB(reviewDB)

importing Jupyter notebook from rnn.ipynb


In [4]:
import pickle

def save_object(obj, filename):
    """Serialise ``obj`` into ``filename`` with pickle's highest protocol.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(filename, 'wb') as sink:
        pickler = pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL)
        pickler.dump(obj)

def load_object(filename):
    """Read back and return the object pickled in ``filename``.

    Unpickling can execute arbitrary code, so only use this on files that
    come from a trusted source.
    """
    with open(filename, 'rb') as source:
        unpickler = pickle.Unpickler(source)
        return unpickler.load()

In [5]:
class classifierModule(nn.Module):
    """Ensemble classifier combining RNN, CNN and naive-Bayes sub-models.

    Each sub-model's output is passed through its own `mlp.mlp` head
    (together with per-review data gathered by the head's `getdata`), and
    the three head outputs are merged by `conclude.conclude`. The sub-model
    classes come from project modules that are defined outside this cell.

    Args:
        input_size: width of each context row (last axis of the padded batch).
        batch_size: chunk size used by `resize_input`.
        FRlist: formatted reviews used to initialise the naive-Bayes database.
        path: file the state dict is loaded from / saved to.
        refresh: when True, ignore an existing file at `path` and overwrite
            it with the freshly initialised weights.
    """
    def __init__(self, input_size, batch_size, FRlist, path, refresh = False):
        super(classifierModule, self).__init__()
        self.batch_size = batch_size
        self.input_size = input_size

        self.rnn_model = rnn.RNN_model(input_size)
        self.rnn_out_size = rnn.RNN_model.hidden_size
        self.rnn_mlp = mlp.mlp(format_module.FormattedReview.attribute_num, self.rnn_out_size)

        self.nb_model = naivebayesian.NaiveBayesianDB()
        self.nb_out_size = 1
        self.nb_mlp = mlp.mlp(format_module.FormattedReview.attribute_num, self.nb_out_size)

        self.cnn_model = cnn.ConvNet(input_size)
        self.cnn_out_size = cnn.ConvNet.output_vector_size
        self.cnn_mlp = mlp.mlp(format_module.FormattedReview.attribute_num, self.cnn_out_size)

        self.conclude = conclude.conclude()

        self.nb_model.add_FRlist(FRlist) #initialize nb database

        self.path = path

        # Resume from disk unless a refresh was requested; otherwise persist
        # the initial weights so that `path` always exists afterwards.
        if os.path.exists(path) and not refresh:
            self.load_state_dict(torch.load(path))
        else:
            self.save_state_dict()

    def save_state_dict(self):
        """Write the current state dict to `self.path`, creating its directory if needed."""
        directory = os.path.dirname(self.path)
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)
        torch.save(self.state_dict(), self.path)

    def encoder(self, formattedList):
        """Zero-pad the reviews' `context` arrays into one length-sorted batch.

        Reviews whose context is empty (`context.size == 0`) are kept as a
        single all-zero step with length 1. The padded width is derived from
        these adjusted lengths, so the time axis is never shorter than the
        longest reported length.

        Returns:
            (contexts, formatted, lengths) as produced by `sort_batch`.

        Raises:
            ValueError: if `formattedList` is empty.
        """
        if len(formattedList) == 0:
            raise ValueError("encoder() needs at least one formatted review")

        contexts = [formatted.context for formatted in formattedList]
        # Final per-review lengths, with empty contexts counted as one step.
        seq_lens = [1 if context.size == 0 else len(context) for context in contexts]
        max_len = max(seq_lens)

        data = np.zeros((len(contexts), max_len, self.input_size))
        for i, context in enumerate(contexts):
            if not (context.size == 0):
                data[i, :context.shape[0],:] = context

        return self.sort_batch(torch.FloatTensor(data), formattedList, torch.LongTensor(seq_lens))

    def sort_batch(self, context, formatted, seq_len):
        """Reorder a batch by decreasing sequence length.

        Returns:
            (Variable of sorted contexts, reviews in the same order, sorted lengths).
        """
        sorted_seq_len, sorted_idx = seq_len.sort(0, descending = True)

        sorted_context = context[sorted_idx]
        sorted_formatted = [formatted[i] for i in sorted_idx]

        return Variable(sorted_context), sorted_formatted, sorted_seq_len

    def resize_input(self, input):
        """Split `input` into consecutive chunks of at most `self.batch_size` items."""
        return [input[i:i+self.batch_size] for i in range(0, len(input), self.batch_size)]

    def _two_class_output(self, head, formatted, features):
        """Run one sub-model's head and return the two columns [1 - p, p]."""
        head_out = head(head.getdata(formatted, features))
        output_0to1 = torch.nn.functional.sigmoid(head_out)
        return torch.cat([1- output_0to1, output_0to1],1)

    def forward(self, formatted_list, hidden=None, mode = "Default"):
        """Score a list of formatted reviews.

        Args:
            formatted_list: reviews to score; they are re-ordered by
                `encoder`, so the output rows follow that sorted order.
            hidden: unused; kept so existing positional callers keep working.
            mode: "rnn", "cnn" or "nb" runs that single sub-model and
                returns [1 - p, p] per review; any other value runs all
                three and merges their head outputs through `self.conclude`.
        """
        context, formatted, lengths = self.encoder(formatted_list)

        if mode == "rnn":
            return self._two_class_output(self.rnn_mlp, formatted, self.rnn_model(context, lengths))

        if mode == "cnn":
            return self._two_class_output(self.cnn_mlp, formatted, self.cnn_model(context))

        if mode == "nb":
            return self._two_class_output(self.nb_mlp, formatted, self.nb_model.naive_bayes_FRlist(formatted))

        rnn_out = self.rnn_model(context, lengths)
        cnn_out = self.cnn_model(context)
        nb_out = self.nb_model.naive_bayes_FRlist(formatted)

        rnn_mlp_out = self.rnn_mlp(self.rnn_mlp.getdata(formatted, rnn_out))
        cnn_mlp_out = self.cnn_mlp(self.cnn_mlp.getdata(formatted, cnn_out))
        nb_mlp_out = self.nb_mlp(self.nb_mlp.getdata(formatted, nb_out))

        return self.conclude(self.conclude.bind(rnn_mlp_out, cnn_mlp_out, nb_mlp_out))

    def print_contribution(self):
        """Print the weight and bias returned by `self.conclude.get_contribution()`."""
        (weight, bias) = self.conclude.get_contribution()
        print("----------------- Current model contribution ----------------")
        print("-- rnn : ", weight.data[0][0])
        print("-- cnn : ", weight.data[0][1])
        print("-- nb : ", weight.data[0][2])
        print("-- bias : ", bias.data[0])
        print("-------------------------------------------------------------\n")

In [6]:
# Smoke test: build a classifier from the pickled validation reviews and
# show how the final layer currently weights the three sub-models.
formatted_list = load_object("../Preprocessor/pkl/save_formatted_review_validation.pkl")
# The trailing True is `refresh`: an existing ./models/asdf.mdl is ignored
# and overwritten with freshly initialised weights.
test_classifier = classifierModule(100, 100, formatted_list, "./models/asdf.mdl", True)

batch_list = test_classifier.resize_input(formatted_list)
# NOTE(review): in the disabled loop below, 'rnn' is passed positionally and
# therefore lands in forward()'s `hidden` parameter, not `mode`.
#for bl in batch_list:
#    print(test_classifier(bl, 'rnn'))

test_classifier.print_contribution()
'''
for param in test_classifier.parameters():
     print(type(param.data), param.size())
'''

----------------- Current model contribution ----------------
-- rnn :  0.33333298563957214
-- cnn :  0.33333298563957214
-- nb :  0.33333298563957214
-- bias :  
 0
[torch.FloatTensor of size 1]

-------------------------------------------------------------



'\nfor param in test_classifier.parameters():\n     print(type(param.data), param.size())\n'

In [7]:
# Print the attribute vector of every validation review.
# NOTE(review): this dumps the whole list; the recorded output is long
# enough to be truncated — consider printing only a small slice.
for fl in formatted_list:
    print(fl.get_attribute())

[ 0.52461901  1.          0.          0.          0.98584404  0.57142857
  0.        ]
[ 0.44409099  0.9         0.          0.          0.96203704  0.71428571
  0.        ]
[ 0.38268208  1.          0.          0.          0.8593285   0.          0.        ]
[ 0.99999996  1.          0.          0.          0.58349679  0.71428571
  0.        ]
[ 0.37101286  0.2         0.          0.          0.60976429  0.42857143
  0.        ]
[ 0.63866427  0.8         0.          0.          0.68537785  0.71428571
  0.        ]
[ 1.00000009  1.          0.          0.          0.70232824  0.          0.        ]
[ 0.36597644  1.          0.          0.          0.69675078  0.          0.        ]
[ 0.63081987  1.          0.          0.          0.5745995   0.14285714
  0.        ]
[ 0.31171294  1.          0.          0.          0.89258102  0.85714286
  0.        ]
[ 0.37099845  1.          0.          0.          0.68424167  0.42857143
  0.        ]
[ 0.99999997  1.          0.          0.      

[ 1.00000005  1.          0.          0.          0.96523241  0.71428571
  0.        ]
[ 0.45625271  0.2         0.          0.          0.65314125  0.57142857
  0.        ]
[ 0.56964927  1.          0.          0.          0.63951826  0.28571429
  0.        ]
[ 0.73063402  1.          0.          0.          0.06922906  0.57142857
  0.        ]
[ 1.00000009  1.          0.          0.          0.70252412  0.42857143
  0.        ]
[ 0.31219033  1.          0.          0.          0.40960331  0.57142857
  0.        ]
[ 0.99999996  1.          0.          0.          0.79274372  0.57142857
  0.        ]
[ 1.00000003  1.          0.          0.          0.84255507  0.85714286
  0.        ]
[ 0.39877411  1.          0.          0.          0.29318693  0.42857143
  0.        ]
[ 0.42512628  1.          0.          0.          0.88049769  0.          0.        ]
[ 0.45181832  0.4         0.          0.          0.61056289  0.85714286
  0.        ]
[ 0.33414736  0.2         0.          0.    

  0.        ]
[ 0.99999994  0.          0.          0.          0.03939815  0.85714286
  0.        ]
[-1.          0.9         0.          0.          0.50355602  0.28571429
  0.        ]
[ 1.00000003  1.          0.          0.          0.7254116   0.          0.        ]
[ 0.23836642  1.          0.          0.          0.91086411  0.14285714
  0.        ]
[ 0.46612759  1.          0.          0.          0.81911185  0.85714286
  0.        ]
[ 0.43230409  0.2         0.          0.          0.83715206  0.85714286
  0.        ]
[ 0.25505906  0.2         0.          0.          0.61554313  0.57142857
  0.        ]
[ 0.73063402  0.2         0.          0.          0.41893796  0.42857143
  0.        ]
[ 0.67145983  0.1         0.          0.          0.16351303  0.28571429
  0.        ]
[ 0.99999997  0.9         0.          0.          0.78523841  0.42857143
  0.        ]
[ 0.20863423  1.          0.          0.          0.48386785  0.42857143
  0.        ]
[ 1.00000009  1.          0.  

# Training

### Hyperparameters

1. learning_rate
2. input_size
3. rnn_output_size, cnn_output_size
4. batch_size
5. optimizer
6. loss function
7. n_epochs

In [7]:
# Training configuration; train_net() reads these module-level names.
learning_rate = 0.002
input_size = 100  # word2vec k size
batch_size = 100
n_epochs = 100

# Switch FormattedReview to the database stored under pkl/train.
reviewDB = format_module.ReviewDB("../Preprocessor/pkl/train")
format_module.FormattedReview.setDB(reviewDB)
# Only the first 100 training reviews are handed to the model, which uses
# them to initialise its naive-Bayes database.
FRlist = load_object("../Preprocessor/pkl/save_formatted_review_train.pkl")[:100]
# refresh=True: start from new weights and overwrite ./models/loss_1.mdl.
model = classifierModule(input_size, batch_size, FRlist, "./models/loss_1.mdl", True)
# Per-class weights: class index 1 counts six times as much as class 0.
# NOTE(review): CrossEntropyLoss applies log-softmax itself, while the
# single-model modes of forward() already return sigmoid probabilities —
# confirm this combination is intended.
criterion = nn.CrossEntropyLoss(torch.FloatTensor([1,6]))
#criterion = nn.SmoothL1Loss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [8]:
# Scoring rule (s = sensitivity, p = output column 1, t = target):
#   target 0: counted as correct when p is below 1 - s
#   target 1: counted as correct when p is above 1 - s
#   -> both cases collapse to (1 - s - p) * (t - 1/2) < 0
#      (a value of exactly 0 is counted as a miss)
def get_accuracy(outputs, targets, sensitivity):
    """Count correct predictions and the confusion-matrix cells of a batch.

    Args:
        outputs: model output; column 1 is compared against 1 - sensitivity.
        targets: 0/1 labels in the same row order as `outputs`.
        sensitivity: shifts the decision threshold to 1 - sensitivity.

    Returns:
        np.array([correct, tp, tn, fp, fn]) as raw counts.
    """
    centered = targets.data.type(torch.FloatTensor)-0.5
    margins = (1-sensitivity-outputs.data[:, 1])*centered
    tp = tn = fp = fn = 0
    for row, margin in enumerate(margins):
        if margin < 0:
            # correct side of the threshold
            if centered[row] > 0:
                tp += 1
            else:
                tn += 1
        elif centered[row] > 0:
            fn += 1
        else:
            fp += 1
    return np.array([tp + tn, tp, tn, fp, fn])
    
def get_targets(input, model, out = (1, 0)):
    """Build the target tensor for `input` in the model's batch order.

    The reviews are passed through `model.encoder` first so that the
    targets line up with the (re-ordered) rows the model outputs.

    Args:
        input: list of formatted reviews exposing a `label` attribute.
        model: object whose `encoder(input)` returns a 3-tuple with the
            re-ordered reviews in the middle position.
        out: pair `(value_if_label_truthy, value_if_label_falsy)`; any
            indexable of length two is accepted. The default is a tuple so
            no mutable object is shared between calls.

    Returns:
        A LongTensor Variable (requires_grad=False) with one entry per review.
    """
    _, batch, _ = model.encoder(input)
    targets = [out[0] if formatted.label else out[1] for formatted in batch]

    return Variable(torch.LongTensor(targets), requires_grad = False)

def get_prediction(outputs, sensitivity):
    """Turn column 1 of `outputs` into hard predictions.

    The value is shifted by `sensitivity - 1` plus a tiny constant and then
    rounded up, so an output exactly at 1 - sensitivity rounds up to 1.
    """
    tie_breaker = 0.000000001
    shifted = outputs.data[:, 1]+sensitivity-1
    return np.ceil(shifted+tie_breaker)

In [None]:
def train_net(train_list, validation_list, sensitivity = 0.5, run_mode = "default"):
    """Train the module-level `model` and report metrics after every epoch.

    Relies on names defined in other cells: `model`, `optimizer`,
    `criterion`, `n_epochs` and `reviewDB`. The model's state dict is saved
    after each epoch.

    Args:
        train_list: formatted reviews used for the optimisation steps.
        validation_list: formatted reviews scored once per epoch.
        sensitivity: forwarded to `get_accuracy` (threshold is 1 - sensitivity).
        run_mode: forwarded to the model as `mode`.
    """
    def ratio(numerator, denominator):
        # A class that never occurs yields 0.0 instead of a division by zero.
        return numerator / denominator if denominator != 0 else 0.0

    def f1_score(acc):
        # `acc` holds [correct, tp, tn, fp, fn] as fractions that add up to
        # one across tp/tn/fp/fn, hence 1 - tn == tp + fp + fn and
        # 2 / (1 + (1 - tn) / tp) == 2tp / (2tp + fp + fn).
        if acc[1] == 0:
            return 0.0
        return 2/( 1 + ((1-acc[2])/acc[1]) )

    batch_list = model.resize_input(train_list)

    # Record every training label in the review database.
    for input in train_list:
        reviewDB.add_spam_result(input.bookingReview.id, input.label)

    for epoch in range(n_epochs):
        time_1 = time.time()

        tacc = np.array([0, 0, 0, 0, 0])

        for bl in batch_list:
            outputs = model(bl, mode = run_mode)
            # get_targets re-encodes the batch so the labels follow the
            # same order as the rows of `outputs`.
            targets = get_targets(bl, model)

            optimizer.zero_grad()
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            tacc += get_accuracy(outputs, targets, sensitivity)

        # Counts -> fractions of the training set.
        tacc = tacc / len(train_list)

        time_2 = time.time()

        v_outputs = model(validation_list, mode = run_mode)
        v_targets = get_targets(validation_list, model)
        vacc = get_accuracy(v_outputs, v_targets, sensitivity) / len(validation_list)
        v_loss = criterion(v_outputs, v_targets)

        f1_train = f1_score(tacc)
        f1_val = f1_score(vacc)

        # NOTE(review): `loss` is the loss of the last training batch only,
        # and `.data[0]` assumes the loss tensor is indexable — confirm
        # against the installed torch version.
        print("epoch{:>3}: {} s" .format(epoch, time_2-time_1) )
        print("--------- train_acc {:.6f} | ham_acc {:.6f} | spam_acc {:.6f} | f1_score {:.6f} | loss.data {:.6f}"
             .format(tacc[0], ratio(tacc[2], tacc[2]+tacc[3]), ratio(tacc[1], tacc[1]+tacc[4]),
                     f1_train, loss.data[0]) )
        print("---- validation_acc {:.6f} | ham_acc {:.6f} | spam_acc {:.6f} | f1_score {:.6f} | loss.data {:.6f}"
             .format(vacc[0], ratio(vacc[2], vacc[2]+vacc[3]), ratio(vacc[1], vacc[1]+vacc[4]),
                     f1_val, v_loss.data[0]) )

        model.save_state_dict()

    print("Finished Training")

In [None]:
# Load the full pickled train/validation splits and start training with the
# default sensitivity (0.5) and run mode.
train_list = load_object("../Preprocessor/pkl/save_formatted_review_train.pkl")
validation_list = load_object("../Preprocessor/pkl/save_formatted_review_validation.pkl")
train_net(train_list, validation_list)

In [None]:
# Score the pickled test split; 'default' is none of 'rnn'/'cnn'/'nb', so
# the model takes its combined branch.
test_list = load_object("../Preprocessor/pkl/save_formatted_review_test.pkl")
out = model(test_list, mode = 'default')
# get_targets re-runs model.encoder, so `tg` follows the same length-sorted
# row order as `out`.
tg = get_targets(test_list, model)
#_, encoded, _ = model.encoder(test_list)

# [correct, tp, tn, fp, fn] as fractions of the test set.
print(get_accuracy(out, tg, 0.5)/len(test_list))
# Per-sample listing: column-1 output, target and the one-row accuracy
# vector; rows whose target is 1 are indented to stand out.
for i in range(len(test_list)):
    acc = get_accuracy(out[i:i+1], tg[i:i+1], 0.5)
    if tg.data[i]:
        print("            ",out.data[i, 1], tg.data[i], acc)
    else:
        print(out.data[i, 1], tg.data[i], acc)

## Code from
https://github.com/szagoruyko/functional-zoo/blob/master/visualize.py

In [15]:
# Shell escape: installs the graphviz Python package used by make_dot below
# (no version is pinned; the recorded output shows 0.8.1).
!pip install graphviz

Collecting graphviz
  Downloading graphviz-0.8.1-py2.py3-none-any.whl
Installing collected packages: graphviz
Successfully installed graphviz-0.8.1


In [21]:
from graphviz import Digraph
import torch
from torch.autograd import Variable
import sys
# make_dot walks the autograd graph recursively, one call per node, so the
# interpreter's recursion limit is raised for deep graphs.
sys.setrecursionlimit(10000)

def make_dot(var, params=None):
    """ Produces Graphviz representation of PyTorch autograd graph
    Blue nodes are the Variables that require grad, orange are Tensors
    saved for backward in torch.autograd.Function
    Args:
        var: output Variable; the graph is walked from its `grad_fn`
        params: optional dict of (name, Variable) used to label the nodes
            that require grad. Leaves that are not listed get an empty label.
    Returns:
        The populated graphviz Digraph.
    """
    param_map = {}
    if params is not None:
        # dict views cannot be indexed on Python 3, so every value is checked
        assert all(isinstance(p, Variable) for p in params.values())
        param_map = {id(v): k for k, v in params.items()}

    node_attr = dict(style='filled',
                     shape='box',
                     align='left',
                     fontsize='12',
                     ranksep='0.1',
                     height='0.2')
    dot = Digraph(node_attr=node_attr, graph_attr=dict(size="12,12"))
    seen = set()

    def size_to_str(size):
        return '('+(', ').join(['%d' % v for v in size])+')'

    def add_nodes(var):
        # Depth-first walk; `seen` keeps shared sub-graphs from being
        # emitted (and recursed into) more than once.
        if var not in seen:
            if torch.is_tensor(var):
                dot.node(str(id(var)), size_to_str(var.size()), fillcolor='orange')
            elif hasattr(var, 'variable'):
                u = var.variable
                # Unnamed leaves fall back to an empty label instead of a KeyError.
                name = param_map.get(id(u), '')
                node_name = '%s' % (name)
                dot.node(str(id(var)), node_name, fillcolor='lightblue')
            else:
                dot.node(str(id(var)), str(type(var).__name__))
            seen.add(var)
            if hasattr(var, 'next_functions'):
                for u in var.next_functions:
                    if u[0] is not None:
                        dot.edge(str(id(u[0])), str(id(var)))
                        add_nodes(u[0])
            if hasattr(var, 'saved_tensors'):
                for t in var.saved_tensors:
                    dot.edge(str(id(t)), str(id(var)))
                    add_nodes(t)
    add_nodes(var.grad_fn)
    return dot

In [27]:
# Run the first two validation reviews through test_classifier and build the
# autograd graph of the result.
out2 = test_classifier(load_object("../Preprocessor/pkl/save_formatted_review_validation.pkl")[0:2])
print(out2.size())

# The numbered prints are progress markers around the graph construction.
print(1)
a = make_dot(out2)
print(2)
#a.render('graph.pdf', view=True)
print(3)
# Printing the Digraph emits its DOT source (see the recorded output).
print(a)

torch.Size([2, 2])
1
2
3
digraph {
	graph [size="12,12"]
	node [align=left fontsize=12 height=0.2 ranksep=0.1 shape=box style=filled]
	140126523137384 [label=ConcatBackward]
	140126523137144 -> 140126523137384
	140126523137144 [label=SubConstantBackward]
	140126523136904 -> 140126523137144
	140126523136904 [label=SigmoidBackward]
	140126523136664 -> 140126523136904
	140126523136664 [label=ThresholdBackward]
	140126523136424 -> 140126523136664
	140126523136424 [label=AddmmBackward]
	140126432918048 -> 140126523136424
	140126432918048 [label="" fillcolor=lightblue]
	140126523135944 -> 140126523136424
	140126523135944 [label=ConcatBackward]
	140126523132568 -> 140126523135944
	140126523132568 [label=AddmmBackward]
	140126435608728 -> 140126523132568
	140126435608728 [label="" fillcolor=lightblue]
	140126523132088 -> 140126523132568
	140126523132088 [label=ThresholdBackward]
	140126523131848 -> 140126523132088
	140126523131848 [label=AddmmBackward]
	140126412278864 -> 140126523131848
	1401

In [None]:
# Bare expression: shows `a` as the cell's output.
# NOTE(review): depends on `a` still being the Digraph from the graph cell;
# later scratch cells rebind `a` to plain lists.
a

In [37]:
# Scratch check: order the items of `a` by the keys in `c`, then flip the
# result to get descending key order.
a = ["ㄱ", "ㄴ", "ㄷ", "ㄹ", "ㅁ"]
b = [4, 3, 2, 1, 0]
c = [4, 2, 1, 0, 3]

ranked = sorted(zip(c, a))
d = [letter for _rank, letter in ranked]
print(d)
d.reverse()
print(d)

['ㄹ', 'ㄷ', 'ㄴ', 'ㅁ', 'ㄱ']
['ㄱ', 'ㅁ', 'ㄴ', 'ㄷ', 'ㄹ']


In [10]:
# Scratch check of the slicing used for batching: consecutive chunks of
# three, with a shorter final chunk.
a = list(range(1, 11))
chunk = 3
a_ = []
for i in range(0, len(a), chunk):
    a_ += [a[i:i + chunk]]
print(a_)

[[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]]
