# Setting

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

import MeCab

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset, DataLoader

In [2]:
print('Pytorch version: ', torch.__version__)
print('Currently selected device: ', torch.cuda.current_device())
print('# GPUs available: ', torch.cuda.device_count())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device('cpu') # デバッグ用
print(device)

Pytorch version:  1.1.0
Currently selected device:  0
# GPUs available:  2
cuda:0


# Dataset作成に必要なclassおよびfunction

## 分かち書き

In [3]:
dict_path = '-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd'

def get_tango(sen):
    word_list = []
    tagger = MeCab.Tagger(dict_path)
    for word_line in tagger.parse(sen).split("\n"):
        if word_line.strip() == "EOS":
            break
        (word, temp) = word_line.split("\t")
        temps = temp.split(',')
        if "記号" == temps[0]:
            continue
        if "数" == temps[1]:
            continue
        word_list.append(word)
    return word_list

## torknize

In [4]:
def df2input(df, vocab_idx):
    data = []
    for text in df.values:
        words = get_tango(text)
        data.append([vocab_idx[word] for word in words if word in vocab_idx.keys()])
    return data

## Padding

In [5]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, data, tags):
        super(MyDataset, self).__init__()
        assert len(data) == len(tags)
        # npに変換し、0埋めを行う
        max_length = max([len(d) for d in data])
        self.data = np.zeros((len(tags), max_length))
        for i, d1 in enumerate(data):
            for l, d2 in enumerate(d1):
                self.data[i][l] = d2
        self.tags = tags

    def __len__(self):
        return len(self.tags)

    def __getitem__(self, index):
        return self.data[index], self.tags[index]

# Model

## LSTM

In [6]:
class LSTM(nn.Module):
    def __init__(self, batch_size, vocab_size, emb_dim, hidden_dim, dropout_rate=0.0, activate='tanh', bidirectional=False, device='cpu'):
        super(LSTM, self).__init__()
        
        self.vocab_size = vocab_size
        self.emb_dim    = emb_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.bidirectional = bidirectional
        self.activate   = activate
        
        self.emb  = nn.Embedding(self.vocab_size, self.emb_dim)
        self.lstm = nn.LSTM(self.emb_dim, self.hidden_dim, batch_first=True, bidirectional=self.bidirectional)
        
        self.fc0 = nn.Linear(hidden_dim * 2, 100)
        self.fc1 = nn.Linear(100, 2)
        self.do  = nn.Dropout(dropout_rate)
        self.device = device
        self.hidden = self.init_hidden()

    def forward(self, x):

        x = self.emb(x)
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        y = self.fc0(torch.cat([self.hidden[0][-1], self.hidden[0][-2]], 1))
        y = self.do(y)
        if self.activate == 'tanh':
            y = self.fc1(F.tanh(y))
        elif self.activate == 'relu':
            y = self.fc1(F.relu(y))
        tag_scores = F.log_softmax(y)
        return tag_scores

    def init_hidden(self):
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        num = 2 if self.bidirectional else 1    # bidirectionalのとき2
        h0 = torch.zeros(num, self.batch_size, self.hidden_dim).to(self.device)
        c0 = torch.zeros(num, self.batch_size, self.hidden_dim).to(self.device)
        return (h0, c0)

## Train

In [7]:
def training(net, train_loader, epoch_num):

    for epoch in range(epoch_num):

        train_loss = 0.0
        train_acc  = 0.0

        # train====================
        net.train()
        for xx, yy in train_loader:
            xx, yy = xx.long().to(device), yy.to(device)

            net.batch_size = len(yy)
            net.hidden = net.init_hidden()

            optimizer.zero_grad()    # 勾配の初期化

            output = net(xx)
            loss   = criterion(output, yy)

            train_loss += loss.item()
            train_acc += (output.max(1)[1] == yy).sum().item()

            loss.backward(retain_graph=True)     # 逆伝播の計算
            optimizer.step()    # 勾配の更新

## Test

In [8]:
def test(net, test_loader, y_test):
    net.eval()
    y_pred = []
    with torch.no_grad():
        for xx, yy in test_loader:
            xx, yy = xx.long().to(device), yy.to(device)

            net.batch_size = len(yy)
            net.hidden = net.init_hidden()

            output = net(xx)
            y_pred += output.data.max(1, keepdim=True)[1].to('cpu').numpy()[:,0].tolist()

    acc = (y_pred == y_test).sum().item() / len(y_test)
    result = precision_recall_fscore_support(y_test, y_pred, average='macro')
    return [acc, result[0], result[1], result[2]]

# Results of GridSearch

In [9]:
gs_df = pd.read_csv('results/gridsearch_lstm_end2end.csv').sort_values(by=['f1'], ascending=False)
gs_df.head(3)

Unnamed: 0,epoch,batch_size,embedding_dim,hidden_dim,activate_func,learning_rate,l2_regular,dropout_rate,accuracy,precision,recall,f1
59,300,64,100,200,tanh,0.01,0.001,0.0,0.822355,0.818589,0.818205,0.818393
83,300,64,100,200,tanh,0.01,0.001,0.5,0.816367,0.812124,0.814167,0.813023
71,300,64,100,200,relu,0.01,0.001,0.0,0.814371,0.810422,0.810048,0.810231


In [10]:
gs_df = pd.read_csv('results/gridsearch_lstm_end2end.csv').sort_values(by=['accuracy'], ascending=False)
gs_df.head(3)

Unnamed: 0,epoch,batch_size,embedding_dim,hidden_dim,activate_func,learning_rate,l2_regular,dropout_rate,accuracy,precision,recall,f1
59,300,64,100,200,tanh,0.01,0.001,0.0,0.822355,0.818589,0.818205,0.818393
83,300,64,100,200,tanh,0.01,0.001,0.5,0.816367,0.812124,0.814167,0.813023
71,300,64,100,200,relu,0.01,0.001,0.0,0.814371,0.810422,0.810048,0.810231


# Experiments

## Case1
the random seed of train_test_split is 2019

In [11]:
train_df = pd.read_csv('/home/b2018yniki/data/nikkei/train.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
test_df  = pd.read_csv('/home/b2018yniki/data/nikkei/test.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
total_df = pd.concat([train_df, test_df], ignore_index=True).drop_duplicates()

X = total_df.body
y = total_df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2019)
del train_df, test_df, total_df, X, y

vocab = []
for text in X_train.values:
    vocab.extend(get_tango(text))
vocab = list(set(vocab))
print('vocabulaly size: {}'.format(len(vocab)))
vocab_idx = dict(zip(vocab, range(len(vocab))))
del vocab

X_train = df2input(X_train, vocab_idx)
X_test  = df2input(X_test, vocab_idx)

train_ds = MyDataset(X_train, y_train.values)
test_ds  = MyDataset(X_test, y_test.values)

del X_train, X_test

vocabulaly size: 9394


In [12]:
np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

# hyperparameter
epoch      = 300
batch_size = 64
vocab_size = len(vocab_idx)
emb_dim    = 100
hidden_dim = 200
activate   = 'tanh'
drop_rate  = 0.0
lr = 0.01
l2 = 0.001

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=2)

np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)

criterion = nn.NLLLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, weight_decay=l2)

print(net)

training(net, train_loader, epoch)
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

LSTM(
  (emb): Embedding(9394, 100)
  (lstm): LSTM(100, 200, batch_first=True, bidirectional=True)
  (fc0): Linear(in_features=400, out_features=100, bias=True)
  (fc1): Linear(in_features=100, out_features=2, bias=True)
  (do): Dropout(p=0.0)
)




Accuracy: 0.8023952095808383, Precision: 0.7990276862228082, Recall: 0.7954345631573805, F1: 0.7969559148016392


In [14]:
torch.save(net.state_dict(), 'best_params/lstm_s2019.prm')
del net, criterion, optimizer, vocab_idx

In [15]:
net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)
net.load_state_dict(torch.load('best_params/lstm_s2019.prm'))
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

del net, train_ds, test_ds, train_loader, test_loader

Accuracy: 0.8023952095808383, Precision: 0.7990276862228082, Recall: 0.7954345631573805, F1: 0.7969559148016392




## Case2
the random seed of train_test_split is 2020

In [11]:
train_df = pd.read_csv('/home/b2018yniki/data/nikkei/train.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
test_df  = pd.read_csv('/home/b2018yniki/data/nikkei/test.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
total_df = pd.concat([train_df, test_df], ignore_index=True).drop_duplicates()

X = total_df.body
y = total_df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2020)
del train_df, test_df, total_df, X, y

vocab = []
for text in X_train.values:
    vocab.extend(get_tango(text))
vocab = list(set(vocab))
print('vocabulaly size: {}'.format(len(vocab)))
vocab_idx = dict(zip(vocab, range(len(vocab))))
del vocab

X_train = df2input(X_train, vocab_idx)
X_test  = df2input(X_test, vocab_idx)

train_ds = MyDataset(X_train, y_train.values)
test_ds  = MyDataset(X_test, y_test.values)

del X_train, X_test

vocabulaly size: 9383


In [12]:
np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

# hyperparameter
epoch      = 300
batch_size = 64
vocab_size = len(vocab_idx)
emb_dim    = 100
hidden_dim = 200
activate   = 'tanh'
drop_rate  = 0.0
lr = 0.01
l2 = 0.001

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=2)

np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)

criterion = nn.NLLLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, weight_decay=l2)

print(net)

training(net, train_loader, epoch)
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

LSTM(
  (emb): Embedding(9383, 100)
  (lstm): LSTM(100, 200, batch_first=True, bidirectional=True)
  (fc0): Linear(in_features=400, out_features=100, bias=True)
  (fc1): Linear(in_features=100, out_features=2, bias=True)
  (do): Dropout(p=0.0)
)




Accuracy: 0.688622754491018, Precision: 0.7788379073756433, Recall: 0.647406004080443, F1: 0.6260049000842203


In [13]:
torch.save(net.state_dict(), 'best_params/lstm_s2020.prm')
del net, criterion, optimizer, vocab_idx

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)
net.load_state_dict(torch.load('best_params/lstm_s2020.prm'))
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

del net, train_ds, test_ds, train_loader, test_loader



Accuracy: 0.688622754491018, Precision: 0.7788379073756433, Recall: 0.647406004080443, F1: 0.6260049000842203


## Case3
the random seed of train_test_split is 1996

In [14]:
train_df = pd.read_csv('/home/b2018yniki/data/nikkei/train.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
test_df  = pd.read_csv('/home/b2018yniki/data/nikkei/test.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
total_df = pd.concat([train_df, test_df], ignore_index=True).drop_duplicates()

X = total_df.body
y = total_df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1996)
del train_df, test_df, total_df, X, y

vocab = []
for text in X_train.values:
    vocab.extend(get_tango(text))
vocab = list(set(vocab))
print('vocabulaly size: {}'.format(len(vocab)))
vocab_idx = dict(zip(vocab, range(len(vocab))))
del vocab

X_train = df2input(X_train, vocab_idx)
X_test  = df2input(X_test, vocab_idx)

train_ds = MyDataset(X_train, y_train.values)
test_ds  = MyDataset(X_test, y_test.values)

del X_train, X_test

vocabulaly size: 9282


In [15]:
np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

# hyperparameter
epoch      = 300
batch_size = 64
vocab_size = len(vocab_idx)
emb_dim    = 100
hidden_dim = 200
activate   = 'tanh'
drop_rate  = 0.0
lr = 0.01
l2 = 0.001

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=2)

np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)

criterion = nn.NLLLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, weight_decay=l2)

print(net)

training(net, train_loader, epoch)
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

LSTM(
  (emb): Embedding(9282, 100)
  (lstm): LSTM(100, 200, batch_first=True, bidirectional=True)
  (fc0): Linear(in_features=400, out_features=100, bias=True)
  (fc1): Linear(in_features=100, out_features=2, bias=True)
  (do): Dropout(p=0.0)
)




Accuracy: 0.812375249500998, Precision: 0.8084306510958246, Recall: 0.8159205610539424, F1: 0.8099147534631406


In [16]:
torch.save(net.state_dict(), 'best_params/lstm_s1996.prm')
del net, criterion, optimizer

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)
net.load_state_dict(torch.load('best_params/lstm_s1996.prm'))
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

del net, train_ds, test_ds, train_loader, test_loader



Accuracy: 0.812375249500998, Precision: 0.8084306510958246, Recall: 0.8159205610539424, F1: 0.8099147534631406


## Case4
the random seed of train_test_split is 1192

In [17]:
train_df = pd.read_csv('/home/b2018yniki/data/nikkei/train.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
test_df  = pd.read_csv('/home/b2018yniki/data/nikkei/test.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
total_df = pd.concat([train_df, test_df], ignore_index=True).drop_duplicates()

X = total_df.body
y = total_df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1192)
del train_df, test_df, total_df, X, y

vocab = []
for text in X_train.values:
    vocab.extend(get_tango(text))
vocab = list(set(vocab))
print('vocabulaly size: {}'.format(len(vocab)))
vocab_idx = dict(zip(vocab, range(len(vocab))))
del vocab

X_train = df2input(X_train, vocab_idx)
X_test  = df2input(X_test, vocab_idx)

train_ds = MyDataset(X_train, y_train.values)
test_ds  = MyDataset(X_test, y_test.values)

del X_train, X_test

vocabulaly size: 9420


In [18]:
np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

# hyperparameter
epoch      = 300
batch_size = 64
vocab_size = len(vocab_idx)
emb_dim    = 100
hidden_dim = 200
activate   = 'tanh'
drop_rate  = 0.0
lr = 0.01
l2 = 0.001

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=2)

np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)

criterion = nn.NLLLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, weight_decay=l2)

print(net)

training(net, train_loader, epoch)
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

LSTM(
  (emb): Embedding(9420, 100)
  (lstm): LSTM(100, 200, batch_first=True, bidirectional=True)
  (fc0): Linear(in_features=400, out_features=100, bias=True)
  (fc1): Linear(in_features=100, out_features=2, bias=True)
  (do): Dropout(p=0.0)
)




Accuracy: 0.8083832335329342, Precision: 0.8034973034809609, Recall: 0.8034973034809609, F1: 0.8034973034809609


In [19]:
torch.save(net.state_dict(), 'best_params/lstm_s1192.prm')
del net, criterion, optimizer, vocab_idx

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)
net.load_state_dict(torch.load('best_params/lstm_s1192.prm'))
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

del net, train_ds, test_ds, train_loader, test_loader

Accuracy: 0.8083832335329342, Precision: 0.8034973034809609, Recall: 0.8034973034809609, F1: 0.8034973034809609




## Case5
the random seed of train_test_split is 794

In [20]:
train_df = pd.read_csv('/home/b2018yniki/data/nikkei/train.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
test_df  = pd.read_csv('/home/b2018yniki/data/nikkei/test.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
total_df = pd.concat([train_df, test_df], ignore_index=True).drop_duplicates()

X = total_df.body
y = total_df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=794)
del train_df, test_df, total_df, X, y

vocab = []
for text in X_train.values:
    vocab.extend(get_tango(text))
vocab = list(set(vocab))
print('vocabulaly size: {}'.format(len(vocab)))
vocab_idx = dict(zip(vocab, range(len(vocab))))
del vocab

X_train = df2input(X_train, vocab_idx)
X_test  = df2input(X_test, vocab_idx)

train_ds = MyDataset(X_train, y_train.values)
test_ds  = MyDataset(X_test, y_test.values)

del X_train, X_test

vocabulaly size: 9359


In [21]:
np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

# hyperparameter
epoch      = 300
batch_size = 64
vocab_size = len(vocab_idx)
emb_dim    = 100
hidden_dim = 200
activate   = 'tanh'
drop_rate  = 0.0
lr = 0.01
l2 = 0.001

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=2)

np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)

criterion = nn.NLLLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, weight_decay=l2)

print(net)

training(net, train_loader, epoch)
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

LSTM(
  (emb): Embedding(9359, 100)
  (lstm): LSTM(100, 200, batch_first=True, bidirectional=True)
  (fc0): Linear(in_features=400, out_features=100, bias=True)
  (fc1): Linear(in_features=100, out_features=2, bias=True)
  (do): Dropout(p=0.0)
)




Accuracy: 0.8083832335329342, Precision: 0.8044232922732363, Recall: 0.8118158877892115, F1: 0.8058703865155479


In [22]:
torch.save(net.state_dict(), 'best_params/lstm_s794.prm')
del net, criterion, optimizer, vocab_idx

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)
net.load_state_dict(torch.load('best_params/lstm_s794.prm'))
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

del net, train_ds, test_ds, train_loader, test_loader



Accuracy: 0.8083832335329342, Precision: 0.8044232922732363, Recall: 0.8118158877892115, F1: 0.8058703865155479


## Case6
the random seed of train_test_split is 2000

In [23]:
train_df = pd.read_csv('/home/b2018yniki/data/nikkei/train.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
test_df  = pd.read_csv('/home/b2018yniki/data/nikkei/test.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
total_df = pd.concat([train_df, test_df], ignore_index=True).drop_duplicates()

X = total_df.body
y = total_df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2000)
del train_df, test_df, total_df, X, y

vocab = []
for text in X_train.values:
    vocab.extend(get_tango(text))
vocab = list(set(vocab))
print('vocabulaly size: {}'.format(len(vocab)))
vocab_idx = dict(zip(vocab, range(len(vocab))))
del vocab

X_train = df2input(X_train, vocab_idx)
X_test  = df2input(X_test, vocab_idx)

train_ds = MyDataset(X_train, y_train.values)
test_ds  = MyDataset(X_test, y_test.values)

del X_train, X_test

vocabulaly size: 9413


In [24]:
np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

# hyperparameter
epoch      = 300
batch_size = 64
vocab_size = len(vocab_idx)
emb_dim    = 100
hidden_dim = 200
activate   = 'tanh'
drop_rate  = 0.0
lr = 0.01
l2 = 0.001

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=2)

np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)

criterion = nn.NLLLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, weight_decay=l2)

print(net)

training(net, train_loader, epoch)
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

LSTM(
  (emb): Embedding(9413, 100)
  (lstm): LSTM(100, 200, batch_first=True, bidirectional=True)
  (fc0): Linear(in_features=400, out_features=100, bias=True)
  (fc1): Linear(in_features=100, out_features=2, bias=True)
  (do): Dropout(p=0.0)
)




Accuracy: 0.8043912175648703, Precision: 0.801758476634606, Recall: 0.801163542340013, F1: 0.8014477515367195


In [25]:
torch.save(net.state_dict(), 'best_params/lstm_s2000.prm')
del net, criterion, optimizer, vocab_idx

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)
net.load_state_dict(torch.load('best_params/lstm_s2000.prm'))
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

del net, train_ds, test_ds, train_loader, test_loader



Accuracy: 0.8043912175648703, Precision: 0.801758476634606, Recall: 0.801163542340013, F1: 0.8014477515367195


## Case7
the random seed of train_test_split is 1945

In [11]:
train_df = pd.read_csv('/home/b2018yniki/data/nikkei/train.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
test_df  = pd.read_csv('/home/b2018yniki/data/nikkei/test.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
total_df = pd.concat([train_df, test_df], ignore_index=True).drop_duplicates()

X = total_df.body
y = total_df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1945)
del train_df, test_df, total_df, X, y

vocab = []
for text in X_train.values:
    vocab.extend(get_tango(text))
vocab = list(set(vocab))
print('vocabulaly size: {}'.format(len(vocab)))
vocab_idx = dict(zip(vocab, range(len(vocab))))
del vocab

X_train = df2input(X_train, vocab_idx)
X_test  = df2input(X_test, vocab_idx)

train_ds = MyDataset(X_train, y_train.values)
test_ds  = MyDataset(X_test, y_test.values)

del X_train, X_test

vocabulaly size: 9408


In [12]:
np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

# hyperparameter
epoch      = 300
batch_size = 64
vocab_size = len(vocab_idx)
emb_dim    = 100
hidden_dim = 200
activate   = 'tanh'
drop_rate  = 0.0
lr = 0.01
l2 = 0.001

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=2)

np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)

criterion = nn.NLLLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, weight_decay=l2)

print(net)

training(net, train_loader, epoch)
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

LSTM(
  (emb): Embedding(9408, 100)
  (lstm): LSTM(100, 200, batch_first=True, bidirectional=True)
  (fc0): Linear(in_features=400, out_features=100, bias=True)
  (fc1): Linear(in_features=100, out_features=2, bias=True)
  (do): Dropout(p=0.0)
)




Accuracy: 0.8043912175648703, Precision: 0.8148032426954906, Recall: 0.7981842905135179, F1: 0.7999070813771518


In [13]:
torch.save(net.state_dict(), 'best_params/lstm_s1945.prm')
del net, criterion, optimizer, vocab_idx

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)
net.load_state_dict(torch.load('best_params/lstm_s1945.prm'))
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

del net, train_ds, test_ds, train_loader, test_loader

Accuracy: 0.8043912175648703, Precision: 0.8148032426954906, Recall: 0.7981842905135179, F1: 0.7999070813771518




## Case8
the random seed of train_test_split is 5748

In [11]:
train_df = pd.read_csv('/home/b2018yniki/data/nikkei/train.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
test_df  = pd.read_csv('/home/b2018yniki/data/nikkei/test.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
total_df = pd.concat([train_df, test_df], ignore_index=True).drop_duplicates()

X = total_df.body
y = total_df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=5748)
del train_df, test_df, total_df, X, y

vocab = []
for text in X_train.values:
    vocab.extend(get_tango(text))
vocab = list(set(vocab))
print('vocabulaly size: {}'.format(len(vocab)))
vocab_idx = dict(zip(vocab, range(len(vocab))))
del vocab

X_train = df2input(X_train, vocab_idx)
X_test  = df2input(X_test, vocab_idx)

train_ds = MyDataset(X_train, y_train.values)
test_ds  = MyDataset(X_test, y_test.values)

del X_train, X_test

vocabulaly size: 9360


In [12]:
np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

# hyperparameter
epoch      = 300
batch_size = 64
vocab_size = len(vocab_idx)
emb_dim    = 100
hidden_dim = 200
activate   = 'tanh'
drop_rate  = 0.0
lr = 0.01
l2 = 0.001

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=2)

np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)

criterion = nn.NLLLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, weight_decay=l2)

print(net)

training(net, train_loader, epoch)
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

LSTM(
  (emb): Embedding(9360, 100)
  (lstm): LSTM(100, 200, batch_first=True, bidirectional=True)
  (fc0): Linear(in_features=400, out_features=100, bias=True)
  (fc1): Linear(in_features=100, out_features=2, bias=True)
  (do): Dropout(p=0.0)
)




Accuracy: 0.782435129740519, Precision: 0.8004494198398431, Recall: 0.798314078016486, F1: 0.7824039208654594


In [13]:
torch.save(net.state_dict(), 'best_params/lstm_s5748.prm')
del net, criterion, optimizer, vocab_idx

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)
net.load_state_dict(torch.load('best_params/lstm_s5748.prm'))
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

del net, train_ds, test_ds, train_loader, test_loader

Accuracy: 0.782435129740519, Precision: 0.8004494198398431, Recall: 0.798314078016486, F1: 0.7824039208654594




## Case9
the random seed of train_test_split is 7248

In [14]:
train_df = pd.read_csv('/home/b2018yniki/data/nikkei/train.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
test_df  = pd.read_csv('/home/b2018yniki/data/nikkei/test.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
total_df = pd.concat([train_df, test_df], ignore_index=True).drop_duplicates()

X = total_df.body
y = total_df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7248)
del train_df, test_df, total_df, X, y

vocab = []
for text in X_train.values:
    vocab.extend(get_tango(text))
vocab = list(set(vocab))
print('vocabulaly size: {}'.format(len(vocab)))
vocab_idx = dict(zip(vocab, range(len(vocab))))
del vocab

X_train = df2input(X_train, vocab_idx)
X_test  = df2input(X_test, vocab_idx)

train_ds = MyDataset(X_train, y_train.values)
test_ds  = MyDataset(X_test, y_test.values)

del X_train, X_test

vocabulaly size: 9450


In [15]:
np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

# hyperparameter
epoch      = 300
batch_size = 64
vocab_size = len(vocab_idx)
emb_dim    = 100
hidden_dim = 200
activate   = 'tanh'
drop_rate  = 0.0
lr = 0.01
l2 = 0.001

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=2)

np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)

criterion = nn.NLLLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, weight_decay=l2)

print(net)

training(net, train_loader, epoch)
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

LSTM(
  (emb): Embedding(9450, 100)
  (lstm): LSTM(100, 200, batch_first=True, bidirectional=True)
  (fc0): Linear(in_features=400, out_features=100, bias=True)
  (fc1): Linear(in_features=100, out_features=2, bias=True)
  (do): Dropout(p=0.0)
)




Accuracy: 0.8283433133732535, Precision: 0.8213611969425922, Recall: 0.8266522299732204, F1: 0.8234990496165695


In [16]:
torch.save(net.state_dict(), 'best_params/lstm_s7248.prm')
del net, criterion, optimizer, vocab_idx

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)
net.load_state_dict(torch.load('best_params/lstm_s7248.prm'))
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

del net, train_ds, test_ds, train_loader, test_loader

Accuracy: 0.8283433133732535, Precision: 0.8213611969425922, Recall: 0.8266522299732204, F1: 0.8234990496165695




## Case10
the random seed of train_test_split is 8787

In [11]:
train_df = pd.read_csv('/home/b2018yniki/data/nikkei/train.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
test_df  = pd.read_csv('/home/b2018yniki/data/nikkei/test.txt', sep='\t', header=None, names=['target', 'time', 'body']).drop('time', axis=1)
total_df = pd.concat([train_df, test_df], ignore_index=True).drop_duplicates()

X = total_df.body
y = total_df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=8787)
del train_df, test_df, total_df, X, y

vocab = []
for text in X_train.values:
    vocab.extend(get_tango(text))
vocab = list(set(vocab))
print('vocabulaly size: {}'.format(len(vocab)))
vocab_idx = dict(zip(vocab, range(len(vocab))))
del vocab

X_train = df2input(X_train, vocab_idx)
X_test  = df2input(X_test, vocab_idx)

train_ds = MyDataset(X_train, y_train.values)
test_ds  = MyDataset(X_test, y_test.values)

del X_train, X_test

vocabulaly size: 9466


In [12]:
np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

# hyperparameter
epoch      = 300
batch_size = 64
vocab_size = len(vocab_idx)
emb_dim    = 100
hidden_dim = 200
activate   = 'tanh'
drop_rate  = 0.0
lr = 0.01
l2 = 0.001

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=2)

np.random.seed(2019)
np.random.RandomState(2019)
torch.manual_seed(2019)

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)

criterion = nn.NLLLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, weight_decay=l2)

print(net)

training(net, train_loader, epoch)
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

LSTM(
  (emb): Embedding(9466, 100)
  (lstm): LSTM(100, 200, batch_first=True, bidirectional=True)
  (fc0): Linear(in_features=400, out_features=100, bias=True)
  (fc1): Linear(in_features=100, out_features=2, bias=True)
  (do): Dropout(p=0.0)
)




Accuracy: 0.8163672654690619, Precision: 0.8098249983614079, Recall: 0.8116018457481873, F1: 0.8106575963718821


In [13]:
torch.save(net.state_dict(), 'best_params/lstm_s8787.prm')
del net, criterion, optimizer, vocab_idx

net = LSTM(batch_size, vocab_size, emb_dim, hidden_dim, drop_rate, activate, bidirectional=True, device=device).to(device)
net.load_state_dict(torch.load('best_params/lstm_s8787.prm'))
result = test(net, test_loader, y_test)
print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(result[0], result[1], result[2], result[3]))

del net, train_ds, test_ds, train_loader, test_loader

Accuracy: 0.8163672654690619, Precision: 0.8098249983614079, Recall: 0.8116018457481873, F1: 0.8106575963718821


