# Recurrent Neural Networks




In [None]:
# Base directory used by the cells below for the dataset files, the word2vec model and predict.csv.
path_prefix = './'

### Download Dataset

In [None]:
# Download the dataset archive from Google Drive, extract it into the working directory, then list the files.
!gdown --id '1lz0Wtwxsh5YCPdqQ3E3l_nbfJT1N13V8' --output data.zip
!unzip data.zip
!ls

Downloading...
From: https://drive.google.com/uc?id=1lz0Wtwxsh5YCPdqQ3E3l_nbfJT1N13V8
To: /content/data.zip
45.1MB [00:00, 87.7MB/s]
Archive:  data.zip
  inflating: training_label.txt      
  inflating: testing_data.txt        
  inflating: training_nolabel.txt    
data.zip     testing_data.txt	 training_nolabel.txt
sample_data  training_label.txt


In [None]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

### Utils

In [None]:
# utils.py
# 這個 block 用來先定義一些等等常用到的函式
import torch
import numpy as np
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F

def load_training_data(path='training_label.txt'):
    """Read a training file and split every line into space-separated tokens.

    Args:
        path: File to read. When the path contains 'training_label' the file is
            treated as labelled; any other path is treated as unlabelled.

    Returns:
        Labelled file: a tuple ``(x, y)`` where ``y[i]`` is the first token of
        line ``i`` (the label, still a string) and ``x[i]`` is the list of
        tokens from the third one onwards (the second token is skipped).
        Unlabelled file: a list with one token list per line.
    """
    with open(path, 'r') as handle:
        rows = [raw.strip('\n').split(' ') for raw in handle]
    if 'training_label' not in path:
        # Unlabelled data: every token belongs to the sentence.
        return rows
    x, y = [], []
    for tokens in rows:
        y.append(tokens[0])
        x.append(tokens[2:])
    return x, y

def load_testing_data(path='testing_data'):
    """Read the test file and return one token list per sentence.

    The first line of the file is skipped. On every other line the text before
    the first comma is dropped, the remaining comma-separated pieces are glued
    back together without the commas, surrounding whitespace is stripped and
    the result is split on single spaces.

    NOTE(review): the default path has no '.txt' extension, unlike the file name
    the notebook passes explicitly - confirm before relying on the default.

    Args:
        path: File to read.

    Returns:
        A list of token lists, one per data line.
    """
    with open(path, 'r') as handle:
        rows = handle.readlines()[1:]  # drop the first line
    sentences = []
    for row in rows:
        _, *fragments = row.strip('\n').split(",")
        text = "".join(fragments).strip()
        sentences.append(text.split(' '))
    return sentences

def evaluation(outputs, labels):
    """Count how many thresholded predictions agree with the labels.

    Values >= 0.5 are treated as class 1 and values < 0.5 as class 0.
    The thresholding is done on a copy, so the caller's ``outputs`` tensor is
    left untouched (the previous implementation overwrote it in place).

    Args:
        outputs: Tensor of predicted probabilities.
        labels: Tensor of 0/1 labels, comparable element-wise with ``outputs``.

    Returns:
        The number of matching positions as a Python int.
    """
    # Work on a detached copy so neither the caller's data nor its autograd graph is affected.
    predictions = outputs.detach().clone()
    predictions[outputs >= 0.5] = 1
    predictions[outputs < 0.5] = 0
    correct = torch.sum(torch.eq(predictions, labels)).item()
    return correct

### Train Word to Vector

In [None]:
import os
import numpy as np
import pandas as pd
import argparse
from gensim.models import word2vec

def train_word2vec(x):
    """Train a skip-gram word2vec model on tokenised sentences.

    gensim 4 renamed the ``size`` and ``iter`` keyword arguments to
    ``vector_size`` and ``epochs``; the right spelling is chosen from the
    installed gensim major version so the same call works on both.

    Args:
        x: Iterable of sentences, each a list of string tokens.

    Returns:
        The trained ``Word2Vec`` model (250-dimensional vectors).
    """
    import gensim  # local import: only needed to read the installed version

    # window: context size; min_count: words seen fewer times are dropped from the vocabulary;
    # sg=1 selects the skip-gram training algorithm.
    params = dict(window=5, min_count=5, workers=12, sg=1)
    if int(gensim.__version__.split('.')[0]) >= 4:
        params.update(vector_size=250, epochs=10)
    else:
        params.update(size=250, iter=10)
    model = word2vec.Word2Vec(x, **params)
    return model

# Train the word2vec embedding on the labelled training sentences plus the test sentences and save it.
if __name__ == "__main__":
    print("loading training data ...")
    # The labels returned here are not used for word2vec training.
    train_x, y = load_training_data('training_label.txt')
    # The unlabelled sentences are loaded but left out of the corpus used below.
    train_x_no_label = load_training_data('training_nolabel.txt')

    print("loading testing data ...")
    test_x = load_testing_data('testing_data.txt')

    # Alternative kept for reference: also include the unlabelled sentences in the corpus.
    #model = train_word2vec(train_x + train_x_no_label + test_x)
    model = train_word2vec(train_x + test_x)
    
    print("saving model ...")
    # model.save(os.path.join(path_prefix, 'model/w2v_all.model'))
    model.save(os.path.join(path_prefix, 'w2v_all.model'))

loading training data ...
loading testing data ...
saving model ...


### Data Preprocess

In [None]:
# preprocess.py
# 這個 block 用來做 data 的預處理
from torch import nn
from gensim.models import Word2Vec

class Preprocess():
    """Convert tokenised sentences into fixed-length index tensors backed by a word2vec vocabulary.

    Typical use: construct, call ``make_embedding()`` to build the vocabulary
    and the embedding matrix, then ``sentence_word2idx()`` to encode the
    sentences (and ``labels_to_tensor()`` for the labels).

    Attributes set by ``__init__``:
        w2v_path: path of the saved word2vec model.
        sentences: list of token lists to encode.
        sen_len: length every encoded sentence is truncated/padded to.
        idx2word / word2idx: vocabulary lookups filled by ``make_embedding``.
        embedding_matrix: list, later a tensor, with one row per vocabulary entry.
    """
    def __init__(self, sentences, sen_len, w2v_path="./w2v.model"):
        self.w2v_path = w2v_path
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []
    def get_w2v_model(self):
        """Load the previously trained word2vec model from ``self.w2v_path``."""
        self.embedding = Word2Vec.load(self.w2v_path)
        self.embedding_dim = self.embedding.vector_size
    def add_embedding(self, word):
        """Append ``word`` to the vocabulary with a randomly initialised vector.

        Used for the special tokens "<PAD>" (padding, so all sentences share
        one length) and "<UNK>" (words missing from the vocabulary). Must be
        called after ``embedding_matrix`` has been converted to a tensor.
        """
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)  # fills with samples from the uniform distribution U(0, 1)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        # embedding_matrix is already a tensor here, so concatenate instead of append
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)
    def make_embedding(self, load=True):
        """Build ``word2idx``, ``idx2word`` and the embedding matrix.

        Args:
            load: must be True; building without a saved model is not implemented.

        Returns:
            Tensor with one row per word2vec vocabulary word followed by the
            "<PAD>" and "<UNK>" rows.

        Raises:
            NotImplementedError: if ``load`` is False.
        """
        print("Get embedding ...")
        if load:
            print("loading word to vec model ...")
            self.get_w2v_model()
        else:
            raise NotImplementedError
        keyed_vectors = self.embedding.wv
        # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases use `vocab`.
        vocab = getattr(keyed_vectors, "key_to_index", None)
        if vocab is None:
            vocab = keyed_vectors.vocab
        for i, word in enumerate(vocab):
            print('get words #{}'.format(i+1), end='\r')
            # e.g. word2idx['he'] = 1, idx2word[1] = 'he', embedding_matrix[1] = vector of 'he'
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            # Index the KeyedVectors directly; indexing the model itself is deprecated in
            # gensim 3.x and removed in gensim 4.
            self.embedding_matrix.append(keyed_vectors[word])
        print('')
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
        # Add the two special tokens at the end of the vocabulary
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        print("total words: {}".format(len(self.embedding_matrix)))
        return self.embedding_matrix
    def pad_sequence(self, sentence):
        """Return a copy of ``sentence`` truncated or "<PAD>"-padded to ``sen_len`` items.

        The input list is never modified.
        """
        sentence = list(sentence[:self.sen_len])
        pad_len = self.sen_len - len(sentence)
        if pad_len > 0:
            sentence.extend([self.word2idx["<PAD>"]] * pad_len)
        assert len(sentence) == self.sen_len
        return sentence
    def sentence_word2idx(self):
        """Encode ``self.sentences`` as a LongTensor of shape (len(sentences), sen_len).

        Words missing from the vocabulary are mapped to the "<UNK>" index.
        """
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            print('sentence count #{}'.format(i+1), end='\r')
            sentence_idx = []
            for word in sen:
                if word in self.word2idx:
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx["<UNK>"])
            # Bring every sentence to the same length
            sentence_idx = self.pad_sequence(sentence_idx)
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)
    def labels_to_tensor(self, y):
        """Convert an iterable of labels (e.g. strings such as '0'/'1') to a LongTensor."""
        y = [int(label) for label in y]
        return torch.LongTensor(y)


### Dataset

In [None]:
# data.py
# 實作了 dataset 所需要的 '__init__', '__getitem__', '__len__'
# 好讓 dataloader 能使用
import torch
from torch.utils import data

class TwitterDataset(data.Dataset):
    """Minimal Dataset wrapper around pre-encoded sentences and optional labels.

    X is indexed per sample (for example a (data_num, seq_len) tensor, a list
    of arrays or a list of lists, where seq_len is the sentence length).
    When y is None the dataset yields only the sample; otherwise it yields
    ``(sample, label)`` pairs. ``len()`` is the number of samples in X.
    """
    def __init__(self, X, y):
        self.data, self.label = X, y
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        sample = self.data[idx]
        if self.label is not None:
            return sample, self.label[idx]
        return sample

### Model

In [None]:
# model.py
# 這個 block 是要拿來訓練的模型
import torch
from torch import nn
class LSTM_Net(nn.Module):
    """Bidirectional LSTM sentence classifier on top of a pre-trained embedding matrix.

    Args:
        embedding: 2-D tensor (vocabulary size x vector size) used as the embedding weights.
        embedding_dim: input feature size given to the LSTM.
        hidden_dim: hidden size of each LSTM direction.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability used in the classifier head.
        fix_embedding: when true the embedding weights are frozen; otherwise
            they are trained together with the rest of the network.
    """
    def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.5, fix_embedding=True):
        super().__init__()
        vocab_size, vector_dim = embedding.size(0), embedding.size(1)
        # Embedding layer whose weights are replaced by the pre-trained matrix
        self.embedding = torch.nn.Embedding(vocab_size, vector_dim)
        self.embedding.weight = torch.nn.Parameter(embedding)
        self.embedding.weight.requires_grad = not fix_embedding
        self.embedding_dim = vector_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        # batch_first=True: the LSTM takes (batch, seq_len, features) instead of (seq_len, batch, features)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            bidirectional=True)
        # Both directions are concatenated, so the classifier sees 2 * hidden_dim features
        lstm_out_dim = hidden_dim * 2
        self.classifier = nn.Sequential(
            nn.BatchNorm1d(lstm_out_dim),
            nn.Dropout(dropout),
            nn.Linear(lstm_out_dim, 1),  # one probability per sentence
            nn.Sigmoid(),
        )
    def forward(self, inputs):
        """Map a batch of word-index sequences to a (batch, 1) tensor of probabilities."""
        embedded = self.embedding(inputs)
        sequence_out, _ = self.lstm(embedded, None)
        # sequence_out: (batch, seq_len, 2 * hidden_dim); keep only the last time step.
        # NOTE(review): at the last step the backward direction has only processed the final
        # token - confirm that this is the intended sentence representation.
        last_step = sequence_out[:, -1, :]
        return self.classifier(last_step)

### Train

In [None]:
# train.py
# 這個 block 是用來訓練模型的
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

def training(batch_size, n_epoch, lr, model_dir, train, valid, model, device):
    """Train ``model`` with binary cross-entropy and Adam, saving the best checkpoint.

    After every epoch the model is evaluated on ``valid``; whenever the
    validation accuracy beats the best value seen in this call, the whole
    model object is saved to ``{model_dir}/ckpt.model``.

    Accuracy is normalised by the number of samples actually seen, so a final
    batch smaller than ``batch_size`` no longer lowers the reported value.

    Args:
        batch_size: nominal batch size. Kept for interface compatibility; the
            real size of each batch is read from the labels.
        n_epoch: number of epochs to run.
        lr: learning rate for Adam.
        model_dir: directory in which ``ckpt.model`` is written.
        train: iterable with ``len()`` that yields ``(inputs, labels)`` training batches.
        valid: iterable with ``len()`` that yields ``(inputs, labels)`` validation batches.
        model: module mapping a LongTensor batch to one probability per sample.
        device: device the batches are moved to.

    Returns:
        None. Progress is printed to stdout.
    """
    total = sum(p.numel() for p in model.parameters())  # number of parameters
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)  # parameters that get updated
    print('\nstart training, parameter total:{}, trainable:{}\n'.format(total, trainable))
    model.train()  # training mode (enables dropout / batch-norm statistics updates)
    criterion = nn.BCELoss()  # binary cross-entropy on probabilities
    t_batch = len(train)
    v_batch = len(valid)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    best_acc = 0
    for epoch in range(n_epoch):
        total_loss, total_correct, total_seen = 0, 0, 0
        # ---- training pass ----
        for i, (inputs, labels) in enumerate(train):
            inputs = inputs.to(device, dtype=torch.long)
            labels = labels.to(device, dtype=torch.float)  # BCELoss needs float targets
            optimizer.zero_grad()  # gradients accumulate, so clear them for every batch
            # reshape(-1) keeps a 1-D tensor even for a one-sample batch, where squeeze()
            # would produce a 0-dim tensor that no longer matches the labels' shape
            outputs = model(inputs).reshape(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            correct = evaluation(outputs, labels)  # number of correct predictions in this batch
            n_samples = labels.size(0)
            total_correct += correct
            total_seen += n_samples
            total_loss += loss.item()
            print('[ Epoch{}: {}/{} ] loss:{:.3f} acc:{:.3f} '.format(
                epoch+1, i+1, t_batch, loss.item(), correct*100/n_samples), end='\r')
        print(epoch)
        print('\nTrain | Loss:{:.5f} Acc: {:.3f}'.format(total_loss/t_batch, total_correct/total_seen*100))

        # ---- validation pass ----
        model.eval()  # evaluation mode (disables dropout, uses running batch-norm statistics)
        with torch.no_grad():
            total_loss, total_correct, total_seen = 0, 0, 0
            for i, (inputs, labels) in enumerate(valid):
                inputs = inputs.to(device, dtype=torch.long)
                labels = labels.to(device, dtype=torch.float)
                outputs = model(inputs).reshape(-1)
                loss = criterion(outputs, labels)
                total_correct += evaluation(outputs, labels)
                total_seen += labels.size(0)
                total_loss += loss.item()

            val_acc = total_correct / total_seen * 100
            print("Valid | Loss:{:.5f} Acc: {:.3f} ".format(total_loss/v_batch, val_acc))
            if val_acc > best_acc:
                # Best validation accuracy so far: keep this model for later prediction
                best_acc = val_acc
                #torch.save(model, "{}/val_acc_{:.3f}.model".format(model_dir,total_acc/v_batch*100))
                torch.save(model, "{}/ckpt.model".format(model_dir))
                print('saving model with acc {:.3f}'.format(val_acc))
        print('-----------------------------------------------')
        model.train()  # back to training mode for the next epoch

### Test

In [None]:
# test.py
# 這個 block 用來對 testing_data.txt 做預測
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

def testing(batch_size, test_loader, model, device):
    model.eval()
    ret_output = []
    with torch.no_grad():
        for i, inputs in enumerate(test_loader):
            inputs = inputs.to(device, dtype=torch.long)
            outputs = model(inputs)
            outputs = outputs.squeeze()
            outputs[outputs>=0.5] = 1 # 大於等於 0.5 為負面
            outputs[outputs<0.5] = 0 # 小於 0.5 為正面
            ret_output += outputs.int().tolist()
    
    return ret_output

### Main

In [None]:
# main.py
import os
import torch
import argparse
import numpy as np
from torch import nn
from gensim.models import word2vec
from sklearn.model_selection import train_test_split

# Use the GPU when torch.cuda.is_available() reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Paths of the data files
train_with_label = os.path.join(path_prefix, 'training_label.txt')
train_no_label = os.path.join(path_prefix, 'training_nolabel.txt')
testing_data = os.path.join(path_prefix, 'testing_data.txt')

w2v_path = os.path.join(path_prefix, 'w2v_all.model') # path of the trained word2vec model

# Sentence length, whether to freeze the embedding, batch size, number of epochs,
# learning rate and the checkpoint directory
sen_len = 35 # longer sentences are truncated (author's note: the longest training sentence has 39 tokens)
fix_embedding = True # freeze the embedding during training (otherwise the embedding layer is trained too)
batch_size = 128
epoch = 15
lr = 0.001
# Directory in which training() writes ckpt.model and from which later cells reload it.
# It was previously left undefined (its only assignment was commented out), so the
# training call failed with a NameError on a fresh kernel.
model_dir = path_prefix
# model_dir = os.path.join(path_prefix, 'model/') # alternative: a dedicated sub-directory (must exist before training)

print("loading data ...") # read 'training_label.txt' and 'training_nolabel.txt'
train_x, y = load_training_data(train_with_label)
train_x_no_label = load_training_data(train_no_label)

# Pre-process the inputs and the labels
preprocess = Preprocess(train_x, sen_len, w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True) # embedding matrix: one vector per vocabulary word
train_x = preprocess.sentence_word2idx()
y = preprocess.labels_to_tensor(y)

# Build the model
model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=150, num_layers=1, dropout=0.5, fix_embedding=fix_embedding)
# hidden_dim=150: hidden size of each LSTM direction
model = model.to(device) # move the model to `device`; the inputs fed to it must live on the same device

# Split the labelled data: the first 180000 samples for training, the remainder for validation
X_train, X_val, y_train, y_val = train_x[:180000], train_x[180000:], y[:180000], y[180000:]

# Wrap the tensors in datasets for the data loaders
train_dataset = TwitterDataset(X=X_train, y=y_train)
val_dataset = TwitterDataset(X=X_val, y=y_val)

# Turn the datasets into batches of tensors
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                            batch_size = batch_size,
                                            shuffle = True,
                                            num_workers = 8)

val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 8)

# Start training
training(batch_size, epoch, lr, model_dir, train_loader, val_loader, model, device)

loading data ...
Get embedding ...
loading word to vec model ...
get words #24694
total words: 24696

start training, parameter total:6657301, trainable:483301

0

Train | Loss:0.49425 Acc: 75.768
Valid | Loss:0.89588 Acc: 60.077 
saving model with acc 60.077
-----------------------------------------------
1

Train | Loss:0.43763 Acc: 79.926
Valid | Loss:0.49049 Acc: 75.916 
saving model with acc 75.916
-----------------------------------------------
2

Train | Loss:0.42366 Acc: 80.681
Valid | Loss:0.43950 Acc: 79.538 
saving model with acc 79.538
-----------------------------------------------
3

Train | Loss:0.41081 Acc: 81.398
Valid | Loss:0.43068 Acc: 79.842 
saving model with acc 79.842
-----------------------------------------------
4

Train | Loss:0.39876 Acc: 81.996
Valid | Loss:0.41525 Acc: 80.842 
saving model with acc 80.842
-----------------------------------------------
5

Train | Loss:0.38930 Acc: 82.532
Valid | Loss:0.40889 Acc: 81.235 
saving model with acc 81.235
-----

In [None]:
# Number of columns of the embedding matrix, i.e. the word-vector dimensionality.
embedding.size(1)

250

In [None]:
# Display the embedding matrix (one row per vocabulary word; the last two rows are "<PAD>" and "<UNK>").
embedding

tensor([[ 0.3161,  0.2871,  0.1874,  ...,  0.3063,  0.2797, -0.5200],
        [ 0.3618,  0.1333, -0.2412,  ...,  0.1108, -0.0870,  0.0882],
        [ 0.1049, -0.2132, -0.0784,  ..., -0.0038, -0.0189, -0.0702],
        ...,
        [ 0.1167, -0.1394,  0.2990,  ...,  0.1890,  0.2447,  0.1652],
        [ 0.2917,  0.1253,  0.9942,  ...,  0.2789,  0.0522,  0.5639],
        [ 0.6998,  0.8665,  0.8057,  ...,  0.3931,  0.8750,  0.0956]])

In [None]:
# Reload the saved checkpoint and train it for up to 100 more epochs with a learning rate of 0.0001.
# training() restarts its best-accuracy tracking at 0, so ckpt.model is overwritten as soon as an
# epoch of this run reaches a validation accuracy above 0.
# NOTE(review): torch.load unpickles a whole model object - only load checkpoint files you created yourself.
model = torch.load(os.path.join(model_dir, 'ckpt.model'))
training(batch_size, 100, 0.0001, model_dir, train_loader, val_loader, model, device)

# Semi-supervised Learning

In [None]:
# Load the unlabelled sentences and encode them with the same word2vec vocabulary.
train_x_no_label = load_training_data(train_no_label)
preprocess_no_label = Preprocess(train_x_no_label , sen_len, w2v_path=w2v_path)
# make_embedding() draws fresh random vectors for "<PAD>" and "<UNK>", so this overwrites
# `embedding` with a matrix whose last two rows differ from the one built earlier.
embedding = preprocess_no_label.make_embedding(load=True) # embedding matrix: one vector per vocabulary word
train_x_no_label = preprocess_no_label.sentence_word2idx()

# No labels: the dataset yields only the encoded sentences. shuffle=False keeps the loader order
# identical to the order of train_x_no_label.
no_label_dataset = TwitterDataset(X=train_x_no_label, y=None)
no_label_loader = torch.utils.data.DataLoader(dataset = no_label_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 8)

Get embedding ...
loading word to vec model ...
get words #24694
total words: 24696


In [None]:
# Pseudo-labelling: run the current model over the unlabelled sentences and keep only the
# predictions it is confident about (probability >= 0.8 -> label 1, <= 0.2 -> label 0).
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

model.eval()
ret_output = []  # thresholded output of every sample (cast to int), in loader order
index = []       # positions, within the unlabelled set, of the confidently labelled samples
answer = []      # pseudo-label (0 or 1) of each confident sample, aligned with `index`
offset = 0       # number of samples seen before the current batch
with torch.no_grad():
    for inputs in no_label_loader:
        inputs = inputs.to(device, dtype=torch.long)
        # reshape(-1) keeps a 1-D tensor even when the batch holds a single sample
        outputs = model(inputs).reshape(-1)
        outputs[outputs>=0.8] = 1 # confident positive
        outputs[outputs<=0.2] = 0 # confident negative
        confident = (outputs == 1) | (outputs == 0)
        positions = torch.nonzero(confident).reshape(-1)
        # Use the running sample count rather than a hard-coded 128 * batch_number, so the
        # positions stay correct for any batch size
        index += (positions + offset).tolist()
        answer += outputs[confident].int().tolist()
        ret_output += outputs.int().tolist()
        offset += outputs.size(0)

In [None]:
# Number of unlabelled samples that received a pseudo-label.
len(answer)

823811

In [None]:
# Rebuild the labelled split (first 180000 samples for training, the rest for validation)
# before the pseudo-labelled samples are appended to the training part.
X_train, X_val, y_train, y_val = train_x[:180000], train_x[180000:], y[:180000], y[180000:]

In [None]:
# Convert every collected pseudo-label to a plain Python int, in place.
for i in range(len(answer)):
  answer[i] = int(answer[i])

In [None]:
# Collect the encoded sentences whose positions were recorded in `index`.
select = []
tlist = train_x_no_label.tolist()  # nested Python list, one row per unlabelled sentence
for i in index:
  select.append(tlist[i])

In [None]:
# Turn the selected rows back into a LongTensor so they can be concatenated with X_train.
X_train_no_label = torch.LongTensor(select)

In [None]:
# Pseudo-labels as a LongTensor, the same tensor type labels_to_tensor() produces for the real labels.
answer = torch.LongTensor(answer)

In [None]:
 # Append the pseudo-labelled samples to the labelled training set.
 # These assignments read and overwrite X_train / y_train, so running the cell twice appends the samples twice.
 X_train = torch.cat((X_train, X_train_no_label))
 y_train = torch.cat((y_train, answer))

In [None]:
# Shape of the enlarged training set: (number of samples, sentence length).
X_train.size()

torch.Size([1003811, 35])

In [None]:
# Train a freshly initialised model on the labelled plus pseudo-labelled samples.
new_model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=150, num_layers=1, dropout=0.5, fix_embedding=fix_embedding)
# hidden_dim=150: hidden size of each LSTM direction
new_model = new_model.to(device) # move the model to `device`; the inputs fed to it must live on the same device

# Wrap the tensors in datasets for the data loaders
train_dataset = TwitterDataset(X=X_train, y=y_train)
val_dataset = TwitterDataset(X=X_val, y=y_val)

# Turn the datasets into batches of tensors
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                            batch_size = batch_size,
                                            shuffle = True,
                                            num_workers = 8)

val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 8)

# Start training for twice the configured number of epochs.
# training() always saves to {model_dir}/ckpt.model, so this run overwrites the earlier checkpoint.
training(batch_size, epoch*2, 0.001, model_dir, train_loader, val_loader, new_model, device)


start training, parameter total:6657301, trainable:483301

0

Train | Loss:0.18045 Acc: 93.469
Valid | Loss:0.51955 Acc: 81.175 
saving model with acc 81.175
-----------------------------------------------
1

Train | Loss:0.12622 Acc: 96.017
Valid | Loss:0.65082 Acc: 78.886 
-----------------------------------------------
2

Train | Loss:0.11297 Acc: 96.495
Valid | Loss:0.51686 Acc: 81.623 
saving model with acc 81.623
-----------------------------------------------
3

Train | Loss:0.10430 Acc: 96.791
Valid | Loss:0.54084 Acc: 81.713 
saving model with acc 81.713
-----------------------------------------------
4

Train | Loss:0.09647 Acc: 97.032
Valid | Loss:0.55806 Acc: 81.608 
-----------------------------------------------
5

Train | Loss:0.08953 Acc: 97.260
Valid | Loss:0.64781 Acc: 81.131 
-----------------------------------------------
6

Train | Loss:0.08332 Acc: 97.441
Valid | Loss:0.58803 Acc: 81.613 
-----------------------------------------------
7

Train | Loss:0.07723 Acc

In [None]:
# Continue training the same new_model object for `epoch` more epochs; training() restarts its
# best-accuracy tracking at 0 on every call.
training(batch_size, epoch, 0.001, model_dir, train_loader, val_loader, new_model, device)


start training, parameter total:6657301, trainable:483301

0

Train | Loss:0.13389 Acc: 95.404
Valid | Loss:0.47087 Acc: 81.643 
saving model with acc 81.643
-----------------------------------------------
1

Train | Loss:0.11993 Acc: 95.933
Valid | Loss:0.47456 Acc: 81.673 
saving model with acc 81.673
-----------------------------------------------
2

Train | Loss:0.11072 Acc: 96.241
Valid | Loss:0.48294 Acc: 81.633 
-----------------------------------------------
3

Train | Loss:0.10203 Acc: 96.497
Valid | Loss:0.50528 Acc: 81.459 
-----------------------------------------------
4

Train | Loss:0.09492 Acc: 96.743
Valid | Loss:0.48954 Acc: 81.768 
saving model with acc 81.768
-----------------------------------------------
5

Train | Loss:0.08738 Acc: 97.005
Valid | Loss:0.53290 Acc: 81.245 
-----------------------------------------------
6

Train | Loss:0.08015 Acc: 97.264
Valid | Loss:0.57743 Acc: 81.305 
-----------------------------------------------
7

Train | Loss:0.07275 Acc

### Predict and Write to csv file

In [None]:
# Run the saved model on the test set and make predictions
print("loading testing data ...")
test_x = load_testing_data(testing_data)
preprocess = Preprocess(test_x, sen_len, w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()
# y=None: the dataset yields only the encoded sentences; shuffle=False keeps the predictions in file order
test_dataset = TwitterDataset(X=test_x, y=None)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 8)
print('\nload model ...')
# Loads the most recent checkpoint written by training()
model = torch.load(os.path.join(model_dir, 'ckpt.model'))
outputs = testing(batch_size, test_loader, model, device)

# Write the predictions to a csv file for upload to Kaggle; ids are the row positions as strings
tmp = pd.DataFrame({"id":[str(i) for i in range(len(test_x))],"label":outputs})
print("save csv ...")
tmp.to_csv(os.path.join(path_prefix, 'predict.csv'), index=False)
print("Finish Predicting")

loading testing data ...
Get embedding ...
loading word to vec model ...
get words #24694
total words: 24696
sentence count #200000
load model ...
save csv ...
Finish Predicting


In [None]:
# Print the working directory and list its contents to check where the generated files are.
!pwd
!ls

/content
ckpt.model  predict.csv  testing_data.txt    training_nolabel.txt
data.zip    sample_data  training_label.txt  w2v_all.model


In [None]:
# Download predict.csv to the local computer through the google.colab helper.
from google.colab import files
files.download('predict.csv')