In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import Counter
from sklearn.model_selection import train_test_split
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from functools import reduce
import collections
import numpy as np
import time
import os
import random
import math

In [2]:
import gensim

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
cd drive/My Drive/Colab Notebooks/Language Model

/content/drive/My Drive/Colab Notebooks/Language Model


In [6]:
!pip install import-ipynb
import import_ipynb



In [7]:
from Gensim import w2v_model, reduce_dimensions

importing Jupyter notebook from Gensim.ipynb


Data

In [8]:
txt = "./TKH1020.txt"
with(open(txt, "r", encoding="utf-8")) as f:
     total = f.readlines()

In [9]:
train, test_data = train_test_split(total, test_size=0.2, shuffle=True, random_state=42)
train_data, val_data = train_test_split(train, test_size=0.25, shuffle=True, random_state=42)

In [10]:
def spliting(data):
    words = []

    for lines in data:
        word = []
        for line in lines:
            if line == '\n':
                continue
            else:
                word.append(line)
        words.append(word)
    
    return words

In [11]:
train_data = spliting(train_data)
val_data = spliting(val_data)
test_data = spliting(test_data)

Word2vec

In [12]:
embedding_dim = 200

In [13]:
wv_model = w2v_model(train_data, vector_size=embedding_dim, window_size=4)

In [14]:
wv_model.save("w2v_model.model")

In [15]:
#wv_model = gensim.models.Word2Vec.load('w2v_model.model')
#weights = torch.FloatTensor(wv_model.wv.vectors)

In [16]:
#reduce_dimensions(wv_model)

In [17]:
avg_vec = (sum(wv_model.wv.vectors)/len(wv_model.wv.vectors)).reshape(1,-1)

In [18]:
wv_model.wv.vectors = np.concatenate((wv_model.wv.vectors, avg_vec), axis=0)

In [19]:
weights = torch.FloatTensor(wv_model.wv.vectors)

In [20]:
idx_to_word = [word for word in wv_model.wv.vocab]
idx_to_word.append('UNK')
word_to_idx = {word: idx for idx, word in enumerate(idx_to_word)}

In [21]:
vocab = set(idx_to_word)
vocab_size = len(vocab)

In [22]:
print(vocab_size)

5457


Vocab Size = 5457

DataSet/DataLoader

In [23]:
CONTEXT_SIZE = 4

In [24]:
def unk(word, vocab):
    if word not in vocab:
        return 'UNK'
    else:
        return word

In [25]:
def cleaning(data, vocab):
  
    for idx, line in enumerate(data):
        for idx2, word in enumerate(line):
            data[idx][idx2] = unk(word, vocab)

    return data

In [26]:
train = cleaning(train_data, vocab)
val = cleaning(val_data, vocab)
test = cleaning(test_data, vocab)

In [27]:
def ngrams(data):
    label = []
    contexts = []
    
    for line in data:
        if len(line)<5:
            continue
        for idx in range(0, len(line)-CONTEXT_SIZE):
            context = [word_to_idx[line[idx]], word_to_idx[line[idx+1]], word_to_idx[line[idx+3]], word_to_idx[line[idx+4]]]
            contexts.append(torch.tensor(context, dtype=torch.long))
            label.append(torch.tensor(word_to_idx[line[idx+2]], dtype=torch.long))
  
    return label,contexts

In [28]:
train_label, train_contexts = ngrams(train)
val_label, val_contexts = ngrams(val)
test_label, test_contexts = ngrams(test)

In [29]:
class word2vec_dataset(Dataset):
    def __init__(self, label, contexts):
        self.label = label
        self.contexts = contexts
        
    def __getitem__(self, index): 
        
        x = self.label[index]
        y = self.contexts[index]

        return (x, y)

    def __len__(self):
        return len(self.label) 

In [30]:
train_set = word2vec_dataset(train_label, train_contexts)
val_set = word2vec_dataset(val_label, val_contexts)
test_set = word2vec_dataset(test_label, test_contexts)

In [31]:
print(len(train_label))
print(len(train_contexts))

415953
415953


In [32]:
train_loader = DataLoader(train_set, batch_size = 5000, shuffle = True)
val_loader = DataLoader(val_set, batch_size = 5000, shuffle = True)
test_loader = DataLoader(test_set, batch_size = 5000, shuffle = True)

Model

In [33]:
class Net(nn.Module):
    def __init__(self, n_vocab, embedding_dim, num_layers, hidden_dim, dropout=0.2):
        super(Net, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embeddings = nn.Embedding.from_pretrained(weights)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=dropout, bidirectional = True)
        self.fc = nn.Linear(hidden_dim*2, n_vocab)

    def forward(self, seq_in):
        h0 = torch.zeros(self.num_layers * 2, seq_in.size(0), self.hidden_dim).to(device)
        c0 = torch.zeros(self.num_layers * 2, seq_in.size(0), self.hidden_dim).to(device)

        embeddings = self.embeddings(seq_in.t())
        out, _ = self.lstm(embeddings, (h0, c0))
        out = self.fc(out[-1, :, :])

        return out

In [34]:
def train(model, optimizer, epoch, data, log_interval=100):

    model.train()
    cnt = 0.0
    acc = 0.0

    for batch_i, (target, seq_in) in enumerate(data):
        
        seq_in, target = Variable(seq_in).to(device), Variable(target).to(device)
        optimizer.zero_grad()
        output = model(seq_in)              
        loss = F.cross_entropy(output, target)  
        loss.backward()                         
        optimizer.step()          

        for idx, out in enumerate(output): 
            cnt += 1
            if out.argmax().item() == target[idx].item():
                acc += 1      
             
        if batch_i % log_interval == 0 or batch_i == len(data)-1:
            train_acc = acc*100/cnt 
            print('Train epoch: {}, Batchs: {}/{}, Loss: {:.6f}, Acc: {:.2f}%'.format(epoch, batch_i+1, len(data), loss.item(), train_acc))
            cnt = 0.0
            acc = 0.0

In [35]:
def val(epoch, model, data):
    model.eval()
    acc = 0.0
    avg_loss = 0.0
    cnt=0
    label = []
    pred = []

    for batch_i, (target, seq_in) in enumerate(data):
        seq_in, target = Variable(seq_in).to(device), Variable(target).to(device)
        output = model(seq_in)           
        loss = F.cross_entropy(output, target)     
        avg_loss += loss.item()

        for idx, out in enumerate(output): 
            cnt += 1
            label.append(target[idx].item())
            pred.append(out.argmax().item())
            if out.argmax().item() == target[idx].item():
                acc += 1      

    val_acc = acc*100/cnt
    avg_loss = avg_loss/len(data)
    print('Train epoch: {}, Loss: {:.5f}, Acc: {:.2f}%'.format(epoch, avg_loss, val_acc))

    return label, pred, acc, avg_loss

In [36]:
model = Net(vocab_size, embedding_dim=embedding_dim, hidden_dim=1000 ,num_layers=2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.003) # 0.015, 0.003
epochs = 15

500 < Best < 2000

In [37]:
print(model)

Net(
  (embeddings): Embedding(5457, 200)
  (lstm): LSTM(200, 1000, num_layers=2, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=2000, out_features=5457, bias=True)
)


In [38]:
#model = torch.load('predict_next05.pth')

In [39]:
best_loss = 100
start = time.time()

for epoch in range(epochs):
    train(model, optimizer, epoch, train_loader)
    val_label, val_pred, val_acc, val_loss = val(epoch, model, val_loader)

    if val_loss < best_loss:
        model.eval()
        torch.save(model, "predict_next05.pth")
        best_loss = val_loss

    print("Train epoch: {}, Durations: {} min".format(epoch, (time.time() - start)//60))

Train epoch: 0, Batchs: 1/84, Loss: 8.604317, Acc: 0.02%
Train epoch: 0, Batchs: 84/84, Loss: 3.198402, Acc: 31.66%
Train epoch: 0, Loss: 3.18646, Acc: 48.76%
Train epoch: 0, Durations: 2.0 min
Train epoch: 1, Batchs: 1/84, Loss: 3.048089, Acc: 49.16%
Train epoch: 1, Batchs: 84/84, Loss: 2.722587, Acc: 52.86%
Train epoch: 1, Loss: 2.71901, Acc: 55.46%
Train epoch: 1, Durations: 5.0 min
Train epoch: 2, Batchs: 1/84, Loss: 2.188438, Acc: 58.92%
Train epoch: 2, Batchs: 84/84, Loss: 2.166829, Acc: 59.66%
Train epoch: 2, Loss: 2.59317, Acc: 57.90%
Train epoch: 2, Durations: 8.0 min
Train epoch: 3, Batchs: 1/84, Loss: 1.627130, Acc: 67.40%
Train epoch: 3, Batchs: 84/84, Loss: 1.617595, Acc: 66.54%
Train epoch: 3, Loss: 2.60033, Acc: 59.15%
Train epoch: 3, Durations: 11.0 min
Train epoch: 4, Batchs: 1/84, Loss: 1.073763, Acc: 77.18%


KeyboardInterrupt: ignored

Testing

In [None]:
#model = torch.load("predict_next.pth").to(device)

In [40]:
def test(model, data, log_interval=10):

    model.eval()
    acc=0.0
    cnt=0
    label = []
    pred = []

    for batch_i, (target, seq_in) in enumerate(data):
        seq_in, target = Variable(seq_in).to(device), Variable(target).to(device)
        output = model(seq_in)                  

        for idx, out in enumerate(output): 
            cnt += 1
            label.append(target[idx].item())
            pred.append(out.argmax().item())
            if out.argmax().item() == target[idx].item():
                acc += 1                             

        if batch_i % log_interval == 0 or batch_i == len(data)-1:
            print('Batchs: {}/{}'.format(batch_i+1, len(data)))

    test_acc = acc*100/cnt
    print("Test Accuracy: {:.2f}%".format(test_acc))

    return label, pred

In [41]:
label, pred = test(model, test_loader)

Batchs: 1/29
Batchs: 11/29
Batchs: 21/29
Batchs: 29/29
Test Accuracy: 58.21%


In [None]:
import sklearn.metrics as metrics

In [None]:
import pandas as pd

In [None]:
from itertools import chain 
predictions_l = [idx_to_word[val_pred[i]] for i in range(len(val_pred))]
labels_l = [idx_to_word[val_label[i]]for i in range(len(val_label))]

In [None]:
table = pd.DataFrame(metrics.classification_report(labels_l, predictions_l, output_dict=True)).transpose()
table.to_csv("pred_test2.csv", index=True, encoding='utf_8_sig')