<a href="https://colab.research.google.com/github/alexvanhalen/test/blob/master/%E2%80%9CMeeting_ipynb%E2%80%9D%E7%9A%84%E5%89%AF%E6%9C%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install torch



In [0]:
import numpy as np
import torch


def parse_corpus(path, seq_length=50):
    '''Parse raw corpus text into input-output pairs, where input is a sequence of characters, output is 1 character after the sequence'''

    # Read text
    with open(path, 'r') as f:
        raw_text = f.read().replace('\n', '')

    # Get unique characters
    chars = sorted(list(set(raw_text)))

    # Map char to int / int to char
    char_to_int = dict((c, i) for i, c in enumerate(chars))
    int_to_char = dict((i, c) for i, c in enumerate(chars))
    
    # Prepare training data, for every <seq_length> chars, predict 1 char after the sequence
    n_chars = len(raw_text)
    dataX = [] # N x seq_length
    dataY = [] # N x 1
    for i in range(0, n_chars - seq_length):
        seq_in = raw_text[i:i + seq_length]
        seq_out = raw_text[i + seq_length]
        dataX.append([char_to_int[char] for char in seq_in])
        dataY.append(char_to_int[seq_out])
    
    return (dataX, dataY, char_to_int, int_to_char, chars)

def format_data(dataX, dataY, n_classes, batch_size=64):
    '''Parse into minibatches, return Tensors'''

    # For simplicity, discard trailing data not fitting into batch_size
    n_patterns = len(dataY)
    n_patterns = n_patterns - n_patterns % batch_size
    X = dataX[:n_patterns]
    Y = dataY[:n_patterns]

    # Parse X
    X = np.array(X)
    _, seq_length = X.shape
    X = X.reshape(-1, batch_size, seq_length)

    X = torch.LongTensor(X)

    # Parse Y
    Y = np.array(Y)
    Y = Y.reshape(-1, batch_size)

    Y = torch.LongTensor(Y)

    return list(zip(X, Y))

In [0]:
#import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


class Net(nn.Module):
    def __init__(self, n_vocab, embedding_dim, hidden_dim, dropout=0.2):
        super(Net, self).__init__()

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.embeddings = nn.Embedding(n_vocab, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, dropout=dropout)
        self.hidden2out = nn.Linear(hidden_dim, n_vocab)

    def forward(self, seq_in):
        embeddings = self.embeddings(seq_in.t()) # LSTM takes 3D inputs (timesteps, batch, features)
                                                 #                    = (seq_length, batch_size, embedding_dim)
        lstm_out, _ = self.lstm(embeddings)      # Each timestep outputs 1 hidden_state
                                                 # Combined in lstm_out = (seq_length, batch_size, hidden_dim) 
        ht = lstm_out[-1]                        # ht = last hidden state = (batch_size, hidden_dim)
                                                 # Use the last hidden state to predict the following character
        out = self.hidden2out(ht)                # Fully-connected layer, predict (batch_size, n_vocab)

        return out

In [0]:
import argparse
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

#from .data import parse_corpus, format_data
#from .model import Net


def load_data(path, seq_length, batch_size):
    dataX, dataY, char_to_int, int_to_char, chars = parse_corpus(path, seq_length=seq_length)
    data = format_data(dataX, dataY, n_classes=len(chars), batch_size=batch_size)

    return data, dataX, dataY, char_to_int, int_to_char, chars

def save_pickle(data, path):
    with open(path, 'wb') as f:
        pickle.dump(data, f)

def train(model, optimizer, epoch, data, log_interval):
    model.train()

    for batch_i, (seq_in, target) in enumerate(data):
        seq_in, target = Variable(seq_in), Variable(target)
        optimizer.zero_grad()

        output = model(seq_in)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()

        # Log training status
        if batch_i % log_interval == 0:
            print('Train epoch: {} ({:2.0f}%)\tLoss: {:.6f}'.format(epoch, 100. * batch_i / len(data), loss.data[0]))
            


In [0]:
#path = 'luen_yu_raw.txt'
#%debug
if __name__ == '__main__':
    # Parse arguments
    
    # Prepare
    train_data, dataX, dataY, char_to_int, int_to_char, chars = load_data('corpus.txt', seq_length=50, batch_size=32)
    model = Net(len(chars), 254, 256, dropout=0.2)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    # Train
    for epoch in range(30):
        train(model, optimizer, epoch, train_data, log_interval=10)

        if (epoch + 1) % 10 == 0:
            model.eval()
            torch.save(model, args.output)

    # Save mappings, vocabs, & model
    save_pickle((dataX, char_to_int, int_to_char, chars), args.output_c)

    model.eval()
    torch.save(model, args.output)

Train epoch: 0 ( 0%)	Loss: 7.627566
Train epoch: 0 ( 0%)	Loss: 7.674020
Train epoch: 0 ( 1%)	Loss: 7.634286
Train epoch: 0 ( 1%)	Loss: 7.669378
Train epoch: 0 ( 2%)	Loss: 7.654395
Train epoch: 0 ( 2%)	Loss: 7.622419
Train epoch: 0 ( 2%)	Loss: 7.627031
Train epoch: 0 ( 3%)	Loss: 7.619277
Train epoch: 0 ( 3%)	Loss: 7.612719
Train epoch: 0 ( 3%)	Loss: 7.607903
Train epoch: 0 ( 4%)	Loss: 7.622468
Train epoch: 0 ( 4%)	Loss: 7.618100
Train epoch: 0 ( 5%)	Loss: 7.562186
Train epoch: 0 ( 5%)	Loss: 7.547942
Train epoch: 0 ( 5%)	Loss: 7.474246
Train epoch: 0 ( 6%)	Loss: 7.491474
Train epoch: 0 ( 6%)	Loss: 7.583360
Train epoch: 0 ( 6%)	Loss: 7.518895
Train epoch: 0 ( 7%)	Loss: 6.980829
Train epoch: 0 ( 7%)	Loss: 6.337323
Train epoch: 0 ( 8%)	Loss: 6.308119
Train epoch: 0 ( 8%)	Loss: 6.115454
Train epoch: 0 ( 8%)	Loss: 7.084260
Train epoch: 0 ( 9%)	Loss: 6.520183
Train epoch: 0 ( 9%)	Loss: 6.078123
Train epoch: 0 (10%)	Loss: 5.934679
Train epoch: 0 (10%)	Loss: 5.729162
Train epoch: 0 (10%)	Loss: 6

NameError: ignored