In [23]:
from torch.utils.data import Dataset, DataLoader
import os
import random
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [13]:
### Choose articles and save to a file

rootdir = '../articles'
allarticles = os.listdir(rootdir)

num_train = 10000
num_val = 5000
total_articles = random.sample(allarticles, num_train + num_val)
train_articles = total_articles[:num_train]
val_articles = total_articles[num_train:]
with open('train_files.txt', 'w') as f:
    f.write('\n'.join(train_articles))
with open('val_files.txt', 'w') as f:
    f.write('\n'.join(val_articles))

In [14]:
batch_size = 10
num_workers = 4

ARTICLE_MAX_TOKENS = 400
SUMMARY_MAX_TOKENS = 100

NUM_HIDDEN_STATES = 256
EMBEDDING_SIZE = 128

In [20]:
class wikihowDataset(Dataset):
    
    def __init__(self, dtype):
        
        rootdir = '../articles'
        if dtype == 'train':
            with open('train_files.txt', 'r') as f:
                paths = [os.path.join(rootdir,x.rstrip('\n')) for x in f.readlines()]
        elif dtype == 'val':
            with open('val_files.txt', 'r') as f:
                paths = [os.path.join(rootdir,x.rstrip('\n')) for x in f.readlines()]
        else:
            print('invalid dtype of dataset', 1/0)
        
        self.articles = []
        self.summaries = []
        self.titles = []
        self.vocab = set()
        
        for path in paths:
            with open(path, 'r') as f:
                summary = ''
                article = ''
                isarticle = False
                
                title = path.split('/')[2].split('.')[0]
                self.titles.append(title)
                
                for line in [x.rstrip('\n') for x in f.readlines()]:
                    if line == '\n' or line == '@summary':
                        continue
                        
                    if line == '@article':
                        isarticle = True
                        continue
                        
                    if isarticle:
                        article += line
                    else:
                        summary += line
                        
                article = ' '.join(article.split()[:ARTICLE_MAX_TOKENS])
                summary = ' '.join(summary.split()[:SUMMARY_MAX_TOKENS])
                self.articles.append(article)
                self.summaries.append(summary)
                
        ### build vocabulary
        for i in range(len(self.articles)):
            for word in self.articles[i].split():
                self.vocab.add(word)
            for word in self.summaries[i].split():
                self.vocab.add(word)
                
    def mapandpad(self, WORD_TO_IDX):
        self.mappedarticles = [[WORD_TO_IDX[word] for word in article.split()] for article in self.articles]
        self.mappedsummaries = [[WORD_TO_IDX[word] for word in summary.split()] for summary in self.summaries]
        
        pad_token = WORD_TO_IDX['<PAD>']
        self.paddedarticles = np.ones((len(self.articles), ARTICLE_MAX_TOKENS)) * pad_token
        self.paddedsummaries = np.ones((len(self.summaries), SUMMARY_MAX_TOKENS)) * pad_token
        self.articlelengths = [len(ma) for ma in self.mappedarticles]
        self.summarylengths = [len(ma) for ma in self.mappedsummaries]
        
        for i, x_len in enumerate(self.articlelengths):
            sequence = self.mappedarticles[i]
            self.paddedarticles[i, :x_len] = sequence[:x_len]
            
        for i, y_len in enumerate(self.summarylengths):
            sequence = self.mappedsummaries[i]
            self.paddedsummaries[i, :y_len] = sequence[:y_len]
        
    def __len__(self):
        return len(self.articles)
    
    def __getitem__(self, idx):
        return (self.paddedarticles[idx], self.paddedsummaries[idx])

In [28]:
train_dataset = wikihowDataset('train')
val_dataset = wikihowDataset('val')

vocab = train_dataset.vocab | val_dataset.vocab
vocab.add('<PAD>')
vocab = list(vocab)
WORD_TO_IDX = {word: i for i, word in enumerate(vocab)}
PADDING_IDX = WORD_TO_IDX['<PAD>']

train_dataset.mapandpad(WORD_TO_IDX)
val_dataset.mapandpad(WORD_TO_IDX)

print(train_dataset.articles[0])
print(train_dataset.mappedarticles[0])
print(train_dataset.paddedarticles[0])
print(len(train_dataset.paddedarticles[0]))

print(val_dataset.articles[0])
print(val_dataset.mappedarticles[0])
print(val_dataset.paddedarticles[0])
print(len(val_dataset.paddedarticles[0]))

train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                        shuffle=True, num_workers=num_workers)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size,
                        shuffle=True, num_workers=num_workers)



Sunburn can cause cancer, blisters, cracking, and peeling.Use lip balm that contains sunscreen (at least 15 SPF), or a sunscreen on your lips. Use this every day to help protect your lips against the sun., Make sure your lip balm is free of anything you might be allergic to. This may cause an adverse reaction. Cleansers can contain chemicals such as salicylic acid or benzoyl peroxide which can cause dryness and, in some cases, allergic reactions., Cucumbers hold a lot of moisture and can be great for your lips.Consider a moisturizer that uses cucumber.You can also apply a cucumber directly for your lips to 3-5 minutes. This will allow your lips to soak up the moisture from the vegetable. Vitamin A and zinc are also important for you body. These vitamins help your immune system and help your skin stay healthy.
[203932, 150102, 55293, 232246, 288391, 165640, 271178, 145359, 137733, 261860, 245503, 6885, 257926, 209830, 102862, 113669, 224374, 161621, 58004, 257926, 103889, 70326, 60979, 

In [29]:
print(train_dataset.paddedarticles[0].shape)
print(train_dataset.paddedarticles.shape)

(400,)
(10000, 400)


In [None]:
class Encoder(nn.Module):
    
    def __init__(self):
        super(Encoder, self).__init__()
        
        self.embeddings = nn.Embedding(
            num_embeddings=len(vocab), 
            embedding_dim=EMBEDDING_SIZE, 
            padding_idx=PADDING_IDX
        )
        
        self.lstm = nn.LSTM(
            input_size=embedding_dim, 
            hidden_size=NUM_HIDDEN_STATES, 
            bidirectional=True
        )
        
    def forward(self, x, x_lengths):
        # x (400, batch_size, 1)
        
        x = self.embeddings(x)
        # (400, batch_size, 128)
        
        x = nn.utils.rnn.pack_padded_sequence(x, x_lengths)
        
        x = self.lstm(x)