In [1]:
!pip install boltons -q

[?25l[K     |██                              | 10kB 22.9MB/s eta 0:00:01[K     |████                            | 20kB 855kB/s eta 0:00:01[K     |██████                          | 30kB 1.3MB/s eta 0:00:01[K     |████████                        | 40kB 844kB/s eta 0:00:01[K     |██████████                      | 51kB 1.1MB/s eta 0:00:01[K     |████████████                    | 61kB 1.3MB/s eta 0:00:01[K     |██████████████                  | 71kB 1.5MB/s eta 0:00:01[K     |████████████████                | 81kB 1.7MB/s eta 0:00:01[K     |██████████████████              | 92kB 1.9MB/s eta 0:00:01[K     |████████████████████            | 102kB 1.4MB/s eta 0:00:01[K     |██████████████████████          | 112kB 1.4MB/s eta 0:00:01[K     |████████████████████████        | 122kB 1.4MB/s eta 0:00:01[K     |██████████████████████████      | 133kB 1.4MB/s eta 0:00:01[K     |████████████████████████████    | 143kB 1.4MB/s eta 0:00:01[K     |██████████████████████████

In [0]:
import string
from pathlib import Path
from textwrap import wrap
import numpy as np
import pandas as pd
from boltons.iterutils import windowed
from tqdm import tqdm, tqdm_notebook
import torch
from torch import nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from google_drive_downloader import GoogleDriveDownloader as gdd

In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
print(device)

cuda


In [5]:
DATA_PATH = 'data/weight_loss/articles.jsonl'

# Arguments for the Google Drive fetch; the archive is unzipped next to itself.
download_kwargs = dict(
    file_id='1mafPreWzE-FyLI0K-MUsXPcnUI0epIcI',
    dest_path='data/weight_loss/weight_loss_articles.zip',
    unzip=True,
)
# Only hit the network when the extracted articles file is not already on disk.
if not Path(DATA_PATH).is_file():
    gdd.download_file_from_google_drive(**download_kwargs)

Downloading 1mafPreWzE-FyLI0K-MUsXPcnUI0epIcI into data/weight_loss/weight_loss_articles.zip... Done.
Unzipping...Done.


In [15]:
def load_data(path, sequence_length=125, n_articles=100, random_state=None):
  """Load articles and cut them into sliding character windows.

  The file at ``path`` is read with ``pd.read_json``; its ``text`` column is
  sampled, lower-cased and split into windows of ``sequence_length``
  characters. Windows containing any character outside ``string.printable``
  are discarded.

  Args:
    path: Location of a JSON file with a ``text`` column.
    sequence_length: Number of characters in every window.
    n_articles: Upper bound on the number of articles to sample. It is capped
      at the number of available articles so a small file no longer makes
      ``Series.sample`` raise.
    random_state: Optional seed forwarded to ``Series.sample`` for
      reproducible sampling; ``None`` keeps the unseeded behaviour.

  Returns:
    A flat list of windows (as produced by ``windowed``), ordered article by
    article, in which every character is printable.
  """
  articles = pd.read_json(path).text
  sampled = articles.sample(min(n_articles, len(articles)), random_state=random_state)
  texts = sampled.str.lower().tolist()
  all_windows = [
      window for text in texts for window in windowed(text, sequence_length)
  ]
  # Build the lookup once: set membership is O(1), whereas `char in str`
  # scans the whole printable string for every single character.
  printable = set(string.printable)
  return [
      window for window in tqdm_notebook(all_windows)
      if all(char in printable for char in window)
  ]


def get_unique_chars(sequences):
  """Return the set of distinct items found across all of ``sequences``."""
  unique = set()
  for sequence in sequences:
    unique.update(sequence)
  return unique

def create_char2idx(sequences):
  """Map every distinct character in ``sequences`` to its rank in sorted order."""
  alphabet = sorted(get_unique_chars(sequences))
  return dict(zip(alphabet, range(len(alphabet))))

def encode_sequence(sequence, char2idx):
  """Translate ``sequence`` item by item through ``char2idx`` into a list.

  An item missing from ``char2idx`` raises ``KeyError``.
  """
  return list(map(char2idx.__getitem__, sequence))


def encode_sequences(sequences, char2idx):
  """Encode every sequence with ``char2idx`` and stack the results in a NumPy array.

  Progress is reported through a notebook progress bar.
  """
  encoded_rows = []
  for sequence in tqdm_notebook(sequences):
    encoded_rows.append(encode_sequence(sequence, char2idx))
  return np.array(encoded_rows)


class Sequences(Dataset):
  """Dataset of encoded character windows for next-character prediction.

  Each item is a pair ``(inputs, targets)`` where ``targets`` is the same
  window as ``inputs`` shifted one position to the right.

  Attributes are set in ``__init__``:
    sequences: character windows returned by ``load_data``.
    char2idx / idx2char: character <-> integer index lookups.
    vocab_size: number of distinct characters.
    encoded: array of index-encoded windows returned by ``encode_sequences``.
  """

  def __init__(self, path, sequence_length=125):
    """Load, index and encode the windows stored at ``path``.

    Args:
      path: File handed to ``load_data``.
      sequence_length: Window length forwarded to ``load_data``.
    """
    # Use the caller's path; previously the notebook-level DATA_PATH was read
    # here and the argument was silently ignored.
    self.sequences = load_data(path, sequence_length=sequence_length)
    self.char2idx = create_char2idx(self.sequences)
    self.idx2char = {idx: char for char, idx in self.char2idx.items()}
    # One entry per distinct character, so no second scan of the data is needed.
    self.vocab_size = len(self.char2idx)
    self.encoded = encode_sequences(self.sequences, self.char2idx)

  def __getitem__(self, i):
    """Return window ``i`` without its last item, and without its first item."""
    return self.encoded[i, :-1], self.encoded[i, 1:]

  def __len__(self):
    """Number of encoded windows."""
    return len(self.encoded)
  
  
dataset = Sequences(DATA_PATH, sequence_length=128)
# The samples are consecutive sliding windows of the same articles, so
# neighbouring items overlap almost entirely; shuffle so that each batch mixes
# windows from different positions instead of near-duplicates.
trainloader = DataLoader(dataset, batch_size=4096, shuffle=True)
# Last expression of the cell so the number of windows is actually displayed.
len(dataset)

HBox(children=(IntProgress(value=0, max=253457), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250433), HTML(value='')))

In [0]:
class RNN(nn.Module):
  """Embedding -> GRU -> linear model that is advanced one step per call.

  ``forward`` inserts a length-1 axis at position 1 before the GRU and removes
  it afterwards, so callers feed one time step at a time and thread the hidden
  state through successive calls.
  """

  def __init__(self, vocab_size, embedding_dimension=100, hidden_size=128, n_layers=1, device='cpu',):
    super().__init__()
    self.device = device
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    # Sub-modules are registered in encoder -> rnn -> decoder order.
    self.encoder = nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=embedding_dimension,
    )
    self.rnn = nn.GRU(
        input_size=embedding_dimension,
        hidden_size=hidden_size,
        num_layers=n_layers,
        batch_first=True,
    )
    self.decoder = nn.Linear(in_features=hidden_size, out_features=vocab_size)

  def init_hidden(self, batch_size):
    """Return a random-normal hidden state of shape (n_layers, batch_size, hidden_size) on ``self.device``."""
    state_shape = (self.n_layers, batch_size, self.hidden_size)
    return torch.randn(state_shape).to(self.device)

  def forward(self, input_, hidden):
    """Advance the model by one step.

    Args:
      input_: Index tensor for the current step (the notebook passes one index
        per batch element).
      hidden: Hidden state from ``init_hidden`` or from the previous call.

    Returns:
      ``(output, hidden)``: the decoder output for this step and the updated
      hidden state.
    """
    embedded = self.encoder(input_)
    step_input = embedded.unsqueeze(1)
    rnn_out, hidden = self.rnn(step_input, hidden)
    logits = self.decoder(rnn_out.squeeze(1))
    return logits, hidden
  
  
model = RNN(vocab_size=dataset.vocab_size, device=device)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
# Hand the optimizer only the parameters that take gradients.
trainable_parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable_parameters, lr=0.001)

In [18]:
# Print the model's sub-modules and their sizes as a quick architecture check.
print(model)

RNN(
  (encoder): Embedding(60, 100)
  (rnn): GRU(100, 128, batch_first=True)
  (decoder): Linear(in_features=128, out_features=60, bias=True)
)


In [21]:
# List the names of all parameters that will receive gradient updates.
trainable_names = [
    name for name, parameter in model.named_parameters()
    if parameter.requires_grad
]
print('Trainable Parameters')
print('\n'.join(' * ' + name for name in trainable_names))

Trainable Parameters
 * encoder.weight
 * rnn.weight_ih_l0
 * rnn.weight_hh_l0
 * rnn.bias_ih_l0
 * rnn.bias_hh_l0
 * decoder.weight
 * decoder.bias


In [22]:
# Train the model; `train_losses` collects the mean per-character loss of each epoch.
model.train()
train_losses = []
for epoch in range(50):
  progress_bar = tqdm_notebook(trainloader, leave=False)
  losses = []
  for inputs, targets in progress_bar:
    # Move the whole batch to the device once, instead of issuing two small
    # host-to-device copies for every character position in the inner loop.
    inputs = inputs.to(device)
    targets = targets.to(device)
    batch_size = inputs.size(0)
    n_steps = inputs.size(1)

    # A fresh hidden state per batch: batches are independent windows.
    hidden = model.init_hidden(batch_size)

    model.zero_grad()

    # The model advances one character per call, so step through the window
    # and sum the loss over all positions before back-propagating once.
    loss = 0
    for char_idx in range(n_steps):
      output, hidden = model(inputs[:, char_idx], hidden)
      loss += criterion(output, targets[:, char_idx])

    loss.backward()
    optimizer.step()

    # Report the loss per character so it is comparable across window lengths.
    avg_loss = loss.item() / n_steps

    progress_bar.set_description(f'Loss: {avg_loss:.3f}')

    losses.append(avg_loss)

  epoch_loss = sum(losses) / len(losses)
  train_losses.append(epoch_loss)

  tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')

HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #1	Train Loss: 2.854


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #2	Train Loss: 2.317


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #3	Train Loss: 2.146


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #4	Train Loss: 2.027


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #5	Train Loss: 1.936


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #6	Train Loss: 1.863


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #7	Train Loss: 1.803


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #8	Train Loss: 1.753


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #9	Train Loss: 1.710


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #10	Train Loss: 1.674


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #11	Train Loss: 1.642


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #12	Train Loss: 1.614


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #13	Train Loss: 1.590


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #14	Train Loss: 1.567


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #15	Train Loss: 1.547


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #16	Train Loss: 1.529


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #17	Train Loss: 1.513


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #18	Train Loss: 1.498


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #19	Train Loss: 1.484


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #20	Train Loss: 1.471


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #21	Train Loss: 1.459


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #22	Train Loss: 1.448


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #23	Train Loss: 1.437


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #24	Train Loss: 1.428


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #25	Train Loss: 1.418


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #26	Train Loss: 1.410


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #27	Train Loss: 1.401


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #28	Train Loss: 1.394


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #29	Train Loss: 1.386


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #30	Train Loss: 1.379


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #31	Train Loss: 1.373


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #32	Train Loss: 1.366


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #33	Train Loss: 1.360


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #34	Train Loss: 1.355


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #35	Train Loss: 1.349


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #36	Train Loss: 1.344


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #37	Train Loss: 1.339


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #38	Train Loss: 1.334


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #39	Train Loss: 1.329


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #40	Train Loss: 1.325


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #41	Train Loss: 1.321


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #42	Train Loss: 1.316


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #43	Train Loss: 1.312


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #44	Train Loss: 1.308


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #45	Train Loss: 1.305


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #46	Train Loss: 1.301


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #47	Train Loss: 1.297


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #48	Train Loss: 1.294


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #49	Train Loss: 1.291


HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

Epoch #50	Train Loss: 1.287


In [23]:
def pretty_print(text):
  """Print ``text`` with each newline-separated paragraph wrapped by ``textwrap.wrap``.

  Every paragraph (including empty ones) is followed by a newline.
  """
  wrapped_paragraphs = ['\n'.join(wrap(paragraph)) for paragraph in text.split('\n')]
  print(''.join(block + '\n' for block in wrapped_paragraphs))
  
# Sampling temperature: the model output is divided by it before sampling.
temperature = 1.0

model.eval()
seed = '\n'
generated = []

with torch.no_grad():
  batch_size = 1
  hidden = model.init_hidden(batch_size)
  last_char = dataset.char2idx[seed]
  for _ in range(1000):
    output, hidden = model(torch.LongTensor([last_char]).to(device), hidden)
    # softmax yields the same sampling distribution as exp(logits / T) but is
    # numerically stable: a bare exp() can overflow to inf for large logits or
    # small temperatures, which makes torch.multinomial raise.
    distribution = F.softmax(output.squeeze().div(temperature), dim=-1)
    # Feed the sampled character back in as the next input.
    last_char = torch.multinomial(distribution, 1).item()
    generated.append(dataset.idx2char[last_char])

# Join once at the end instead of growing a string inside the loop.
text = ''.join(generated)
pretty_print(text)

this is someone and disease peaking your body feel obstance though
that keepinity can have up), you menotuble the reglates then have the
body weight, for this abs of your lveating and dellieve high-resquited
drees revil perbobiages, whey appearined low. the most appeopnategers
are red can 'sinties product. but there are one full stard and what
for weight loss, you kee acking for before my diet prow a healthy
personally.
2. back and simple medication. you may be will almost burn before in
jump importance your meat is your meal, what diet posible as exercisk
to lose weight, you my vegetables. lost the elopmed food, there are a
fed solees from fat, it, you have fried leight would be exconstaning
is no love atubed (1 and help you readies up we eat morefoining in
food sort plitten is very will be convectibulate weight loss. people
alone in any dubue too and is your best with sare yourself stugh and
help you to be eating getting per risexintrested 4 back with your
diets it. well guided of a
