In [1]:
import pandas as pd
import time

# Personalized (project-local) libraries
from configs.config import DatasetConfig, HP
from data.DataLoader import build_dataloader
from utils.Errors import loss_estimation
from Procedures import Procedure
from model.lm import LanguageModel
from torch.utils.tensorboard import SummaryWriter

# Instantiate the dataset configuration imported from configs.config; its
# attributes (data paths, vocab size, batch size, device, ...) are read by
# the data-loading cells below.
ds_config = DatasetConfig()
# Hyper-parameter container from configs.config.
# NOTE(review): `hp` is not referenced by any other visible cell — confirm it
# is still needed here before removing it.
hp = HP()

  from .autonotebook import tqdm as notebook_tqdm


# Build DataLoaders

In [2]:
# Instantiate the training DataLoader (the validation and test loaders are
# built the same way in the following cells).
# vocab=None is passed here and the recorded log shows "Build vocabulary"
# running during this call; the returned `vocab` is then handed to the
# validation and test loaders. The third return value is discarded.
# NOTE(review): shuffle_batch=False even though is_train=True — confirm that
# an unshuffled training loader is intended.
train_iter, vocab, _ = build_dataloader(
    file_path=ds_config.train_data, 
    vocab_size=ds_config.vocab_size,
    vocab_min_freq=ds_config.min_freq,
    vocab=None,
    is_train=True,
    shuffle_batch=False,
    max_num_reviews=ds_config.max_num_reviews,
    refs_path=None,
    max_len_rev=ds_config.max_len_rev,
    pin_memory=ds_config.pin_memory,
    num_workers=ds_config.workers,
    batch_size=ds_config.batch_size,
    device=ds_config.device
)

# Length of the training loader (284 in the recorded run, matching the
# "/284 batch" progress bar of the training cell); displayed as cell output.
train_size = len(train_iter)
train_size

Loading data: 100%|█████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 207.36item/s]


135 (6.32%) duplicated reviews added.
11 imbalanced batches found.


Build vocabulary: 100%|███████████████████████████████████████████████████████████| 2272/2272 [00:22<00:00, 101.61it/s]

Vocabulary size: 8677





284

In [3]:
# Build the validation DataLoader with the vocabulary returned by the
# training loader (vocab=vocab, is_train=False). The returned vocabulary is
# discarded; the third return value is kept as `valid_references`.
# NOTE(review): refs_path=None, so what `valid_references` contains depends
# on build_dataloader — confirm against its implementation.
valid_iter, _, valid_references = build_dataloader(
    file_path=ds_config.valid_data, 
    vocab_size=ds_config.vocab_size,
    vocab_min_freq=ds_config.min_freq,
    vocab=vocab,
    is_train=False,
    shuffle_batch=False,
    max_num_reviews=ds_config.max_num_reviews,
    refs_path=None,
    max_len_rev=ds_config.max_len_rev,
    pin_memory=ds_config.pin_memory,
    num_workers=ds_config.workers,
    batch_size=ds_config.batch_size,
    device=ds_config.device
)

# Length of the validation loader; displayed as the cell output.
valid_size = len(valid_iter)
valid_size

Loading data: 100%|█████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 146.90item/s]


43 (5.87%) duplicated reviews added.
4 imbalanced batches found.


97

In [4]:
# Build the test DataLoader with the vocabulary returned by the training
# loader (vocab=vocab, is_train=False). The returned vocabulary is discarded;
# the third return value is kept as `test_references`.
# NOTE(review): max_num_reviews is hard-coded to 15 here instead of
# ds_config.max_num_reviews (left commented out on the same line) — confirm
# the override is intentional and consider moving the value into the config.
test_iter, _, test_references = build_dataloader(
    file_path=ds_config.test_data, 
    vocab_size=ds_config.vocab_size,
    vocab_min_freq=ds_config.min_freq,
    vocab=vocab,
    is_train=False,
    shuffle_batch=False,
    max_num_reviews=15,#ds_config.max_num_reviews,
    refs_path=None,
    max_len_rev=ds_config.max_len_rev,
    pin_memory=ds_config.pin_memory,
    num_workers=ds_config.workers,
    batch_size=ds_config.batch_size,
    device=ds_config.device
)

# Length of the test loader; displayed as the cell output.
test_size = len(test_iter)
test_size

Loading data: 100%|██████████████████████████████████████████████████████████████| 207/207 [00:00<00:00, 1280.73item/s]


0 (0.00%) duplicated reviews added.
0 imbalanced batches found.


207

# Train Model

In [16]:
# Enable IPython's autoreload extension in mode 2 so that edits to the
# project modules are picked up without restarting the kernel.
# NOTE(review): the recorded execution count shows this cell was run out of
# order and the extension was already loaded; it belongs in the first
# configuration cell so that Restart & Run All behaves the same way.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# NOTE(review): these three imports repeat imports already made in the first
# cell — presumably re-run to pick up edited modules; confirm and
# consolidate into the top import cell if they are no longer needed here.
from Procedures import Procedure
from model.lm import LanguageModel
from configs.config import DatasetConfig, HP

# Re-create the configuration objects. This rebinds `ds_config` and `hp`
# after the DataLoaders above were already built from the earlier instances.
ds_config = DatasetConfig()
hp = HP()

In [6]:
from configs.config import follow

In [7]:
# Take the value stored under follow["Name"] and pass it as the
# SummaryWriter `comment` (per the torch.utils.tensorboard API, the comment
# is appended as a suffix to the default run directory name).
comment = follow["Name"]
writer = SummaryWriter(comment=comment)

In [8]:
# Create the training/generation driver from the vocabulary, the TensorBoard
# writer and the train/validation loaders.
# NOTE(review): the keyword is spelled `train_ter` (not `train_iter`). The
# recorded run went on to train with it, so Procedure presumably declares the
# parameter under that name — verify in Procedures.py and rename on both
# sides together if it is a typo.
procedure = Procedure(vocab, writer=writer, train_ter=train_iter, valid_iter=valid_iter)

### Train the Language model

In [9]:
# Train the language model. In the recorded run this downloaded the GloVe 6B
# vectors into .vector_cache, announced 300 epochs, and was stopped manually
# (KeyboardInterrupt) at batch 218/284 of the running epoch.
# NOTE(review): the later cells load checkpoints from ./outputs rather than
# relying on this interrupted run; guard or skip this cell when only
# generation is needed.
procedure.train_lm()

.vector_cache\glove.6B.zip: 862MB [06:26, 2.23MB/s]                                                                    
100%|██████████████████████████████████████████████████████████████████████▉| 399999/400000 [00:22<00:00, 17573.27it/s]


Training Language Model for 300 epochs...


 77%|███████████████████████████████████████████████████████████                  | 218/284 [02:00<00:36,  1.81batch/s]


KeyboardInterrupt: 

In [44]:
# Reconstruct reviews from the training loader with a previously saved
# language-model checkpoint, restricted to batch_idx 1 through 10.
reviews = procedure.generate(
    itr=train_iter,
    lm_path="./outputs/fullLM.batch_8_docs.180_epochs.Htilt.lr0.0001.pt",
    batch_idx=list(range(1, 11)),
)

Loading Language Model from ./outputs/fullLM.batch_8_docs.180_epochs.Htilt.lr0.0001.pt


#### Save the reconstructed reviews

In [45]:
def match(og, rec):
    """Return the percentage of the original's distinct tokens found in the reconstruction.

    Both strings are split on whitespace and compared as sets, so repeated
    words and word order are ignored.

    Args:
        og: Original review text.
        rec: Reconstructed review text.

    Returns:
        Percentage in [0, 100], rounded to two decimals. Returns 0.0 when
        `og` contains no tokens (empty or whitespace-only), instead of
        raising ZeroDivisionError.
    """
    og_tokens = set(og.split())
    if not og_tokens:
        # Nothing to match against: report 0% rather than dividing by zero.
        return 0.0
    shared = og_tokens & set(rec.split())
    return round(100 * len(shared) / len(og_tokens), 2)

# Build one row per (product id, reconstructed review, original review)
# triple, score the token overlap, and persist the table as CSV.
df = pd.DataFrame(
    [
        {
            "dataset": "train",
            "prod_id": pid,
            "original": og_rev,
            "reconstructed": rec_rev,
            "% match": match(og_rev, rec_rev),
        }
        for pid, rec_rev, og_rev in reviews
    ]
)
df.to_csv("./lm_180epochs.reconst_reviews_train.full.csv", index=False)

### Train the summarizer

In [21]:
# Train the summarizer on top of a saved language-model checkpoint.
# NOTE(review): this loads a baseLM checkpoint, whereas the reconstruction
# cell above loads a fullLM checkpoint — confirm which one is intended.
# NOTE(review): the recorded log reads "Training Summarizer for 0 epochs...",
# so no summarizer training appears to have run; the epoch count is not set
# in this cell.
# NOTE(review): the file name contains a double dot ("baseLM..batch"); the
# log shows it loading, so the path is left unchanged.
procedure.train(lm_path="./outputs/baseLM..batch_8_docs.300_epochs.baseH.lr0.0001.pt")

Loading Language Model from ./outputs/baseLM..batch_8_docs.300_epochs.baseH.lr0.0001.pt
Training Summarizer for 0 epochs...


In [32]:
#len(list(range(0, 1460, 5)))

292

In [25]:
# Generate summaries over the training loader from a saved summarizer
# checkpoint; batch_idx=None is passed and the recorded run iterated all
# 284 batches, taking about 5330 seconds (~1h29).
summaries, hiddens, mean_hiddens = procedure.generate_summaries(
    itr=train_iter,
    batch_idx=None,
    path="outputs/summ.fullLM.batch_8_docs.lm_lr0.0001.0_epochs.dec_baseH.sum_baseH.lr1e-05.v3.pt",
)

Loading summarizer from outputs/summ.fullLM.batch_8_docs.lm_lr0.0001.0_epochs.dec_baseH.sum_baseH.lr1e-05.v3.pt


100%|██████████████████████████████████████████████████████████████████████████████| 284/284 [1:28:50<00:00, 18.77s/it]

completed after 5330.724611 seconds





#### Save the generated summaries

In [27]:
# Flatten the generated summaries into a table and persist it as CSV.
# Each entry's first element is used as the product id and the first item of
# its second element as the summary text.
# A "model" description column was previously considered and is left out.
df = pd.DataFrame(
    [
        {
            "dataset": "train",
            "prod_id": entry[0],
            "summary": entry[1][0],
        }
        for entry in summaries
    ]
)
df.to_csv("./outputs/train_300epochs.summaries_v5_20221227__baseLM.no_finetuning.greedy_dec.gumbel.mini.v3.csv", index=False)

In [26]:
#torch.save(model.state_dict(), "saved_model.pt")

In [28]:
#model.load_state_dict(torch.load("saved_model.pt"))

<All keys matched successfully>

In [None]:
#! tensorboard --logdir=runs