In [1]:
# Notebook tooling: load the lab_black extension (code formatting for cells).
%load_ext lab_black
# Load IPython's autoreload extension and switch it to mode 2, so imported
# modules are reloaded before each cell execution and edits to project code
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path

# Put the parent directory on the import path (presumably where the project
# modules `settings` and `funcs` imported below live -- TODO confirm).
# Guarded so that re-running this cell does not pile up duplicate entries.
if ".." not in sys.path:
    sys.path.append("..")

In [4]:
import torch
import transformers
import numpy as np
import pandas as pd
from transformers import BertTokenizerFast

In [11]:
import settings
from funcs.utils import find_project_root

In [29]:
from funcs.haiku_data_module import DataModule

In [7]:
# Project root as returned by the project helper; joined with a relative path
# from `settings` further down to locate the haiku corpus.
ROOT = find_project_root()

In [8]:
# Model identifier read from the project settings; passed to
# `BertTokenizerFast.from_pretrained` below.
MODEL_NAME = settings.english_bert_model_name

In [9]:
# Load the fast BERT tokenizer that belongs to MODEL_NAME.
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
# Bare expression: show the tokenizer's repr as the cell output.
tokenizer

<transformers.tokenization_bert.BertTokenizerFast at 0x7f94ee105210>

In [13]:
# Read the haiku corpus CSV; its location relative to ROOT comes from settings.
haiku_corpus = pd.read_csv(ROOT / settings.path_to_haiku_corpus)
# Bare expression: display the frame as the cell output.
haiku_corpus

Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables
0,Memorial Day --,a shadow for each,white cross,tempslibres,5,5,2
1,spring rain -,as the doctor speaks,i think of lilacs,tempslibres,23,5,5
2,spring moonset --,a rice ball for,breakfast,tempslibres,34,4,2
3,sunny afternoon,an old man lingers,near the mailbox,tempslibres,5,5,4
4,cinco de mayo,horses roll,in the shallows,tempslibres,5,3,4
...,...,...,...,...,...,...,...
143132,I'm not asking did,you say it nor clarify,what you said neither,twaiku,5,7,5
143133,You are truly a,moron or a liar I'm,inclined to think both,twaiku,5,7,5
143134,Ain't no selfie on,this earth that's gonna make me,like Theresa May,twaiku,5,7,5
143135,is doing a great,job turning Independents,into Democrats,twaiku,5,7,5


In [19]:
# Drop every haiku whose syllable count is ambiguous, i.e. any of the three
# per-line count cells contains a comma (presumably a list of alternative
# counts -- TODO confirm against the corpus builder).
df_haiku = haiku_corpus
for syllable_col in ("0_syllables", "1_syllables", "2_syllables"):
    has_comma = df_haiku[syllable_col].apply(lambda count: "," in count)
    # The flag is kept in a scratch "remove" column; after filtering it is
    # still present in the frame (all False).
    df_haiku = df_haiku.assign(remove=has_comma)
    df_haiku = df_haiku.loc[~df_haiku["remove"]]
df_haiku

Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables,remove
0,Memorial Day --,a shadow for each,white cross,tempslibres,5,5,2,False
3,sunny afternoon,an old man lingers,near the mailbox,tempslibres,5,5,4,False
4,cinco de mayo,horses roll,in the shallows,tempslibres,5,3,4,False
5,quitting time,the smell of rain,in the lobby,tempslibres,3,4,4,False
7,overnight rain --,the scent of orange blossoms,in a desert town,tempslibres,4,7,5,False
...,...,...,...,...,...,...,...,...
143132,I'm not asking did,you say it nor clarify,what you said neither,twaiku,5,7,5,False
143133,You are truly a,moron or a liar I'm,inclined to think both,twaiku,5,7,5,False
143134,Ain't no selfie on,this earth that's gonna make me,like Theresa May,twaiku,5,7,5,False
143135,is doing a great,job turning Independents,into Democrats,twaiku,5,7,5,False


In [20]:
# Character length of each of the three haiku lines (columns "0", "1", "2").
line_lengths = [df_haiku[line_col].str.len() for line_col in ("0", "1", "2")]

# Length cap: the largest of the three per-line 99th-percentile lengths,
# truncated to an integer.
top_line_length = int(max(lengths.quantile(0.99) for lengths in line_lengths))
print(top_line_length)

# Keep only haiku in which every line is within the cap.
fits_cap = (
    (line_lengths[0] <= top_line_length)
    & (line_lengths[1] <= top_line_length)
    & (line_lengths[2] <= top_line_length)
)
df_haiku = df_haiku[fits_cap]
df_haiku

54


Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables,remove
0,Memorial Day --,a shadow for each,white cross,tempslibres,5,5,2,False
3,sunny afternoon,an old man lingers,near the mailbox,tempslibres,5,5,4,False
4,cinco de mayo,horses roll,in the shallows,tempslibres,5,3,4,False
5,quitting time,the smell of rain,in the lobby,tempslibres,3,4,4,False
7,overnight rain --,the scent of orange blossoms,in a desert town,tempslibres,4,7,5,False
...,...,...,...,...,...,...,...,...
143132,I'm not asking did,you say it nor clarify,what you said neither,twaiku,5,7,5,False
143133,You are truly a,moron or a liar I'm,inclined to think both,twaiku,5,7,5,False
143134,Ain't no selfie on,this earth that's gonna make me,like Theresa May,twaiku,5,7,5,False
143135,is doing a great,job turning Independents,into Democrats,twaiku,5,7,5,False


In [23]:
# Label each haiku with its syllable pattern. The three syllable columns hold
# strings, so `+` concatenates them ("5" + "7" + "5" -> "575"); it does not sum.
# NOTE(review): with no separator, patterns containing multi-digit counts
# (e.g. "101010") cannot be split back into per-line counts unambiguously.
#
# `assign` builds a new frame instead of writing a column onto `df_haiku`,
# which at this point is the result of boolean filtering; direct item
# assignment on such a frame triggers pandas' SettingWithCopyWarning.
df_haiku = df_haiku.assign(
    type=df_haiku["0_syllables"] + df_haiku["1_syllables"] + df_haiku["2_syllables"]
)
df_haiku

Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables,remove,type
0,Memorial Day --,a shadow for each,white cross,tempslibres,5,5,2,False,552
3,sunny afternoon,an old man lingers,near the mailbox,tempslibres,5,5,4,False,554
4,cinco de mayo,horses roll,in the shallows,tempslibres,5,3,4,False,534
5,quitting time,the smell of rain,in the lobby,tempslibres,3,4,4,False,344
7,overnight rain --,the scent of orange blossoms,in a desert town,tempslibres,4,7,5,False,475
...,...,...,...,...,...,...,...,...,...
143132,I'm not asking did,you say it nor clarify,what you said neither,twaiku,5,7,5,False,575
143133,You are truly a,moron or a liar I'm,inclined to think both,twaiku,5,7,5,False,575
143134,Ain't no selfie on,this earth that's gonna make me,like Theresa May,twaiku,5,7,5,False,575
143135,is doing a great,job turning Independents,into Democrats,twaiku,5,7,5,False,575


In [27]:
# How often each syllable pattern occurs (value_counts sorts most frequent first).
df_haiku["type"].value_counts()

575       93390
354         438
344         412
101010      397
353         375
          ...  
6103          1
8613          1
419           1
11102         1
9414          1
Name: type, Length: 1796, dtype: int64

In [30]:
# Instantiate the project's haiku DataModule. No arguments are passed, so
# whatever defaults the class defines apply.
data_module = DataModule()

2020-10-21 20:28:50.186 | INFO     | funcs.haiku_data_module:__init__:46 - data module hparams: {'max_tokenization_length': 128, 'batch_size': 16, 'num_workers': 2}


In [31]:
# Prepare the data module's datasets (presumably required before
# `train_dataloader()` can be used -- TODO confirm in funcs.haiku_data_module).
data_module.setup()

2020-10-21 20:28:57.075 | INFO     | funcs.haiku_data_module:setup:49 - Loading train dataset
2020-10-21 20:28:57.078 | INFO     | funcs.haiku_data_module:get_dataset:88 - get dataset
2020-10-21 20:29:02.773 | INFO     | funcs.haiku_data_module:get_dataset:93 - cache to path: /work/ik18445/projects/yapg/datasets/output/haiku_128.pt


In [33]:
# Pull a single example batch out of the training loader for inspection.
data_loader = data_module.train_dataloader()
print(len(data_loader))  # length of the loader
IDX = 2  # zero-based position of the batch to keep
for idx, batch in enumerate(data_loader):
    if idx == IDX:
        data_batch = batch
        break
else:
    # The loop ended without reaching IDX. Fail loudly instead of leaving
    # `data_batch` undefined, or silently stale from an earlier run of this cell.
    raise IndexError(
        f"train dataloader yielded fewer than {IDX + 1} batches; "
        f"cannot select batch {IDX}"
    )
data_batch

5837


[tensor([[  101,  1037,  3634,  ...,     0,     0,     0],
         [  101,  1045,  2387,  ...,     0,     0,     0],
         [  101,  2255, 10434,  ...,     0,     0,     0],
         ...,
         [  101,  2002,  4332,  ...,     0,     0,     0],
         [  101,  2016, 12842,  ...,     0,     0,     0],
         [  101, 15544, 19722,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]])]