In [1]:
import pandas as pd

In [2]:
# Relative path to the raw haiku CSV.
haiku_path = '../data/raw/haikus.csv'

# Tunable settings, grouped by stage.
# TODO: load these from a config/hyperparameter JSON file instead of
# hard-coding them in the notebook.
hyperparams = dict(
    # Quantile of line character length used to derive the length cutoff
    # applied when filtering the dataset.
    preprocessing=dict(max_line_len_quantile=0.9),
    # Model/training settings (not consumed in this part of the notebook).
    model=dict(latent_dim=2048, epochs=20, learning_rate=0.01),
)

In [3]:
# Load the haiku dataset and discard the 'source' column in one step.
# Remaining cols: [0, 1, 2, 0_syllables, 1_syllables, 2_syllables]
# (the column labels are strings, e.g. '0' and '0_syllables').
df = pd.read_csv(haiku_path).drop(columns='source')

In [4]:
# A syllable count that contains a comma lists several alternatives, because
# the line has more than one possible pronunciation. Split each count on the
# comma and explode it so that every alternative gets its own row. The columns
# are exploded one after another, so a haiku with ambiguous counts on several
# lines ends up with one row per combination.
# Note: exploded rows keep the index label of the row they came from, so
# index labels repeat afterwards.
syllable_cols = ['{}_syllables'.format(line_no) for line_no in range(3)]
for syllable_col in syllable_cols:
    alternatives = df[syllable_col].str.split(',')
    df = df.assign(**{syllable_col: alternatives}).explode(syllable_col)
print(df)

                         0                                 1  \
0          Memorial Day --                 a shadow for each   
1            spring rain -              as the doctor speaks   
1            spring rain -              as the doctor speaks   
2        spring moonset --                   a rice ball for   
2        spring moonset --                   a rice ball for   
...                    ...                               ...   
143132  I'm not asking did            you say it nor clarify   
143133     You are truly a               moron or a liar I'm   
143134  Ain't no selfie on   this earth that's gonna make me   
143135    is doing a great          job turning Independents   
143136    Wanted to send a         quick follow up on if the   

                              2 0_syllables 1_syllables 2_syllables  
0                   white cross           5           5           2  
1             i think of lilacs           2           5           5  
1             i think

In [5]:
# Keep only the haikus whose three lines are all shorter than a character
# length cutoff. The cutoff is the largest of the three per-line length
# quantiles, truncated to an int. The quantile itself is a hyperparameter;
# may want to modify this value and see how it changes.
# NOTE(review): the comparison is strict, so lines whose length equals
# max_len are dropped too — confirm that is intended.
quantile = hyperparams['preprocessing']['max_line_len_quantile']

# Character length of every line, computed once per text column.
line_lengths = [df[str(line_no)].str.len() for line_no in range(3)]
max_len = int(max(lengths.quantile(quantile) for lengths in line_lengths))

# A row survives only if every one of its lines is under the cutoff.
keep = line_lengths[0] < max_len
for lengths in line_lengths[1:]:
    keep = keep & (lengths < max_len)
df = df[keep]
print(df)

                         0                                 1  \
0          Memorial Day --                 a shadow for each   
1            spring rain -              as the doctor speaks   
1            spring rain -              as the doctor speaks   
2        spring moonset --                   a rice ball for   
2        spring moonset --                   a rice ball for   
...                    ...                               ...   
143132  I'm not asking did            you say it nor clarify   
143133     You are truly a               moron or a liar I'm   
143134  Ain't no selfie on   this earth that's gonna make me   
143135    is doing a great          job turning Independents   
143136    Wanted to send a         quick follow up on if the   

                              2 0_syllables 1_syllables 2_syllables  
0                   white cross           5           5           2  
1             i think of lilacs           2           5           5  
1             i think