In [1]:
import pandas as pd
import nltk
from collections import Counter
from tqdm import tqdm

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\damia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
NUM_SEED_WORDS = 50
REMOVED = {'i','thou','thee', 'o', '\'t', 'ah', 'ye', 'someone', 'till', 'oh', 'come', '~' ,'thy', 'thing', 'something', 'shit', 'everyone', 'everything', 'person', 'ass', 'anyone', 'lol', 'im', 'fuck', 'anything', 'gon', 'bitch', 'twitter', 'lot', 'cause'}

In [3]:
haiku_path = '../input/haikus.csv'

In [4]:
# DataFrame of the haiku dataset
# cols: [0, 1, 2, source, 0_syllables, 1_syllables, 2_syllables]
df = pd.read_csv(haiku_path)
df = df[df['source'].isin(('tempslibres', 'haikuzao', 'sballas'))]
df = df.drop(columns='source')
df = df.drop(columns=['%s_syllables' % i for i in range(3)])


In [5]:
nouns = Counter()
noun_phrases = Counter()

for i in tqdm(range(len(df))):
  for j in range(3):
    line = str(df[str(j)].iloc[i])
    tokens = nltk.word_tokenize(line)
    
    prev_pos = None
    prev_tok = None
    for tok, pos in nltk.pos_tag(tokens):
      tok = tok.lower()
      if pos == 'NN':
        if prev_pos == 'JJ':
          noun_phrases[f'{prev_tok} {tok}'] += 1
        nouns[tok] += 1
      prev_pos = pos
      prev_tok = tok

100%|██████████| 14078/14078 [01:54<00:00, 123.35it/s]


In [6]:
def get_most_frequent(dictionary):
  return list(sorted(dictionary.items(), key=lambda item: item[1], reverse=True))

In [7]:
def remove_words(corpus, to_remove):
  return [w for w in corpus if w not in to_remove]

In [8]:
get_most_frequent(noun_phrases)[:50]

[('full moon', 101),
 ('new year', 69),
 ('indian summer', 47),
 ('new moon', 30),
 ('distant thunder', 26),
 ('last night', 24),
 ('blue sky', 22),
 ('old man', 21),
 ('cold rain', 21),
 ('cold night', 21),
 ('deep winter', 20),
 ('low tide', 18),
 ('warm day', 18),
 ('last day', 17),
 ('last year', 17),
 ('early spring', 15),
 ('soft rain', 15),
 ('milky way', 15),
 ('late summer', 14),
 ('open window', 14),
 ('fresh snow', 14),
 ('high tide', 14),
 ('last light', 13),
 ('cold morning', 13),
 ('small town', 13),
 ('old dog', 13),
 ('deep autumn', 12),
 ('last time', 12),
 ('rainy day', 11),
 ('little girl', 11),
 ('blue heron', 11),
 ('cold moon', 11),
 ('memorial day', 11),
 ('late afternoon', 10),
 ('gibbous moon', 10),
 ('long night', 10),
 ('other side', 10),
 ('old cat', 10),
 ('old friend', 9),
 ('cool morning', 9),
 ('old pond', 9),
 ('first time', 9),
 ('stray dog', 9),
 ('white butterfly', 9),
 ('steady rain', 9),
 ('heavy rain', 9),
 ('hot afternoon', 8),
 ('small talk', 8)

In [9]:
seed_words = get_most_frequent(nouns)
seed_words = [e[0] for e in seed_words]
seed_words = remove_words(seed_words, REMOVED)
seed_words = seed_words[:NUM_SEED_WORDS]
print(seed_words)


['moon', 'rain', 'morning', 'night', 'summer', 'winter', 'day', 'spring', 'autumn', 'wind', 'sky', 'snow', 'sun', 'light', 'window', 'end', 'scent', 'shadow', 'dog', 'sound', 'heat', 'fog', 'home', 'year', 'river', 'garden', 'afternoon', 'dusk', 'tree', 'sunset', 'breeze', 'song', 'cat', 'smell', 'dawn', 'water', 'storm', 'time', 'way', 'evening', 'grass', 'silence', 'tea', 'mother', 'nan', 'mist', 'leaf', 'house', 'child', 'blue']


In [10]:
with open("seed_words.txt", "w") as f:
  for sw in seed_words:
    f.write(str(sw) + "\n")