In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Load the raw corpus; readlines() yields one list entry per line of the file.
# The encoding is pinned so decoding does not depend on the platform default:
# the vocabulary contains non-ASCII words (e.g. "émeutes", "équilibre").
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open('haikus_all.txt', encoding='utf-8') as f:
    haikus = f.readlines()

In [7]:
# Remove the newline characters that readlines() keeps on each entry.
haikus = [line.replace("\n", "") for line in haikus]

In [9]:
# Preview the first ten entries of the cleaned list.
haikus[:10]

[" on new year's day a cute little pilgrim at the gate",
 " new year's day this world of japan's blossoms",
 " basking in the new year's sun  my trashy hut",
 " around noon new year's day begins  little hut",
 " no run of the mill new year's day for the slob",
 ' first month second day  my wrinkled hands',
 ' first month  recording the cash spent on sake',
 " on the cat's grave in first month  dried sardines",
 " to one side of my paper lantern  spring's first dawn",
 " at my hut what will come of it  spring's first dawn"]

### Vectorizing Haikus and Data Prep

In [10]:
# Bag-of-words counts with scikit-learn's built-in English stop-word list
# removed; fit_transform learns the vocabulary and encodes every haiku.
vectorizer = CountVectorizer(stop_words="english")
doc_word = vectorizer.fit_transform(haikus)

In [11]:
# Dimensions of the count matrix: one row per haiku, one column per vocabulary term.
doc_word.shape

(14999, 10188)

In [12]:
# Preview the document-term counts for the first ten haikus.
# Only the previewed rows are converted to a dense array: calling .toarray()
# on the whole sparse matrix allocates a dense array for every haiku and every
# vocabulary term just to display ten rows.
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
n_preview = 10
pd.DataFrame(
    doc_word[:n_preview].toarray(),
    index=haikus[:n_preview],
    columns=vectorizer.get_feature_names_out(),
)

Unnamed: 0,00,000,06,10,100,100th,10th,11,11th,12th,...,zijderoute,zip,zombie,zone,zoni,zoo,zucchini,zuigan,émeutes,équilibre
on new year's day a cute little pilgrim at the gate,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
new year's day this world of japan's blossoms,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
basking in the new year's sun my trashy hut,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
around noon new year's day begins little hut,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
no run of the mill new year's day for the slob,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
first month second day my wrinkled hands,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
first month recording the cash spent on sake,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
on the cat's grave in first month dried sardines,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
to one side of my paper lantern spring's first dawn,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
at my hut what will come of it spring's first dawn,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### LSA Model

In [17]:
# Latent Semantic Analysis: project the count matrix onto 3 components.
# TruncatedSVD's default solver is randomized, so the seed is fixed to make
# the components (and every table derived from them) reproducible on re-run.
lsa = TruncatedSVD(n_components=3, random_state=42)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

array([0.0078193 , 0.01156697, 0.0105899 ])

In [21]:
# Topic-by-word loading table for the LSA model (values rounded to 3 decimals).
# Row labels are derived from the number of fitted components so this cell
# keeps working if n_components is changed; with 3 components they are
# "t1", "t2", "t3" as before.
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
topic_word = pd.DataFrame(
    lsa.components_.round(3),
    index=[f"t{k}" for k in range(1, lsa.components_.shape[0] + 1)],
    columns=vectorizer.get_feature_names_out(),
)
topic_word

Unnamed: 0,00,000,06,10,100,100th,10th,11,11th,12th,...,zijderoute,zip,zombie,zone,zoni,zoo,zucchini,zuigan,émeutes,équilibre
t1,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0
t2,-0.0,0.0,-0.0,0.0,0.0,0.0,0.001,-0.0,-0.0,-0.0,...,0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0
t3,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.001,0.0,...,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0


In [22]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_`` as an iterable of 1-D arrays
        (one array of word weights per topic) that support ``argsort()``.
    feature_names : sequence of str
        Vocabulary; ``feature_names[i]`` is the word for weight column ``i``.
    no_top_words : int
        How many of the highest-weighted words to print per topic.
    topic_names : sequence of str, optional
        Human-readable topic labels. A topic whose label is missing, empty,
        or beyond the end of this sequence is labelled by its index instead.

    Returns
    -------
    None
        Output is written to stdout.
    """
    for ix, topic in enumerate(model.components_):
        # Tolerate a label collection that does not cover every topic rather
        # than failing part-way through the printout.
        try:
            label = topic_names[ix] if topic_names else None
        except (IndexError, KeyError):
            label = None

        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", ix)

        # argsort is ascending, so reverse it and keep the leading entries to
        # get the indices of the largest weights first.
        top_indices = topic.argsort()[::-1][:no_top_words]
        print(", ".join(feature_names[i] for i in top_indices))

In [25]:
# Show the 50 highest-weighted words of each LSA topic.
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
display_topics(lsa, vectorizer.get_feature_names_out(), 50)


Topic  0
rain, moon, night, spring, blossoms, day, winter, year, new, mountain, cherry, autumn, summer, little, morning, plum, evening, rice, old, snow, cold, tree, wind, field, house, month, buddha, gate, cool, temple, harvest, long, village, like, leaves, home, sky, grass, man, pine, breeze, hut, just, water, falling, world, window, river, cuckoo, big

Topic  1
blossoms, cherry, plum, year, new, mountain, tree, old, buddha, world, little, scatter, blossom, day, bloom, fall, deutzia, trees, blooming, people, man, nightingale, temple, village, field, peach, lotus, child, shade, eyes, viewing, rice, frog, dog, edo, morning, scattering, horse, snow, evening, straw, japan, cat, mouth, butterfly, lucky, amid, sky, floating, amida

Topic  2
moon, night, summer, new, harvest, year, cold, cool, morning, sickle, gazing, bright, gate, pine, hut, autumn, crescent, old, short, sky, long, river, air, window, wind, white, half, just, sake, like, little, mosquito, tonight, stars, home, end, water, 

### NMF Model

In [33]:
# Non-negative matrix factorization of the count matrix into 2 topics.
# random_state is fixed so the factorization is reproducible across re-runs
# (NMF accepts it for its initialisation/solver; see the scikit-learn docs).
nmf_model = NMF(n_components=2, random_state=42)
doc_topic2 = nmf_model.fit_transform(doc_word)

In [34]:
# Topic-by-word weight table for the NMF model (values rounded to 3 decimals).
# Row labels are derived from the number of fitted components so this cell
# keeps working if n_components is changed; with 2 components they are
# "t1", "t2" as before.
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
topic_word2 = pd.DataFrame(
    nmf_model.components_.round(3),
    index=[f"t{k}" for k in range(1, nmf_model.components_.shape[0] + 1)],
    columns=vectorizer.get_feature_names_out(),
)
topic_word2

Unnamed: 0,00,000,06,10,100,100th,10th,11,11th,12th,...,zijderoute,zip,zombie,zone,zoni,zoo,zucchini,zuigan,émeutes,équilibre
t1,0.006,0.0,0.003,0.0,0.0,0.0,0.001,0.0,0.003,0.0,...,0.0,0.0,0.0,0.002,0.006,0.002,0.0,0.001,0.002,0.0
t2,0.0,0.0,0.0,0.0,0.0,0.0,0.004,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Show the 50 highest-weighted words of each NMF topic.
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
display_topics(nmf_model, vectorizer.get_feature_names_out(), 50)


Topic  0
rain, night, moon, spring, winter, day, new, year, summer, autumn, mountain, morning, cold, evening, snow, little, rice, wind, old, month, house, field, gate, harvest, cool, long, tree, grass, breeze, temple, leaves, home, sky, like, pine, buddha, hut, window, village, river, just, man, water, fifth, falling, cuckoo, geese, air, tea, big

Topic  1
blossoms, cherry, plum, mountain, tree, buddha, old, world, little, scatter, blossom, rice, field, year, evening, temple, fall, trees, bloom, village, blooming, man, deutzia, day, house, nightingale, dog, people, month, child, peach, horse, falling, shade, lotus, frog, gate, big, like, new, eyes, morning, viewing, water, edo, come, cat, clouds, face, scattering


In [46]:
# Per-haiku NMF topic weights (rounded to 2 decimals), indexed by haiku text;
# the column labels are hand-assigned names for the two NMF topics.
topic_df = pd.DataFrame(
    data=doc_topic2.round(2),
    index=haikus,
    columns=["seasons_date", "nature"],
)
topic_df

Unnamed: 0,seasons_date,nature
on new year's day a cute little pilgrim at the gate,0.10,0.02
new year's day this world of japan's blossoms,0.07,0.18
basking in the new year's sun my trashy hut,0.06,0.01
around noon new year's day begins little hut,0.10,0.01
no run of the mill new year's day for the slob,0.08,0.01
first month second day my wrinkled hands,0.05,0.01
first month recording the cash spent on sake,0.01,0.00
on the cat's grave in first month dried sardines,0.02,0.01
to one side of my paper lantern spring's first dawn,0.06,0.00
at my hut what will come of it spring's first dawn,0.06,0.00
