In [2]:
import nltk, re
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

stop_words = stopwords.words('english')
normalizer = WordNetLemmatizer()

def get_part_of_speech(word):
  probable_part_of_speech = wordnet.synsets(word)
  pos_counts = Counter()
  pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
  pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
  pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
  pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
  return most_likely_part_of_speech

def preprocess_text(text):
  cleaned = re.sub(r'\W+', ' ', text).lower()
  tokenized = word_tokenize(cleaned)
  normalized = " ".join([normalizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized])
  return normalized

In [3]:
poem_1 = '''
Success is counted sweetest
By those who ne'er succeed.
To comprehend a nectar
Requires sorest need.

Not one of all the purple host
Who took the flag to-day
Can tell the definition,
So clear, of victory,

As he, defeated, dying,
On whose forbidden ear
The distant strains of triumph
Break, agonized and clear!'''

poem_2 = '''
Wild nights! Wild nights!
Were I with thee,
Wild nights should be
Our luxury!

Futile the winds
To a heart in port, —
Done with the compass,
Done with the chart.

Rowing in Eden!
Ah! the sea!
Might I but moor
To-night in thee!'''

poem_3 = '''
I'm nobody! Who are you?
Are you nobody, too?
Then there 's a pair of us — don't tell!
They 'd banish us, you know.

How dreary to be somebody!
How public, like a frog
To tell your name the livelong day
To an admiring bog!'''

poem_4 = '''
I felt a funeral in my brain,
   And mourners, to and fro,
Kept treading, treading, till it seemed
   That sense was breaking through.

And when they all were seated,
   A service like a drum
Kept beating, beating, till I thought
   My mind was going numb.

And then I heard them lift a box,
   And creak across my soul
With those same boots of lead, again.
   Then space began to toll

As all the heavens were a bell,
   And Being but an ear,
And I and silence some strange race,
   Wrecked, solitary, here.'''

poem_5 = '''
Hope is the thing with feathers
That perches in the soul,
And sings the tune without the words,
And never stops at all,

And sweetest in the gale is heard;
And sore must be the storm
That could abash the little bird
That kept so many warm.

I 've heard it in the chillest land,
And on the strangest sea;
Yet, never, in extremity,
It asked a crumb of me.'''

poem_6 = '''
The pedigree of honey
Does not concern the bee;
A clover, any time, to him
Is aristocracy.'''

poems = [poem_1, poem_2, poem_3, poem_4, poem_5, poem_6]

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
#from preprocessing import preprocess_text
#from poems import poems

# preprocess text
processed_poems = [preprocess_text(poem) for poem in poems]

# initialize and fit CountVectorizer
vectorizer = CountVectorizer()
term_frequencies = vectorizer.fit_transform(processed_poems)
# my bAdded
#print(processed_poems)
# get vocabulary of terms
feature_names = vectorizer.get_feature_names()

# get corpus index
corpus_index = [f"Poem {i+1}" for i in range(len(poems))]

# create pandas DataFrame with term frequencies
df_term_frequencies = pd.DataFrame(term_frequencies.T.todense(), index=feature_names, columns=corpus_index)

In [5]:
#import codecademylib3_seaborn
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
#from term_frequency import term_frequencies, feature_names, df_term_frequencies

# display term-document matrix of term frequencies
print(df_term_frequencies)

# initialize and fit TfidfTransformer
transformer = TfidfTransformer()
transformer.fit(term_frequencies)


# create pandas DataFrame with inverse document frequencies
try:
  df_idf = pd.DataFrame(idf_values, index = feature_names, columns=['Inverse Document Frequency'])
  print(df_idf)
except:
  pass

         Poem 1  Poem 2  Poem 3  Poem 4  Poem 5  Poem 6
abash         0       0       0       0       1       0
across        0       0       0       1       0       0
admire        0       0       1       0       0       0
again         0       0       0       1       0       0
agonize       1       0       0       0       0       0
...         ...     ...     ...     ...     ...     ...
word          0       0       0       0       1       0
wreck         0       0       0       1       0       0
yet           0       0       0       0       1       0
you           0       0       3       0       0       0
your          0       0       1       0       0       0

[173 rows x 6 columns]


In [6]:
# for demo
print(processed_poems)
print(term_frequencies)

['success be count sweet by those who ne er succeed to comprehend a nectar require sorest need not one of all the purple host who take the flag to day can tell the definition so clear of victory a he defeat die on whose forbid ear the distant strain of triumph break agonize and clear', 'wild night wild night be i with thee wild night should be our luxury futile the wind to a heart in port do with the compass do with the chart rowing in eden ah the sea might i but moor to night in thee', 'i m nobody who be you be you nobody too then there s a pair of u don t tell they d banish u you know how dreary to be somebody how public like a frog to tell your name the livelong day to an admire bog', 'i felt a funeral in my brain and mourner to and fro keep tread tread till it seem that sense be break through and when they all be seat a service like a drum keep beat beat till i think my mind be go numb and then i hear them lift a box and creak across my soul with those same boot of lead again then 

In [10]:
woodchuck = 'How much wood would a woodchuck chuck if a woodchuck could chuck wood?'
vectorizer_dem = CountVectorizer()
term_freq_dem = vectorizer_dem.fit_transform([woodchuck])
print(term_freq_dem)

  (0, 2)	1
  (0, 4)	1
  (0, 5)	2
  (0, 7)	1
  (0, 6)	2
  (0, 0)	2
  (0, 3)	1
  (0, 1)	1
