In [1]:
import parser
from analyse import *
from nltk.corpus import wordnet as wn

<h2>Cleaning input text</h2>

In [7]:
# Read the poem as a list of raw lines. The file is opened in binary mode, so
# each line is a byte string (curly apostrophes appear as '\xe2\x80\x99' in
# the printed output). The `with` block closes the handle as soon as reading
# is done instead of leaving it open for the life of the kernel.
with open('byron.txt', 'rb') as f:
    raw_lines = f.readlines()

print('The initial input text is:')
print(raw_lines)
print('\n\n\n')

# Per the original author's notes: split_text removes punctuation and turns the
# lines into a list of words, and remove_stopwords removes the most common words.
words = parser.split_text(raw_lines)
# `text` is the cleaned word list; the name is kept so code that expects this
# cell to define `text` keeps working.
text = parser.remove_stopwords(words)

print('After cleaning:')
print(text)

The initial input text is:
['She walks in beauty, like the night \n', 'Of cloudless climes and starry skies; \n', 'And all that\xe2\x80\x99s best of dark and bright \n', 'Meet in her aspect and her eyes; \n', 'Thus mellowed to that tender light \n', 'Which heaven to gaudy day denies. \n', '\n', 'One shade the more, one ray the less, \n', 'Had half impaired the nameless grace \n', 'Which waves in every raven tress, \n', 'Or softly lightens o\xe2\x80\x99er her face; \n', 'Where thoughts serenely sweet express, \n', 'How pure, how dear their dwelling-place. \n', '\n', 'And on that cheek, and o\xe2\x80\x99er that brow, \n', 'So soft, so calm, yet eloquent, \n', 'The smiles that win, the tints that glow, \n', 'But tell of days in goodness spent, \n', 'A mind at peace with all below, \n', 'A heart whose love is innocent!']




After cleaning:
['', 'WALKS', '', 'BEAUTY', '', '', 'NIGHT', '', 'CLOUDLESS', 'CLIMES', '', 'STARRY', 'SKIES', '', '', '', 'BEST', '', 'DARK', '', 'BRIGHT', 'MEET', ''

<h2>Forming synsets</h2>

In [28]:
# WordNet (through NLTK) is the database of words and lemmas used here.
# NOTE(review): the original note says the lists are stored in the author's
# home directory -- presumably the local NLTK data folder; confirm.
from nltk.corpus import wordnet as wn

word = 'DARK'
print('Lemmas for word ' + word + ":")
print(get_lemmaset(word))
print("\n\n")

# morphy() maps an inflected word to its base form; the lookup is done on the
# lower-cased word. Both sample words are run through the same two prints.
for word in ('DARKENED', 'DARKENS'):
    print('Morphy for word ' + word + ":")
    print(wn.morphy(word.lower()))

Lemmas for word DARK:
set([u'dour', u'morose', u'dark-skinned', u'saturnine', u'glum', u'benighted', u'wickedness', u'blue', u'sullen', u'darkness', u'disconsolate', u'black', u'sorry', u'dreary', u'coloured', u'dark', u'gloomy', u'dismal', u'drab', u'shadow', u'grim', u'moody', u'drear', u'colored', u'iniquity', u'sinister', u'nighttime', u'dingy', u'sour', u'night', u'non-white', u'obscure', u'glowering'])



Morphy for word DARKENED:
darken
Morphy for word DARKENS:
darken


In [18]:
# PyDictionary: described by the original author as a less sophisticated
# alternative to the WordNet synsets. `dic` is reused by later cells.
from PyDictionary import PyDictionary

dic = PyDictionary()

word = 'DARK'
synonyms = dic.synonym(word)

print("Synonyms for " + word + " from PyDictionary:")
print(synonyms)

Synonyms for DARK from PyDictionary:
[u'dim', u'misty', u'murky', u'shadowy', u'overcast']


In [23]:
# Show the combined synset for one word, using the PyDictionary instance `dic`
# created in an earlier cell.
word = 'DARKNESS'
print('Synset for word ' + word + ':')
print(get_synset(word, dic))
print('\n\n')

# NOTE(review): this header is printed but no recursive-synset call follows it
# in this cell -- TODO add the missing call or drop the header.
print('Recursive synset for word ' + word + ':')


Synset for word DARKNESS:
['darkness', u'dark', u'dim', u'misty', u'murky', u'shadowy', u'overcast', u'iniquity', u'heinousness', u'infamy', u'immorality', u'abomination', u'baseness']


<h3>Analysing a set</h3>

In [4]:
# Load the test text and group its words with mark_words in 'synset' mode.
# mark_words returns two values, unpacked here as `word_dic` and `marks`.
# NOTE(review): test_text and mark_words are not defined in this notebook --
# presumably they come from `from analyse import *`; confirm.
text = test_text()
word_dic, marks = mark_words(text, getset = 'synset')

In [7]:
# For every group in word_dic, print only the entries whose first character is
# upper-case; empty strings are skipped by the truthiness guard.
# NOTE(review): presumably the upper-case entries are the words from the input
# text (as opposed to looked-up synonyms) -- confirm against mark_words.
for key in word_dic:
    ls = [word for word in word_dic[key] if word and word[0].isupper()]
    print(ls)

['INNOCENT']
['BROW']
['ELOQUENT']
['WIN']
['PEACE']
['BEAUTY', 'ASPECT', 'SHADE', 'GRACE', 'FACE']
['WALKS']
['CLOUDLESS', 'SKIES', 'BRIGHT', 'MEET', 'LIGHT', 'THOUGHTS', 'DAYS']
['NIGHT', 'DARK', 'TINTS']
['STARRY']
['CLIMES']
['EYES', 'MIND', 'HEART']
['BEST', 'CALM', 'GOODNESS', 'LOVE']
['RAY', 'LIGHTENS', 'SMILES', 'GLOW']
['MELLOWED', 'TENDER', 'SWEET', 'SOFT']
['IMPAIRED']
['HALF', 'RAVEN']
['WAVES']
['NAMELESS']
['SOFTLY']
['TRESS']
['SERENELY']
['OER']
['CHEEK']
['EXPRESS', 'SPENT']
[]


In [8]:
def main_key(word_dic, marks, text):
    """Pair each item of `text` with the head entry of its marked group.

    For every mark, the first element of ``word_dic[str(mark)]`` is looked up
    (all lookups happen up front, before any pairing). The resulting heads are
    zipped with `text`, so the result is truncated to the shorter of the two.

    Args:
        word_dic: mapping from ``str(mark)`` to a non-empty, indexable group.
        marks: iterable of marks; each is converted with ``str`` for lookup.
        text: iterable of items to pair with the group heads, position by
            position.

    Returns:
        A list of ``(head, item)`` tuples, omitting pairs whose head is the
        empty string.

    Raises:
        KeyError: if a mark has no entry in `word_dic`.
        IndexError: if a marked group is empty.
    """
    heads = [word_dic[str(mark)][0] for mark in marks]

    pairs = []
    for head, item in zip(heads, text):
        if head != '':
            pairs.append((head, item))
    # TODO (original author's note): "make it so that it counts the number of time"
    return pairs

In [9]:
# Rebuild the grouping from the test text, this time in 'lemmas' mode, and
# print the filtered contents of every group.
text = test_text()
word_dic, marks = mark_words(text, getset='lemmas')

for key in word_dic:
    # The test is on the SECOND character (entries shorter than two characters
    # are dropped), unlike the synset cell, which tests the first character.
    # NOTE(review): presumably this keeps all-caps input words while skipping
    # capitalised lemmas -- confirm the intent.
    ls = [word for word in word_dic[key] if len(word) > 1 and word[1].isupper()]
    print(ls)

['LIGHTENS']
['OER']
['SERENELY']
['SWEET']
['WAVES']
['RAVEN']
['TRESS']
['SOFTLY', 'SOFT']
['EXPRESS']
['BROW']
['BEAUTY']
['WALKS', 'SPENT']
['CLOUDLESS', 'LIGHT']
['NIGHT', 'DARK']
['STARRY']
['CLIMES']
['BEST', 'GOODNESS', 'LOVE']
['SKIES']
['MEET', 'TINTS']
['BRIGHT']
['EYES', 'HEART']
['ASPECT', 'FACE', 'THOUGHTS', 'CHEEK']
['TENDER']
['MELLOWED']
['RAY', 'GLOW']
['SHADE']
['IMPAIRED']
['HALF']
['GRACE']
['NAMELESS']
['ELOQUENT']
['CALM']
['INNOCENT']
['PEACE']
['MIND']
['DAYS']
['WIN']
[]
['SMILES']


<h3>Path similarity</h3>

In [None]:
# Sample word group, presumably for the path-similarity experiment under this
# heading. The words match one of the groups printed by the synset-based
# grouping cell earlier in the notebook.
test_group = ['CLOUDLESS', 'SKIES', 'BRIGHT', 'MEET', 'LIGHT', 'THOUGHTS', 'DAYS']