In [1]:
import xmltodict
import json
import collections
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

# Instantiate the WordNet lemmatizer once; the cells below reuse this object.
lmtzr = WordNetLemmatizer()

# Read the corpus as raw bytes so the XML parser decodes it according to the
# encoding declared in the document, with no platform newline/EOF translation.
with open("dreambank-public.xml", "rb") as f:
    doc = xmltodict.parse(f.read())

def convert(data):
    """Recursively convert parsed data into plain built-in Python types.

    - Text becomes the native ``str`` type.  On Python 2 a ``unicode`` value is
      encoded as UTF-8 instead of going through ``str()``, which would raise
      ``UnicodeEncodeError`` on any non-ASCII character.
    - Any mapping (e.g. ``OrderedDict``) becomes a plain ``dict`` whose keys
      and values are converted recursively.
    - Any other iterable is rebuilt as its own type with converted items; when
      the type cannot be constructed from a list of items (iterators,
      generators, ...) a plain ``list`` is returned instead.
    - Everything else is returned unchanged.
    """
    # Resolve version-specific names locally so the function runs unchanged on
    # both Python 2 and Python 3.
    try:
        from collections import abc as abcs   # Python 3
    except ImportError:
        abcs = collections                    # Python 2 keeps the ABCs here
    try:
        text_types = basestring               # Python 2: str and unicode
    except NameError:
        text_types = str                      # Python 3

    if isinstance(data, text_types):
        if str is bytes and not isinstance(data, str):
            # Python 2 ``unicode``: encode explicitly so non-ASCII text works.
            return data.encode('utf-8')
        return str(data)

    if isinstance(data, abcs.Mapping):
        return dict((convert(key), convert(value))
                    for key, value in data.items())

    if isinstance(data, abcs.Iterable):
        items = [convert(item) for item in data]
        try:
            return type(data)(items)
        except TypeError:
            # The container type cannot be rebuilt from a list of items.
            return items

    return data

def left(s, amount):
    """Return the first `amount` items of the sequence `s`."""
    head = slice(None, amount)
    return s[head]

def right(s, amount):
    """Return the last `amount` items of the sequence `s`.

    An `amount` of 0 yields an empty sequence of the same type as `s`.  It is
    handled explicitly because ``s[-0:]`` is ``s[0:]``, i.e. the whole
    sequence.  All other values behave exactly like ``s[-amount:]``.
    """
    if amount == 0:
        return s[:0]
    return s[-amount:]

def mid(s, offset, amount):
    """Return `amount` items of the sequence `s`, starting at index `offset`."""
    window = slice(offset, offset + amount)
    return s[window]

# Shortens a long POS tag to the one-letter POS code
# that is passed to the WordNet lemmatizer.
def get_short_pos(long_pos):
    """Return 'a', 'v', 'n' or 'r' depending on the first letter of `long_pos`.

    Tags starting with J, V, N and R map to adjective, verb, noun and adverb
    respectively.  Every other tag falls back to 'n' (noun) for now; this
    fallback can be updated/fixed later.
    """
    prefix_to_short = (
        ('J', 'a'),   # adjective
        ('V', 'v'),   # verb
        ('N', 'n'),   # noun
        ('R', 'r'),   # adverb
    )
    for prefix, short_pos in prefix_to_short:
        if long_pos.startswith(prefix):
            return short_pos
    return 'n'


In [2]:
for dreamer in doc['dreambank']['collection']:
    print dreamer['name'] + ' (' + str(len(dreamer['dream'][:])) + ' dreams)' 
    print '  ID: ' + dreamer['id']
    print '  type: ' + dreamer['type']
    print '  sex: ' + dreamer['sex']
    print '  age: ' + dreamer['age']
    
    try:
        print '  time: ' + dreamer['time']
    except:
        pass
    
    print '  sample dream: ' 
    # print '    ' + (json.dumps(dreamer['dream'][0], indent=4))
    
    odict = dreamer['dream'][0]
    for key, value in odict.items():
        if convert(key) == 'report':
            print '    report: ' + left(convert(value), 200) + '...'
            print '    lemmatized report: ',
            for word in filter(None, left(convert(value), 200).split(" ")):
                token = nltk.tokenize.word_tokenize(word)
                long_pos = nltk.pos_tag(token)
                short_pos = get_short_pos(long_pos[0][1])
                print lmtzr.lemmatize(word, pos=short_pos),
        else:
            print '    ' + convert(key) + ': ' + str(convert(value))
        
    print '\n'

Alta: a detailed dreamer (422 dreams)
  ID: alta
  type: series
  sex: F
  age: A
  time: 1985-1997
  sample dream: 
    number: 1
    date: 1957
    report: The one at the Meads's house, where it's bigger inside than out; there's a European village just inside, with a cobblestone street and a Pied-Piper sort of man with curly hair, he can do things like j...
    lemmatized report:  The one at the Meads's house, where it's big inside than out; there's a European village just inside, with a cobblestone street and a Pied-Piper sort of man with curly hair, he can do thing like j 

Angie: age 18 & 20 (48 dreams)
  ID: angie
  type: series
  sex: F
  age: Y
  time: 1996
  sample dream: 
    number: 1-01
    date: 1996-04-03
    report: My memory of this dream is vague. I think the setting is on a college campus. I'm in a cafe and two elderly ladies walk in and start talking to me about a university that a guy I am dating got into fo...
    lemmatized report:  My memory of this dream be vagu

    lemmatized report:  I remember mainly get out of bed to go to my mother's room. The first time I be distract by a meteorite see through the window I passed, and so I return to bed. 

Ed: dreams of his late wife (143 dreams)
  ID: ed
  type: series
  sex: M
  age: A
  time: 1980-2002
  sample dream: 
    number: 001
    date: 07/??/80
    codings: {'char': '1FWA', 'fri': [{'rec': 'D', 'init': '1FWA', 'code': '2>'}, {'rec': '1FWA', 'init': 'D', 'code': '1>'}]}
    report: I see Mary in profile. She looks as lovely as she had before the illness, and when she was younger. At one point in the dream I also see a profile view of her when she and Maria press their cheeks tog...
    lemmatized report:  I see Mary in profile. She look a lovely a she have before the illness, and when she be younger. At one point in the dream I also see a profile view of her when she and Maria press their cheek tog 

Edna: a blind woman (19 dreams)
  ID: edna
  type: series
  sex: F
  age: Y
  time: 1948-1949


    lemmatized report:  There was-a white, nylon windbreaker that be hang on a chair, almost a if on display. It be of an old style, maybe from the 1970s or 8Os and of a generic cheap quality. I note that I like it a 

Mark: a young boy (23 dreams)
  ID: mark
  type: series
  sex: M
  age: C
  time: 1997-1999
  sample dream: 
    number: 01
    date: October, 1997
    report: I was at the park. I had these little cars in my hand, I was riding a bicycle. The I crashed into a car. The cars in my hand got bigger and I got in them. This little girl had some in her hand and wan...
    lemmatized report:  I be at the park. I have these little car in my hand, I be rid a bicycle. The I crashed into a car. The car in my hand get big and I get in them. This little girl have some in her hand and wan 

Melissa: a young girl (89 dreams)
  ID: melissa
  type: series
  sex: F
  age: C
  time: 1998-2000
  sample dream: 
    number: 1-01
    date: 1998-03-25
    report: I dreamed that a tiger named Shi

    lemmatized report:  Uncle Albert be get ready to leave for a vacation and be take me along. I don't want to go, and I tell Uncle Albert that someone be come by to get me at 11:00 a.m., but it make no difference 

Phil 2: late 20s (220 dreams)
  ID: phil2
  type: series
  sex: M
  age: A
  time: 1971
  sample dream: 
    number: 2-001
    date: 1971-01-01
    report: I was speaking to Kathy Reynault at night in front of her house (but not her real house).  Her mother was nearby, and I either realized she knew who I was or Kathy actually introduced us, and she didn...
    lemmatized report:  I be speak to Kathy Reynault at night in front of her house (but not her real house). Her mother be nearby, and I either realize she knew who I be or Kathy actually introduce us, and she didn 

Phil 3: retirement (180 dreams)
  ID: phil3
  type: series
  sex: M
  age: A
  time: 2004
  sample dream: 
    number: 3-001
    date: 2004-01-08
    report: I am at some kind of resort with a few people I

    lemmatized report:  I'm in the Army, I have be to war, and now be in a large military warehouse. I'm climb up a huge pile of box that be packed with thousand of small black toy trains. An MP come by and harasse 



In [3]:
# some simple examples to demonstrate lemmatizer
# can be deleted
print lmtzr.lemmatize('cars')
print lmtzr.lemmatize('feet')
print lmtzr.lemmatize('people')
print lmtzr.lemmatize('dating')
print lmtzr.lemmatize('dating', pos = 'v')  # v for verb

car
foot
people
dating
date
