In [1]:
import xmltodict
import json
import collections
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from collections import Counter

# Single WordNet lemmatizer instance; the noun-counting cell calls lmtzr.lemmatize()
lmtzr = WordNetLemmatizer()

# Read dreambank-public.xml and parse it with xmltodict; the result is
# indexed later as doc['dreambank']['collection'].
with open("dreambank-public.xml") as xml_file:
    doc = xmltodict.parse(xml_file.read())

def convert(data):
    """Recursively rebuild ``data`` out of plain built-in values.

    * strings are passed through ``str()``;
    * mappings (e.g. ``OrderedDict``) become plain ``dict`` objects with
      converted keys and values;
    * any other iterable is rebuilt as the same container type with every
      element converted;
    * everything else is returned unchanged.

    Runs on both Python 2 and Python 3.

    NOTE(review): on Python 2, ``str()`` of a ``unicode`` value containing
    non-ASCII characters raises ``UnicodeEncodeError``; that behavior is
    kept as-is so the return type stays a native ``str``.
    """
    # Function-scope imports keep this block self-contained. The ABCs live in
    # ``collections.abc`` on Python 3 (the ``collections.Mapping`` aliases were
    # removed in 3.10) and directly in ``collections`` on Python 2.
    try:
        from collections.abc import Iterable, Mapping
    except ImportError:
        from collections import Iterable, Mapping

    try:
        string_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        string_types = str  # Python 3

    # Strings are tested first because they are iterable as well.
    if isinstance(data, string_types):
        return str(data)
    if isinstance(data, Mapping):
        # convert() applied to each (key, value) pair returns a converted
        # 2-tuple, which dict() accepts directly.
        return dict(map(convert, data.items()))
    if isinstance(data, Iterable):
        return type(data)(map(convert, data))
    return data

def left(s, amount):
    """Return the slice of ``s`` from its start up to index ``amount``."""
    head = s[0:amount]
    return head

def right(s, amount):
    """Return the last ``amount`` items of ``s``.

    Args:
        s: Any sliceable sequence (string, list, ...).
        amount: Number of trailing items to keep. If it exceeds ``len(s)``
            the whole sequence is returned.

    Returns:
        A slice of ``s`` of the same type; empty when ``amount`` is 0.
    """
    if amount == 0:
        # ``s[-0:]`` is the same as ``s[0:]`` (the whole sequence), so a
        # request for zero items needs its own branch.
        return s[:0]
    return s[-amount:]

def mid(s, offset, amount):
    """Return the slice of ``s`` starting at ``offset`` and ending at ``offset + amount``."""
    stop = offset + amount
    return s[offset:stop]

In [2]:
for dreamer in doc['dreambank']['collection']:
    print dreamer['name'] + ' (' + str(len(dreamer['dream'][:])) + ' dreams)' 
    print '  ID: ' + dreamer['id']
    print '  type: ' + dreamer['type']
    print '  sex: ' + dreamer['sex']
    print '  age: ' + dreamer['age']
    
    try:
        print '  time: ' + dreamer['time']
    except:
        pass
    
    print '  sample dream: ' 
    # print '    ' + (json.dumps(dreamer['dream'][0], indent=4))
    
    odict = dreamer['dream'][0]
    for key, value in odict.items():
        if convert(key) == 'report':
            print '    report: ' + left(convert(value), 200) + '...'
            print '    lemmatized report: ',
        else:
            print '    ' + convert(key) + ': ' + str(convert(value))
        
    print '\n'

Alta: a detailed dreamer (422 dreams)
  ID: alta
  type: series
  sex: F
  age: A
  time: 1985-1997
  sample dream: 
    number: 1
    date: 1957
    report: The one at the Meads's house, where it's bigger inside than out; there's a European village just inside, with a cobblestone street and a Pied-Piper sort of man with curly hair, he can do things like j...
    lemmatized report:  

Angie: age 18 & 20 (48 dreams)
  ID: angie
  type: series
  sex: F
  age: Y
  time: 1996
  sample dream: 
    number: 1-01
    date: 1996-04-03
    report: My memory of this dream is vague. I think the setting is on a college campus. I'm in a cafe and two elderly ladies walk in and start talking to me about a university that a guy I am dating got into fo...
    lemmatized report:  

Arlie: a middle-aged woman (212 dreams)
  ID: arlie
  type: series
  sex: F
  age: A
  time: 1992-1998
  sample dream: 
    number: 1
    date: 10/14/92
    report: I am in an office in the town next to the town I grew up in. 

    lemmatized report:  

Miami Home-Lab: Home (171 dreams)
  ID: miami-home
  type: set
  sex: M
  age: Y
  time: 1963-1965
  sample dream: 
    number: Bart-H-1
    date: 12/19/63
    report: I dreamt I was at a party. Everyone was having a good time except the hostess. She hated her husband and wanted to kill him. I remember helping her look for something to throw at her husband. We were ...
    lemmatized report:  

Miami Home-Lab: Lab (274 dreams)
  ID: miami-lab
  type: set
  sex: M
  age: Y
  time: 1963-1965
  sample dream: 
    number: Bart-A-1
    date: 01/31/64
    report: I was going to Georgia and I remembered the tune, the song, "I'm going back to Georgia." I heard the last few days on the radio. I was humming the song to myself when I was going back. I was sitting i...
    lemmatized report:  

Melora (Melvin's wife) (211 dreams)
  ID: melora
  type: series
  sex: F
  age: Y
  time: 1962
  sample dream: 
    number: 001
    date: 06/19/62
    report: We were in a used car

In [3]:
print '---Dream collections from individuals---' + '\n'
MultIDs = ['b', 'madeline1-hs', 'madeline2-dorms', 'madeline3-offcampus', 'phil1', 'phil2', 'vietnam_vet']
NumberOfSeries = 0

for dreamer in doc['dreambank']['collection']:
    if dreamer['type'] == 'series':
        print '{' + dreamer['id']  + '} ' + dreamer['name'] + ' (' + str(len(dreamer['dream'][:])) + ' dreams)' + ' [' + dreamer['sex'] + ']'   
        
        if dreamer['id'] not in MultIDs:
            print '\n'
            NumberOfSeries += 1
        
print "Total Number of individuals to test vs. 'others': " + str(NumberOfSeries)

---Dream collections from individuals---

{alta} Alta: a detailed dreamer (422 dreams) [F]


{angie} Angie: age 18 & 20 (48 dreams) [F]


{arlie} Arlie: a middle-aged woman (212 dreams) [F]


{b} Barb Sanders (3116 dreams) [F]
{b2} Barb Sanders #2 (1138 dreams) [F]


{bosnak} Robert Bosnak: A dream analyst (53 dreams) [M]


{chris} Chris: a transvestite (100 dreams) [M]


{chuck} Chuck: a physical scientist (75 dreams) [M]


{dahlia} Dahlia: concerns with appearance (24 dreams) [F]


{david} David: teenage dreams (166 dreams) [M]


{dorothea} Dorothea: 53 years of dreams (900 dreams) [F]


{ed} Ed: dreams of his late wife (143 dreams) [M]


{edna} Edna: a blind woman (19 dreams) [F]


{elizabeth} Elizabeth: a woman in her 40s (1707 dreams) [F]


{emma} Emma: 48 years of dreams (1521 dreams) [F]


{emmas_husband} Emma's Husband (72 dreams) [M]


{esther} Esther: an adolescent girl (110 dreams) [F]


{izzy} Izzy (all) (4352 dreams) [F]


{jasmine} Jasmine (all) (664 dreams) [F]


{jeff} 

In [4]:
# reduce to noun-only and lemmatize
for dreamer in doc['dreambank']['collection']:
    print dreamer['name'] + ' (' + str(len(dreamer['dream'][:])) + ' dreams)' + ' [' + dreamer['sex'] + ']'
    
    print '  noun-only sample dream: ' 
    
    odict = dreamer['dream'][0]
    
    # (1) tokenize dream text
    # (2) tag with PoS
    # (3) reduce to noun-only
    # (4) lemmatize nouns
    # (5) get frequency counts of the lemmatized nouns
    for key, value in odict.items():
        if convert(key) == 'report':
            text = nltk.tokenize.word_tokenize(value)
            pos = nltk.pos_tag(text)
            noun_only = [w[0] for w in pos if w[1].startswith('N')]
            lmtz_noun_only = [lmtzr.lemmatize(word) for word in noun_only]
            counts = Counter(lmtz_noun_only)
            print counts
        
    print '\n'

Alta: a detailed dreamer (422 dreams) [F]
  noun-only sample dream: 
Counter({u'house': 3, u'hair': 2, u'[': 2, u'room': 2, u'bridge': 1, u'set': 1, u'creek': 1, u'people': 1, u'stove': 1, u'one': 1, u'street': 1, u'village': 1, u'corner': 1, u'blonde': 1, u'string': 1, u'pageboy': 1, u'balloon': 1, u'juggle': 1, u'couple': 1, u'sort': 1, u'hallway': 1, u'woman': 1, u'Meads': 1, u'Inside': 1, u'cobblestone': 1, u']': 1, u'man': 1, u'drive': 1, u'round': 1, u'thing': 1, u'aunt': 1, u'side': 1})


Angie: age 18 & 20 (48 dreams) [F]
  noun-only sample dream: 
Counter({u'dream': 3, u'school': 2, u'guy': 2, u'lady': 2, u'hospital': 1, u'information': 1, u'feeling': 1, u'art': 1, u'orientation': 1, u'university': 1, u'cafe': 1, u'setting': 1, u'college': 1, u'memory': 1, u'law': 1, u'campus': 1})


Arlie: a middle-aged woman (212 dreams) [F]
  noun-only sample dream: 
Counter({u'town': 2, u'toilet': 2, u'bathroom': 1, u'Everyone': 1, u'office': 1, u'rest': 1})


Barb Sanders (3116 dreams) [F

  noun-only sample dream: 
Counter({u'girl': 4, u'school': 2, u'sort': 1, u'love': 1, u'Just': 1, u'advantage': 1, u'spot': 1, u'event': 1, u'geek': 1, u'thing': 1, u'moment': 1, u'her': 1, u'opinion': 1, u'guy': 1, u'cafe': 1, u'dream': 1, u'nerd': 1})


Joan: a lesbian (42 dreams) [F]
  noun-only sample dream: 
Counter({u'group': 3, u'picture': 2, u'Eliza': 2, u'A': 1, u'building': 1, u'everyone': 1, u'stair': 1, u'people': 1, u'motion': 1, u'front': 1, u'campground': 1, u'Hurry': 1, u'seminar': 1})


Kenneth (2022 dreams) [M]
  noun-only sample dream: 
Counter({u'<': 4, u'>': 3, u'room': 3, u'Ned': 2, u'BR': 2, u'table': 2, u'Major': 1, u'Stallone': 1, u'/I': 1, u'Hall': 1, u'Dormitory': 1, u'guard': 1, u'corner': 1, u'cream': 1, u'enters': 1, u'bowl': 1, u'ice': 1, u'chunk': 1, u'friend': 1, u'Room': 1, u'sitting': 1, u'stop': 1, u'Disease': 1, u'fruit': 1, u'mouth': 1, u'television': 1, u'mask': 1, u'disease': 1, u'rectangular': 1, u'classmate': 1, u'security': 1, u'Kevin': 1})




Counter({u'=': 8, u'band': 4, u'Sam': 4, u'friend': 4, u'trail': 4, u'soup': 4, u'Kim': 4, u'picture': 4, u'Ahmad': 3, u'Ronny': 3, u'lot': 3, u'father': 2, u'everyone': 2, u'parking': 2, u'album': 2, u'stuff': 2, u'Eric': 2, u'K.': 2, u'size': 2, u'way': 2, u'Roger': 2, u'tree': 2, u'Crush': 2, u'i': 2, u'Street': 2, u'mother': 2, u'slope': 1, u'photo': 1, u'course': 1, u'bedroom': 1, u'chicken': 1, u'geography': 1, u'lb': 1, u'dad': 1, u'woman': 1, u'song': 1, u'break': 1, u'mom': 1, u'Hmmmm': 1, u'lady': 1, u'day': 1, u'meat': 1, u'brother': 1, u'race': 1, u'affect': 1, u'quarter': 1, u'clothes': 1, u'bannister': 1, u'street': 1, u'college': 1, u'girl': 1, u'closet': 1, u'caveman': 1, u'Nobody': 1, u'hill': 1, u'cause': 1, u'parent': 1, u'outfit': 1, u'path': 1, u'boy': 1, u'ol': 1, u'thread': 1, u'thing': 1, u'hiking': 1, u'river': 1, u'Hollywood': 1, u'interest': 1, u'area': 1, u'top': 1, u'church/bar': 1, u'Feels': 1, u'time': 1, u'curtain': 1, u'Onion': 1, u'hour': 1, u'bed': 1,