In [1]:
import xmltodict
import json
import collections
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

# One shared WordNet lemmatizer instance, reused by the noun-lemmatizing cell below.
lmtzr = WordNetLemmatizer()

# Read the DreamBank XML export in one go and parse it into nested,
# dict-like objects; everything below navigates `doc`.
with open("dreambank-public.xml") as xml_file:
    doc = xmltodict.parse(xml_file.read())

try:
    # Python 3.3+: the ABCs live in collections.abc (the aliases that used to
    # be exposed on `collections` itself were removed in Python 3.10).
    from collections.abc import Iterable as _Iterable, Mapping as _Mapping
except ImportError:  # Python 2
    from collections import Iterable as _Iterable, Mapping as _Mapping

try:
    _STRING_TYPES = (basestring,)  # Python 2: covers both str and unicode
except NameError:
    _STRING_TYPES = (str,)  # Python 3: str is the only text type


def convert(data):
    """Recursively convert text and mappings inside `data` to plain built-ins.

    * Text becomes the interpreter's native ``str``.
    * Any mapping (e.g. an ``OrderedDict``) becomes a plain ``dict`` whose
      keys and values have been converted as well.
    * Any other iterable is rebuilt as the same type from its converted items.
    * Everything else (numbers, ``None``, ...) is returned unchanged.

    Args:
        data: The object to convert.

    Returns:
        The converted copy of `data`.

    Raises:
        TypeError: If an iterable's type cannot be constructed from a
            generator of its items.
    """
    if isinstance(data, _STRING_TYPES):
        if str is bytes and not isinstance(data, str):
            # Python 2 `unicode`: str() would go through the ASCII codec and
            # raise UnicodeEncodeError on any non-ASCII character, so encode
            # explicitly. For pure-ASCII text the result equals str(data).
            return data.encode('utf-8')
        return str(data)

    if isinstance(data, _Mapping):
        return dict((convert(key), convert(value)) for key, value in data.items())

    if isinstance(data, _Iterable):
        return type(data)(convert(item) for item in data)

    return data

def left(s, amount):
    """Return the leading part of `s`, i.e. ``s[:amount]``.

    Args:
        s: Any sliceable sequence (string, list, ...).
        amount: Slice end; standard Python slicing rules apply, so a value
            larger than ``len(s)`` returns all of `s`.

    Returns:
        A slice of the same type as `s`.
    """
    head = slice(None, amount)
    return s[head]

def right(s, amount):
    """Return the trailing `amount` items of `s`.

    Args:
        s: Any sliceable sequence (string, list, ...).
        amount: Number of trailing items to keep. A value larger than
            ``len(s)`` returns all of `s`; ``0`` returns an empty sequence.

    Returns:
        A slice of the same type as `s`.
    """
    if amount == 0:
        # s[-0:] is the same as s[0:], which would return the WHOLE sequence
        # instead of nothing, so the zero case needs its own branch.
        return s[:0]
    return s[-amount:]

def mid(s, offset, amount):
    """Return the slice of `s` that starts at `offset` and spans `amount` items.

    Args:
        s: Any sliceable sequence (string, list, ...).
        offset: Index of the first item to include.
        amount: Added to `offset` to obtain the slice end.

    Returns:
        ``s[offset:offset + amount]``, of the same type as `s`.
    """
    stop = offset + amount
    return s[offset:stop]

In [2]:
for dreamer in doc['dreambank']['collection']:
    print dreamer['name'] + ' (' + str(len(dreamer['dream'][:])) + ' dreams)' 
    print '  ID: ' + dreamer['id']
    print '  type: ' + dreamer['type']
    print '  sex: ' + dreamer['sex']
    print '  age: ' + dreamer['age']
    
    try:
        print '  time: ' + dreamer['time']
    except:
        pass
    
    print '  sample dream: ' 
    # print '    ' + (json.dumps(dreamer['dream'][0], indent=4))
    
    odict = dreamer['dream'][0]
    for key, value in odict.items():
        if convert(key) == 'report':
            print '    report: ' + left(convert(value), 200) + '...'
            print '    lemmatized report: ',
        else:
            print '    ' + convert(key) + ': ' + str(convert(value))
        
    print '\n'

Alta: a detailed dreamer (422 dreams)
  ID: alta
  type: series
  sex: F
  age: A
  time: 1985-1997
  sample dream: 
    number: 1
    date: 1957
    report: The one at the Meads's house, where it's bigger inside than out; there's a European village just inside, with a cobblestone street and a Pied-Piper sort of man with curly hair, he can do things like j...
    lemmatized report:  

Angie: age 18 & 20 (48 dreams)
  ID: angie
  type: series
  sex: F
  age: Y
  time: 1996
  sample dream: 
    number: 1-01
    date: 1996-04-03
    report: My memory of this dream is vague. I think the setting is on a college campus. I'm in a cafe and two elderly ladies walk in and start talking to me about a university that a guy I am dating got into fo...
    lemmatized report:  

Arlie: a middle-aged woman (212 dreams)
  ID: arlie
  type: series
  sex: F
  age: A
  time: 1992-1998
  sample dream: 
    number: 1
    date: 10/14/92
    report: I am in an office in the town next to the town I grew up in. 

In [3]:
print '---Dream collections from individuals---' + '\n'
MultIDs = ['b', 'madeline1-hs', 'madeline2-dorms', 'madeline3-offcampus', 'phil1', 'phil2', 'vietnam_vet']
NumberOfSeries = 0

for dreamer in doc['dreambank']['collection']:
    if dreamer['type'] == 'series':
        print '{' + dreamer['id']  + '} ' + dreamer['name'] + ' (' + str(len(dreamer['dream'][:])) + ' dreams)' + ' [' + dreamer['sex'] + ']'   
        
        if dreamer['id'] not in MultIDs:
            print '\n'
            NumberOfSeries += 1
        
print "Total Number of individuals to test vs. 'others': " + str(NumberOfSeries)

---Dream collections from individuals---

{alta} Alta: a detailed dreamer (422 dreams) [F]


{angie} Angie: age 18 & 20 (48 dreams) [F]


{arlie} Arlie: a middle-aged woman (212 dreams) [F]


{b} Barb Sanders (3116 dreams) [F]
{b2} Barb Sanders #2 (1138 dreams) [F]


{bosnak} Robert Bosnak: A dream analyst (53 dreams) [M]


{chris} Chris: a transvestite (100 dreams) [M]


{chuck} Chuck: a physical scientist (75 dreams) [M]


{dahlia} Dahlia: concerns with appearance (24 dreams) [F]


{david} David: teenage dreams (166 dreams) [M]


{dorothea} Dorothea: 53 years of dreams (900 dreams) [F]


{ed} Ed: dreams of his late wife (143 dreams) [M]


{edna} Edna: a blind woman (19 dreams) [F]


{elizabeth} Elizabeth: a woman in her 40s (1707 dreams) [F]


{emma} Emma: 48 years of dreams (1521 dreams) [F]


{emmas_husband} Emma's Husband (72 dreams) [M]


{esther} Esther: an adolescent girl (110 dreams) [F]


{izzy} Izzy (all) (4352 dreams) [F]


{jasmine} Jasmine (all) (664 dreams) [F]


{jeff} 

In [4]:
# reduce to noun-only and lemmatize
for dreamer in doc['dreambank']['collection']:
    print dreamer['name'] + ' (' + str(len(dreamer['dream'][:])) + ' dreams)' + ' [' + dreamer['sex'] + ']'
    
    print '  noun-only sample dream: ' 
    
    odict = dreamer['dream'][0]
    
    # (1) tokenize dream text
    # (2) tag with PoS
    # (3) reduce to noun-only
    # (4) lemmatize nouns
    for key, value in odict.items():
        if convert(key) == 'report':
            text = nltk.tokenize.word_tokenize(value)
            pos = nltk.pos_tag(text)
            noun_only = [w[0] for w in pos if w[1].startswith('N')]
            lmtz_noun_only = [lmtzr.lemmatize(word) for word in noun_only]
            print lmtz_noun_only
        
    print '\n'

Alta: a detailed dreamer (422 dreams) [F]
  noun-only sample dream: 
[u'one', u'Meads', u'house', u'village', u'cobblestone', u'street', u'sort', u'man', u'hair', u'thing', u'juggle', u'[', u'house', u']', u'side', u'[', u'set', u'hallway', u'corner', u'room', u'woman', u'blonde', u'hair', u'pageboy', u'stove', u'room', u'aunt', u'round', u'drive', u'bridge', u'creek', u'house', u'Inside', u'couple', u'people', u'string', u'balloon']


Angie: age 18 & 20 (48 dreams) [F]
  noun-only sample dream: 
[u'memory', u'dream', u'setting', u'college', u'campus', u'lady', u'university', u'guy', u'law', u'school', u'information', u'school', u'feeling', u'lady', u'art', u'orientation', u'dream', u'hospital', u'cafe', u'guy', u'dream']


Arlie: a middle-aged woman (212 dreams) [F]
  noun-only sample dream: 
[u'office', u'town', u'town', u'Everyone', u'rest', u'bathroom', u'toilet', u'toilet']


Barb Sanders (3116 dreams) [F]
  noun-only sample dream: 
[u'dream', u'Blake', u'Reta', u'Bill', u'E']


B

In [None]:
#==========================================================================================
# TD-IDF Vectorizer (term frequency-inverse document frequency)
#------------------------
# Code Block Purpose: Process the data to turn raw data into a bag of words
# Methods Used:       TDIDF
#                     Tfidf is numerical statistc that is inteded to reflect on 
#                     how important a word is for acollection of words in adocument. 
#==========================================================================================
y_train       = train_labels
  
# Create a Tfidf Vectorizer
vectorizer    = TfidfVectorizer(stop_words='english')
X_train       = vectorizer.fit_transform(train_data)
feature_names = vectorizer.get_feature_names()
    
# Train the model with a Logistical Regression
logit_model = LogisticRegression(C = 100, penalty='l2')
logit_model.fit(X_train, y_train)
    
# Transform the dev data, and do a prediction
X_dev_data  = vectorizer.transform(dev_data)
Z           = logit_model.predict(X_dev_data)
Z_prob      = logit_model.predict_proba(X_dev_data)
    
# Compute the R values
max_Z_prob   = np.max(Z_prob)
R_doc_values = np.zeros((len(Z),2))
for i in range(0,len(Z)):
    index = int(dev_labels[i])
    R_doc_values[i,0] = i
    R_doc_values[i,1] = max_Z_prob / Z_prob[i, int(dev_labels[i])]

# sort the matrix        
col_sort     = 1
R_doc_values = R_doc_values[np.argsort(R_doc_values[:,col_sort])]
    
# Print out Results
for i in range(1, num_top_values+1):
    index = int(-i)
    #print "index", index
    index2 = int(R_doc_values[index,0])
    #print "index2", R_doc_values[index,:]
    print "\n******************************Example {}******************************".format(i)
    print "\nFor R value = {}".format(R_doc_values[index,1])
    print "Probablity of category guessed:\t{0:.3e}".format( Z_prob[index2,Z[index2]])
    print "Probablity of actual category:\t{0:.3e} ".format(Z_prob[index2,dev_labels[index2]])
    print "\tActual Category:\t", categories[dev_labels[index2]]
    print "\tPredicted Category:\t", categories[Z[index2]]
    print "\nDocument text:\n\t", dev_data[index2]
        
DEBUG_VERBOSITY_HIGH = False
if DEBUG_VERBOSITY_HIGH:
    print "\n****DEBUG_VERBOSTIY_HIGH Print Statements****"
    print "X_train shape:{0}, y_train shape{1}".format(X_train.shape, y_train.shape)
    print "X_dev shape:{0}, dev_labels shape{1}".format(X_dev_data.shape, dev_labels.shape)
    print "Z shape", Z.shape
    print "Zprob shape", Z_prob.shape
    print "Z max", max_Z_prob
    print "Shape of R doc values", R_doc_values.shape
    print "Max of R_doc_value\n", R_doc_values[-num_top_values:len(R_doc_values),:]
      
    