In [1]:
import numpy as np
import pandas as pd
from __future__ import division
from random import shuffle
import xmltodict
import json
import collections
from collections import Counter

import nltk
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import *
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix

from gensim.models import LdaModel
from gensim.corpora import Dictionary

# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

# nltk.download('wordnet')      # uncomment on first run to fetch the WordNet corpus

# Read the DreamBank XML export in full and parse it into nested dict-like objects.
with open("dreambank-public.xml") as xml_file:
    doc = xmltodict.parse(xml_file.read())

def convert(data):
    """Recursively convert parsed XML data into plain built-in Python objects.

    Strings are passed through ``str``; mappings (e.g. the ordered dicts built
    by the XML parser) become plain ``dict`` objects with converted keys and
    values; any other iterable is rebuilt as the same container type with
    converted items; everything else is returned unchanged.

    Works on both Python 2 and Python 3: the original relied on
    ``basestring``, ``dict.iteritems`` and the ABC aliases in ``collections``,
    none of which exist on current Python 3 releases.
    """
    try:
        string_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        string_types = str  # Python 3
    try:
        from collections.abc import Mapping, Iterable  # Python 3.3+
    except ImportError:
        from collections import Mapping, Iterable  # Python 2

    # Strings must be tested first: they are iterable too and would otherwise
    # be taken apart character by character.
    if isinstance(data, string_types):
        return str(data)
    if isinstance(data, Mapping):
        # Each (key, value) pair is itself converted (as a tuple) before
        # being handed to dict().
        return dict(map(convert, data.items()))
    if isinstance(data, Iterable):
        return type(data)(map(convert, data))
    return data

def left(s, amount):
    """Return the leading ``amount`` items of ``s`` (ordinary slice semantics)."""
    head = slice(None, amount)
    return s[head]

def right(s, amount):
    """Return the trailing ``amount`` items of ``s``.

    ``amount == 0`` yields an empty sequence of the same type as ``s``.
    Without the special case, ``s[-0:]`` is ``s[0:]`` and the *whole*
    sequence would be returned instead of nothing.
    """
    if amount == 0:
        return s[:0]
    return s[-amount:]

def mid(s, offset, amount):
    """Return up to ``amount`` items of ``s`` starting at index ``offset``."""
    stop = offset + amount
    return s[offset:stop]

# Exploratory Analysis

First we want to print the data to see which fields we are given and how the data looks.

In [2]:
# Print the metadata of every collection followed by a preview of its first dream.
for dreamer in doc['dreambank']['collection']:
    print(dreamer['name'] + ' (' + str(len(dreamer['dream'])) + ' dreams)')
    print('  ID: ' + dreamer['id'])
    print('  type: ' + dreamer['type'])
    print('  sex: ' + dreamer['sex'])
    print('  age: ' + dreamer['age'])

    # 'time' is optional. Check for it explicitly instead of wrapping the
    # print in a bare `except: pass`, which would also hide unrelated errors.
    time_span = dreamer.get('time')
    if time_span is not None:
        print('  time: ' + time_span)

    print('  sample dream: ')

    # Show every field of the first dream; the report text is cut to its
    # first 200 characters so the listing stays readable.
    odict = dreamer['dream'][0]
    for key, value in odict.items():
        if convert(key) == 'report':
            print('    report: ' + left(convert(value), 200) + '...')
        else:
            print('    ' + convert(key) + ': ' + str(convert(value)))

    print('\n')

Alta: a detailed dreamer (422 dreams)
  ID: alta
  type: series
  sex: F
  age: A
  time: 1985-1997
  sample dream: 
    number: 1
    date: 1957
    report: The one at the Meads's house, where it's bigger inside than out; there's a European village just inside, with a cobblestone street and a Pied-Piper sort of man with curly hair, he can do things like j...


Angie: age 18 & 20 (48 dreams)
  ID: angie
  type: series
  sex: F
  age: Y
  time: 1996
  sample dream: 
    number: 1-01
    date: 1996-04-03
    report: My memory of this dream is vague. I think the setting is on a college campus. I'm in a cafe and two elderly ladies walk in and start talking to me about a university that a guy I am dating got into fo...


Arlie: a middle-aged woman (212 dreams)
  ID: arlie
  type: series
  sex: F
  age: A
  time: 1992-1998
  sample dream: 
    number: 1
    date: 10/14/92
    report: I am in an office in the town next to the town I grew up in. Everyone is taking a rest. I have to go to the b

In [3]:
print('---Dream collections from individuals---' + '\n')

# Series IDs after which the counter is NOT advanced, so the series that
# follows one of these receives the same w266ID (e.g. 'b' and 'b2' share an ID).
MultIDs = ['b', 'madeline1-hs', 'madeline2-dorms', 'madeline3-offcampus', 'phil1', 'phil2', 'vietnam_vet']
NumberOfSeries = 1

for dreamer in doc['dreambank']['collection']:
    # Collections whose type is not 'series' all share the catch-all ID 0.
    if dreamer['type'] != 'series':
        dreamer['w266ID'] = 0
        continue

    header = ''.join([
        '{', dreamer['id'], '} ', dreamer['name'],
        ' (', str(len(dreamer['dream'][:])), ' dreams)',
        ' [', dreamer['sex'], ']',
    ])
    print(header)

    # Store the grouping ID on the parsed record itself so later cells can read it.
    dreamer['w266ID'] = NumberOfSeries
    print('w266 ID: ' + str(dreamer['w266ID']))

    if dreamer['id'] not in MultIDs:
        print('\n')
        NumberOfSeries += 1

print("Total Number of individuals to test vs. 'others': " + str(NumberOfSeries - 1))

---Dream collections from individuals---

{alta} Alta: a detailed dreamer (422 dreams) [F]
w266 ID: 1


{angie} Angie: age 18 & 20 (48 dreams) [F]
w266 ID: 2


{arlie} Arlie: a middle-aged woman (212 dreams) [F]
w266 ID: 3


{b} Barb Sanders (3116 dreams) [F]
w266 ID: 4
{b2} Barb Sanders #2 (1138 dreams) [F]
w266 ID: 4


{bosnak} Robert Bosnak: A dream analyst (53 dreams) [M]
w266 ID: 5


{chris} Chris: a transvestite (100 dreams) [M]
w266 ID: 6


{chuck} Chuck: a physical scientist (75 dreams) [M]
w266 ID: 7


{dahlia} Dahlia: concerns with appearance (24 dreams) [F]
w266 ID: 8


{david} David: teenage dreams (166 dreams) [M]
w266 ID: 9


{dorothea} Dorothea: 53 years of dreams (900 dreams) [F]
w266 ID: 10


{ed} Ed: dreams of his late wife (143 dreams) [M]
w266 ID: 11


{edna} Edna: a blind woman (19 dreams) [F]
w266 ID: 12


{elizabeth} Elizabeth: a woman in her 40s (1707 dreams) [F]
w266 ID: 13


{emma} Emma: 48 years of dreams (1521 dreams) [F]
w266 ID: 14


{emmas_husband} Emma's

In [4]:
# Count one dream for every 'report' field found across all collections.
DreamNum = sum(
    1
    for dreamer in doc['dreambank']['collection']
    for odict in dreamer['dream']
    for key, _value in odict.items()
    if convert(key) == 'report'
)

print("Total Dreams: " + str(DreamNum))

Total Dreams: 26000


# EDA - Reduce To Noun-Only And Lemmatize

For our topic modeling, we will want to lemmatize the corpus and reduce to nouns-only. However, before we get to topic modeling, it will be helpful to test out the lemmatization and noun-only reduction techniques.

In [5]:
# For the first dream of every collection: tokenize, PoS-tag, keep the nouns,
# lemmatize them and print how often each lemma occurs.

lmtzr = WordNetLemmatizer()

for dreamer in doc['dreambank']['collection']:
    print(dreamer['name'] + ' (' + str(len(dreamer['dream'][:])) + ' dreams)' + ' [' + dreamer['sex'] + ']')
    print('  noun-only sample dream: ')

    odict = dreamer['dream'][0]

    for key, value in odict.items():
        # Only the report text is analysed; the other fields are metadata.
        if convert(key) != 'report':
            continue

        tagged = nltk.pos_tag(nltk.tokenize.word_tokenize(value))
        # Noun tags all start with 'N'; every other tag is dropped.
        counts = Counter(
            lmtzr.lemmatize(token)
            for token, tag in tagged
            if tag.startswith('N')
        )
        print(counts)

    print('\n')

Alta: a detailed dreamer (422 dreams) [F]
  noun-only sample dream: 
Counter({u'house': 3, u'hair': 2, u'[': 2, u'room': 2, u'bridge': 1, u'set': 1, u'creek': 1, u'people': 1, u'stove': 1, u'one': 1, u'street': 1, u'village': 1, u'corner': 1, u'blonde': 1, u'string': 1, u'pageboy': 1, u'balloon': 1, u'juggle': 1, u'couple': 1, u'sort': 1, u'hallway': 1, u'woman': 1, u'Meads': 1, u'Inside': 1, u'cobblestone': 1, u']': 1, u'man': 1, u'drive': 1, u'round': 1, u'thing': 1, u'aunt': 1, u'side': 1})


Angie: age 18 & 20 (48 dreams) [F]
  noun-only sample dream: 
Counter({u'dream': 3, u'school': 2, u'guy': 2, u'lady': 2, u'hospital': 1, u'information': 1, u'feeling': 1, u'art': 1, u'orientation': 1, u'university': 1, u'cafe': 1, u'setting': 1, u'college': 1, u'memory': 1, u'law': 1, u'campus': 1})


Arlie: a middle-aged woman (212 dreams) [F]
  noun-only sample dream: 
Counter({u'town': 2, u'toilet': 2, u'bathroom': 1, u'Everyone': 1, u'office': 1, u'rest': 1})


Barb Sanders (3116 dreams) [F


Kenneth (2022 dreams) [M]
  noun-only sample dream: 
Counter({u'<': 4, u'>': 3, u'room': 3, u'Ned': 2, u'BR': 2, u'table': 2, u'Major': 1, u'Stallone': 1, u'/I': 1, u'Hall': 1, u'Dormitory': 1, u'guard': 1, u'corner': 1, u'cream': 1, u'enters': 1, u'bowl': 1, u'ice': 1, u'chunk': 1, u'friend': 1, u'Room': 1, u'sitting': 1, u'stop': 1, u'Disease': 1, u'fruit': 1, u'mouth': 1, u'television': 1, u'mask': 1, u'disease': 1, u'rectangular': 1, u'classmate': 1, u'security': 1, u'Kevin': 1})


Madeline 1: High School (98 dreams) [F]
  noun-only sample dream: 
Counter({u'body': 2, u'area': 2, u'river': 2, u'corps': 1, u'father': 1, u'wave': 1, u'reason': 1, u'struggle': 1, u'way': 1})


Madeline 2: College Dorms (186 dreams) [F]
  noun-only sample dream: 
Counter({u'seven': 1, u'hug': 1, u'reference': 1, u'Gerald': 1, u'Romans': 1, u'thing': 1, u'Grandpa': 1, u'dream': 1})


Madeline 3: Off-Campus (348 dreams) [F]
  noun-only sample dream: 
Counter({u'Grandma': 2, u'Jane': 2, u'dream': 2, u'sh

Counter({u'nbsp': 12, u'BR': 2, u'>': 2, u'boy': 1, u'house': 1, u'gun': 1, u'alley': 1, u'man': 1, u'shortcut': 1, u'dream': 1, u'day': 1})


Toby: A friendly party animal (33 dreams) [M]
  noun-only sample dream: 
Counter({u'bathroom': 3, u'door': 3, u'people': 2, u'hallway': 2, u'room': 2, u'right': 1, u'Arcade': 1, u'wall': 1, u'hoop': 1, u'one': 1, u'ball': 1, u'something': 1, u'fake': 1, u'table': 1, u'girl': 1, u'trip': 1, u'bunch': 1, u'apartment': 1, u'end': 1, u'stair': 1, u'machine': 1, u'lot': 1, u'Super': 1, u'head': 1, u'noise': 1, u'poster': 1, u'stop': 1, u'game': 1, u'basketball': 1, u'front': 1, u'walk': 1, u'kitchen': 1, u'building': 1, u'towel': 1, u'restaurant': 1, u'sex': 1, u'Boardwalk': 1, u'stuff': 1, u'time': 1, u'chick': 1, u'guy': 1, u'feeling': 1})


Tom: An outgoing man (27 dreams) [M]
  noun-only sample dream: 
Counter({u'girl': 3, u'boy': 3, u'boyfriend': 2, u'classroom': 2, u'room': 2, u'atmosphere': 1, u'people': 1, u'hair': 1, u'year': 1, u'toy': 1, u

# Create Dataframe

In [6]:
# Flatten the parsed XML into one DataFrame row per dream report.
DREAM_COLUMNS = ["ID", "W266ID", "DreamBankID", "DreamNumber", "Name", "Sex", "Dream"]

# Collect plain dict records and build the frame once at the end.
# Calling DataFrame.append inside the loop copies the whole frame for every
# single dream, which is quadratic in the number of dreams.
records = []

for dreamer in doc['dreambank']['collection']:
    Name = dreamer['name']
    DreamBankID = dreamer['id']
    W266ID = int(dreamer['w266ID'])  # grouping ID stored on the record by an earlier cell
    Sex = dreamer['sex']

    for odict in dreamer['dream']:
        # Entries without a 'report' field carry no dream text and are skipped.
        if 'report' not in odict:
            continue

        records.append({
            "ID": len(records) + 1,  # running 1-based dream counter
            "W266ID": W266ID,
            "DreamBankID": DreamBankID,
            # Read per dream, so a missing number can no longer silently
            # inherit the number of the previous dream.
            "DreamNumber": convert(odict.get('number', '')),
            "Name": Name,
            "Sex": Sex,
            "Dream": convert(odict['report']),
        })

df = pd.DataFrame(records, columns=DREAM_COLUMNS)

ID = len(df)
print("Total Dreams: " + str(ID))
print("\n")

df.head()

Total Dreams: 26000




Unnamed: 0,ID,W266ID,DreamBankID,DreamNumber,Name,Sex,Dream
0,1,1,alta,1,Alta: a detailed dreamer,F,"The one at the Meads's house, where it's bigge..."
1,2,1,alta,2,Alta: a detailed dreamer,F,I'm at a family reunion in a large fine house ...
2,3,1,alta,3,Alta: a detailed dreamer,F,I watch a plane fly past and shortly realize i...
3,4,1,alta,4,Alta: a detailed dreamer,F,Me pulling the green leaves and berries off so...
4,5,1,alta,5,Alta: a detailed dreamer,F,I'm in a room that reminds me of (but definite...


In [7]:
# Number of dreams recorded for each value of the 'Sex' column.
sex_counts = {sex: (df['Sex'] == sex).sum() for sex in ('M', 'F')}

print('Males: ' + str(sex_counts['M']))
print('Females: ' + str(sex_counts['F']))
print('Total: ' + str(sex_counts['M'] + sex_counts['F']))

Males: 7813
Females: 18187
Total: 26000


# Modeling

We want to create a different model for each dreamer (i.e. a classifier identifying one vs all-others for each dreamer). This will allow us to identify the most predictive words in identifying each dreamer.

In [8]:
# Shuffle the rows once. The NumPy seed is fixed first so the order (and with
# it the positional train/dev/test split) is the same on every run.
np.random.seed(0)
df = df.sample(frac=1).reset_index(drop=True)

# add 41 flag columns denoting dreamers (one col per dreamer),
# named 'Dreamer_<W266ID>'
dreamer_flag = pd.get_dummies(df['W266ID'], prefix='Dreamer')
df = pd.concat([df, dreamer_flag], axis=1)

# Vocabulary = every distinct NLTK token that occurs in any dream text.
dreams_flat = df['Dream'].tolist()
dreams_list = " ".join(dreams_flat)
vocab = list(set(nltk.tokenize.word_tokenize(dreams_list)))

def split_data(df, W266ID, train=0.6):
    """Split dream texts and one dreamer's indicator labels into train/dev/test.

    The split is positional, so ``df`` is expected to be shuffled already.
    The first ``int(len(df) * train)`` rows form the training set; the
    remaining rows are divided between dev and test, with test receiving the
    extra row when the remainder is odd.

    Parameters
    ----------
    df : DataFrame with a 'Dream' column and a 'Dreamer_<W266ID>' column.
    W266ID : ID of the dreamer whose indicator column supplies the labels.
    train : fraction of rows used for training (default 0.6).

    Returns
    -------
    (train_data, train_labels, dev_data, dev_labels, test_data, test_labels)
    """
    # column for "our" dreamer
    dreamer_label = 'Dreamer_' + str(W266ID)
    dreams, labels = df['Dream'], df[dreamer_label]

    num_rows = len(df)
    num_train = int(num_rows * train)
    # Everything that is not training data is held out. Deriving this by
    # subtraction (rather than int(len * (1 - train))) guarantees that float
    # truncation can never leave a row out of all three splits.
    num_holdout = num_rows - num_train
    # Explicit non-negative positions: the former negative slices broke for an
    # empty hold-out, where `[-0:]` selected the *entire* frame as test data.
    test_start = num_rows - (num_holdout + 1) // 2

    train_data, train_labels = dreams.iloc[:num_train], labels.iloc[:num_train]
    dev_data, dev_labels = dreams.iloc[num_train:test_start], labels.iloc[num_train:test_start]
    test_data, test_labels = dreams.iloc[test_start:], labels.iloc[test_start:]

    return train_data, train_labels, dev_data, dev_labels, test_data, test_labels

In [9]:
# Dreamer 1 (Alta) vs. everyone else: sweep the regularisation strength C for
# bag-of-words and TF-IDF logistic regression and report the best dev F1.
train_data, train_labels, dev_data, dev_labels, test_data, test_labels = split_data(df, W266ID=1)

# Bag-of-words counts over the fixed, corpus-wide vocabulary
vec = CountVectorizer(vocabulary=vocab)
vec_bow_train_data = vec.fit_transform(train_data)
vec_bow_dev_data = vec.transform(dev_data)

# TF-IDF weights with English stop words removed
vec_tfidf = TfidfVectorizer(stop_words='english')
vec_tfidf_train_data = vec_tfidf.fit_transform(train_data)
vec_tfidf_dev_data = vec_tfidf.transform(dev_data)

best_lr_score = 0

for c in (0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 500):
    # One model per representation, both with the same C
    bow_log = LogisticRegression(C=c).fit(vec_bow_train_data, train_labels)
    tfidf_logit_model = LogisticRegression(C=c).fit(vec_tfidf_train_data, train_labels)

    f1_bow_lr_score = metrics.f1_score(
        dev_labels, bow_log.predict(vec_bow_dev_data), average='macro')
    f1_tfidf_lr_score = metrics.f1_score(
        dev_labels, tfidf_logit_model.predict(vec_tfidf_dev_data), average='macro')

    print('Logistic Reg:\t C=%3.4f\t BOW: F1-score=%3.3f\t TFIDF: F1-score=%3.3f' % (c, f1_bow_lr_score, f1_tfidf_lr_score))

    # BOW is checked before TFIDF and only a strictly higher score replaces
    # the current best, so the earliest winner is kept on ties.
    for candidate_name, candidate_score in (('BOW', f1_bow_lr_score), ('TFIDF', f1_tfidf_lr_score)):
        if candidate_score > best_lr_score:
            best_lr_score, best_C, vectorizer = candidate_score, c, candidate_name

print('')
print('Best model:\t C=%3.4f\t vectorizer = %s\t F1-score=%3.3f' % (best_C, vectorizer, best_lr_score))

  'precision', 'predicted', average, warn_for)


Logistic Reg:	 C=0.0001	 BOW: F1-score=0.496	 TFIDF: F1-score=0.496
Logistic Reg:	 C=0.0010	 BOW: F1-score=0.778	 TFIDF: F1-score=0.496
Logistic Reg:	 C=0.0100	 BOW: F1-score=0.878	 TFIDF: F1-score=0.496
Logistic Reg:	 C=0.1000	 BOW: F1-score=0.916	 TFIDF: F1-score=0.496
Logistic Reg:	 C=1.0000	 BOW: F1-score=0.933	 TFIDF: F1-score=0.712
Logistic Reg:	 C=10.0000	 BOW: F1-score=0.930	 TFIDF: F1-score=0.872
Logistic Reg:	 C=100.0000	 BOW: F1-score=0.934	 TFIDF: F1-score=0.912
Logistic Reg:	 C=500.0000	 BOW: F1-score=0.931	 TFIDF: F1-score=0.909

Best model:	 C=100.0000	 vectorizer = BOW	 F1-score=0.934


In [10]:
# Dreamer 2 (Angie) vs. everyone else: sweep the regularisation strength C for
# bag-of-words and TF-IDF logistic regression and report the best dev F1.
train_data, train_labels, dev_data, dev_labels, test_data, test_labels = split_data(df, W266ID=2)

# Bag-of-words counts over the fixed, corpus-wide vocabulary
vec = CountVectorizer(vocabulary=vocab)
vec_bow_train_data = vec.fit_transform(train_data)
vec_bow_dev_data = vec.transform(dev_data)

# TF-IDF weights with English stop words removed
vec_tfidf = TfidfVectorizer(stop_words='english')
vec_tfidf_train_data = vec_tfidf.fit_transform(train_data)
vec_tfidf_dev_data = vec_tfidf.transform(dev_data)

best_lr_score = 0

for c in (0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 500):
    # One model per representation, both with the same C
    bow_log = LogisticRegression(C=c).fit(vec_bow_train_data, train_labels)
    tfidf_logit_model = LogisticRegression(C=c).fit(vec_tfidf_train_data, train_labels)

    f1_bow_lr_score = metrics.f1_score(
        dev_labels, bow_log.predict(vec_bow_dev_data), average='macro')
    f1_tfidf_lr_score = metrics.f1_score(
        dev_labels, tfidf_logit_model.predict(vec_tfidf_dev_data), average='macro')

    print('Logistic Reg:\t C=%3.4f\t BOW: F1-score=%3.3f\t TFIDF: F1-score=%3.3f' % (c, f1_bow_lr_score, f1_tfidf_lr_score))

    # BOW is checked before TFIDF and only a strictly higher score replaces
    # the current best, so the earliest winner is kept on ties.
    for candidate_name, candidate_score in (('BOW', f1_bow_lr_score), ('TFIDF', f1_tfidf_lr_score)):
        if candidate_score > best_lr_score:
            best_lr_score, best_C, vectorizer = candidate_score, c, candidate_name

print('')
print('Best model:\t C=%3.4f\t vectorizer = %s\t F1-score=%3.3f' % (best_C, vectorizer, best_lr_score))

Logistic Reg:	 C=0.0001	 BOW: F1-score=0.500	 TFIDF: F1-score=0.500
Logistic Reg:	 C=0.0010	 BOW: F1-score=0.500	 TFIDF: F1-score=0.500
Logistic Reg:	 C=0.0100	 BOW: F1-score=0.500	 TFIDF: F1-score=0.500
Logistic Reg:	 C=0.1000	 BOW: F1-score=0.500	 TFIDF: F1-score=0.500
Logistic Reg:	 C=1.0000	 BOW: F1-score=0.500	 TFIDF: F1-score=0.500
Logistic Reg:	 C=10.0000	 BOW: F1-score=0.590	 TFIDF: F1-score=0.500
Logistic Reg:	 C=100.0000	 BOW: F1-score=0.590	 TFIDF: F1-score=0.500
Logistic Reg:	 C=500.0000	 BOW: F1-score=0.590	 TFIDF: F1-score=0.500

Best model:	 C=10.0000	 vectorizer = BOW	 F1-score=0.590


In [11]:
# Dreamer 3 (Arlie) vs. everyone else: sweep the regularisation strength C for
# bag-of-words and TF-IDF logistic regression and report the best dev F1.
train_data, train_labels, dev_data, dev_labels, test_data, test_labels = split_data(df, W266ID=3)

# Bag-of-words counts over the fixed, corpus-wide vocabulary
vec = CountVectorizer(vocabulary=vocab)
vec_bow_train_data = vec.fit_transform(train_data)
vec_bow_dev_data = vec.transform(dev_data)

# TF-IDF weights with English stop words removed
vec_tfidf = TfidfVectorizer(stop_words='english')
vec_tfidf_train_data = vec_tfidf.fit_transform(train_data)
vec_tfidf_dev_data = vec_tfidf.transform(dev_data)

best_lr_score = 0

for c in (0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 500):
    # One model per representation, both with the same C
    bow_log = LogisticRegression(C=c).fit(vec_bow_train_data, train_labels)
    tfidf_logit_model = LogisticRegression(C=c).fit(vec_tfidf_train_data, train_labels)

    f1_bow_lr_score = metrics.f1_score(
        dev_labels, bow_log.predict(vec_bow_dev_data), average='macro')
    f1_tfidf_lr_score = metrics.f1_score(
        dev_labels, tfidf_logit_model.predict(vec_tfidf_dev_data), average='macro')

    print('Logistic Reg:\t C=%3.4f\t BOW: F1-score=%3.3f\t TFIDF: F1-score=%3.3f' % (c, f1_bow_lr_score, f1_tfidf_lr_score))

    # BOW is checked before TFIDF and only a strictly higher score replaces
    # the current best, so the earliest winner is kept on ties.
    for candidate_name, candidate_score in (('BOW', f1_bow_lr_score), ('TFIDF', f1_tfidf_lr_score)):
        if candidate_score > best_lr_score:
            best_lr_score, best_C, vectorizer = candidate_score, c, candidate_name

print('')
print('Best model:\t C=%3.4f\t vectorizer = %s\t F1-score=%3.3f' % (best_C, vectorizer, best_lr_score))

Logistic Reg:	 C=0.0001	 BOW: F1-score=0.498	 TFIDF: F1-score=0.498
Logistic Reg:	 C=0.0010	 BOW: F1-score=0.498	 TFIDF: F1-score=0.498
Logistic Reg:	 C=0.0100	 BOW: F1-score=0.527	 TFIDF: F1-score=0.498
Logistic Reg:	 C=0.1000	 BOW: F1-score=0.672	 TFIDF: F1-score=0.498
Logistic Reg:	 C=1.0000	 BOW: F1-score=0.765	 TFIDF: F1-score=0.498
Logistic Reg:	 C=10.0000	 BOW: F1-score=0.773	 TFIDF: F1-score=0.580
Logistic Reg:	 C=100.0000	 BOW: F1-score=0.780	 TFIDF: F1-score=0.689
Logistic Reg:	 C=500.0000	 BOW: F1-score=0.780	 TFIDF: F1-score=0.708

Best model:	 C=100.0000	 vectorizer = BOW	 F1-score=0.780


In [12]:
# # split data for dreamer1 - Alta
# train_data, train_labels, dev_data, dev_labels, test_data, test_labels = split_data(df, W266ID=1)
    
# # Create a Bag-of-Words Vectorizer
# vec = CountVectorizer(vocabulary=vocab)
# vec_train_data = vec.fit_transform(train_data)
# vec_dev_data = vec.transform(dev_data)  

# log = LogisticRegression(C = 100)
# log.fit(vec_train_data, train_labels)

# test_df = pd.DataFrame('Labels': [dev_labels], 'Prediction': log.predict(vec_dev_data), 'Correct_Pred': dev_labels==log.predict(vec_dev_data))
# data = {'Labels': dev_labels, 'Prediction': log.predict(vec_dev_data), 'Correct_Pred': dev_labels==log.predict(vec_dev_data)}
# test_df = pd.DataFrame(data)
# test_df.head(30)

# print confusion_matrix(dev_labels, log.predict(vec_dev_data))

## Run a model for each dreamer

We need to run a separate model for each dreamer. The models will predict if a dream comes from that dreamer or from "all-others". Since bag-of-words was working best in the above models, we will continue to use that for our baseline.

In [13]:
# Run Logistic Regression for each dreamer (one "this dreamer vs. all others"
# model per W266ID), keeping the model, its dev F1 and its top-5 words.

models = {}

# split_data returns the same 'Dream' rows whichever dreamer is requested
# (only the label column differs), so the bag-of-words matrices are built
# once here instead of being re-fitted identically on every iteration.
train_data, train_labels, dev_data, dev_labels, test_data, test_labels = split_data(df, W266ID=0)

# Create a Bag-of-Words Vectorizer
vec = CountVectorizer(vocabulary=vocab)
vec_train_data = vec.fit_transform(train_data)
vec_dev_data = vec.transform(dev_data)

# Column index -> word lookup, fetched once rather than once per feature.
feature_names = vec.get_feature_names()

for i in range(0,41):
    # split data (only the labels change from one dreamer to the next)
    train_data, train_labels, dev_data, dev_labels, test_data, test_labels = split_data(df, W266ID=i)

    ## Logistic reg
    log = LogisticRegression(C = 100)
    # Fit on the matrix built in this cell. The previous code fitted on
    # `vec_bow_train_data`, a variable left behind by an earlier tuning cell,
    # so this cell only worked if that cell had been run first.
    log.fit(vec_train_data, train_labels)

    # score model
    f1_score = metrics.f1_score(dev_labels, log.predict(vec_dev_data), average='macro')

    # find the most-predictive features: the five largest coefficients,
    # in ascending order of weight
    ### we're not using the weights currently, but might be useful/interesting later ###
    best_feature_positions = np.argsort(log.coef_[0])[-5:]
    best_feature_weights = log.coef_[0][best_feature_positions]

    # get word labels for our features
    words = [feature_names[ft] for ft in best_feature_positions]

    models[i] = (log, f1_score, words)

## Most Predictive Words

Now that we have a separate model for each dreamer, we can pull out the most predictive words from each model. This shows, for a given dreamer, which words are most predictive of their dreams as opposed to someone else's dream.

In [14]:
# One line per dreamer model: its W266ID and its most predictive words.
for dreamer_id, model_entry in models.iteritems():
    predictive_words = model_entry[2]  # entries are (model, score, words)
    print('W266ID=%s \tMost Predictive %s' % (dreamer_id, predictive_words))

W266ID=0 	Most Predictive ['border', 'pregnant', 'characters', 'setting', 'dreamed']
W266ID=1 	Most Predictive ['rather', 'somewhere', 'somebody', 'here', 're']
W266ID=2 	Most Predictive ['children', 'quarter', 'campus', 'meet', 'pancakes']
W266ID=3 	Most Predictive ['hometown', 'poison', 'picture', 'husband', 'model']
W266ID=4 	Most Predictive ['helpful', 'nightmares', 'nightmare', 'neat', 'scary']
W266ID=5 	Most Predictive ['stands', 'leading', 'red', 'ghost', 'world']
W266ID=6 	Most Predictive ['street', 'bunk', 'note', 'stranger', 'reach']
W266ID=7 	Most Predictive ['surprise', 'standing', 'real', 'burglar', 'few']
W266ID=8 	Most Predictive ['looked', 'toilet', 'red', 'bathroom', 'period']
W266ID=9 	Most Predictive ['definitely', 'stack', 'abstract', 'buick', 'insured']
W266ID=10 	Most Predictive ['mixed', 'preaching', 'teaching', 'miss', 'hats']
W266ID=11 	Most Predictive ['dream', 'mood', 'both', 'clearly', 'looks']
W266ID=12 	Most Predictive ['laid', 'medicine', 'chest', 'radio'

## Predict Each Dreamer

We can use our models to predict which person a given dream came from. If we take a dream from our test set and run all 41 models on it, each model gives us the probability that the dream came from that model's dreamer. We can then take the dreamer with the highest probability as our prediction for who the dream came from. 

In [15]:
# predict dreamer for each test dream
vec_test_data = vec.transform(test_data)

# Score the whole test set with each model in a single call. Column j holds
# the positive-class ("is this dreamer") probability from model model_ids[j].
model_ids = sorted(models)
positive_probs = np.column_stack([
    models[model_id][0].predict_proba(vec_test_data)[:, 1]
    for model_id in model_ids
])

# The prediction for a dream is the dreamer whose model is most confident.
# argmax always yields a choice, so no prediction can leak over from the
# previous dream (which the earlier running-maximum loop allowed when no
# probability exceeded 0).
preds = [model_ids[col] for col in positive_probs.argmax(axis=1)]

# Compare against the true IDs of exactly the rows that make up the test
# split, instead of a hard-coded row offset that is only right for one
# particular dataset size and split ratio.
true_ids = df['W266ID'].loc[test_labels.index]
print(np.mean(np.asarray(preds) == true_ids.values.astype(int)))

  np.exp(prob, prob)


0.814230769231


We got an 81% success rate on our test data.

In [16]:
# Lookup table of W266ID -> collection name, excluding the catch-all ID 0.
named_dreamers = df.loc[df["W266ID"] != 0, ['W266ID', 'Name']]
named_dreamers.sort_values(by=["W266ID"]).drop_duplicates()

Unnamed: 0,W266ID,Name
11171,1,Alta: a detailed dreamer
7842,2,Angie: age 18 & 20
15181,3,Arlie: a middle-aged woman
8222,4,Barb Sanders
8209,4,Barb Sanders #2
15894,5,Robert Bosnak: A dream analyst
24910,6,Chris: a transvestite
11756,7,Chuck: a physical scientist
1545,8,Dahlia: concerns with appearance
16737,9,David: teenage dreams


In [41]:
# Turn every dream into a list of lower-cased noun tokens.
# NOTE(review): the sample outputs in this notebook contain tokens such as
# 'nbsp', 'BR' and '/td', so HTML residue in the raw text appears to survive
# this step -- consider stripping markup first.
dreams = []
for dream_text in df['Dream']:
    # lowercase -> split into words -> tag with PoS
    tagged = nltk.pos_tag(nltk.tokenize.word_tokenize(dream_text.lower()))
    dreams.append([
        token
        for token, tag in tagged
        if tag.startswith('N')      # only keep nouns
        and not token.isdigit()     # drop pure numbers, keep words containing digits
        and len(token) > 2          # drop one- and two-character tokens
    ])

In [42]:
# Replace every token by its WordNet lemma.
lmtzr = WordNetLemmatizer()
dreams = [list(map(lmtzr.lemmatize, dream)) for dream in dreams]

In [43]:
# Build the gensim dictionary and bag-of-words corpus, dropping rare and
# very common tokens.

# Map every distinct token to an integer id.
dictionary = Dictionary(dreams)

# Drop tokens that occur in fewer than 10 dreams or in more than 60% of all dreams.
dictionary.filter_extremes(no_below=10, no_above=0.6)

# Bag-of-words representation: one list of (token_id, count) pairs per dream.
corpus = list(map(dictionary.doc2bow, dreams))

print('Number of unique tokens: %d' % len(dictionary))
print('Number of dreams: %d' % len(corpus))

Number of unique tokens: 5223
Number of dreams: 26000


__BELOW IS FROM: https://markroxor.github.io/gensim/static/notebooks/lda_training_tips.html__

__HELPS EXPLAIN THE PARAMETERS__

## Training

*We are ready to train the LDA model. We will first discuss how to set some of the training parameters.*

*First of all, the elephant in the room: how many topics do I need? There is really no easy answer for this, it will depend on both your data and your application. I have used 10 topics here because I wanted to have a few topics that I could interpret and "label", and because that turned out to give me reasonably good results. You might not need to interpret all your topics, so you could use a large number of topics, for example 100.*

*The `chunksize` controls how many documents are processed at a time in the training algorithm. Increasing chunksize will speed up training, at least as long as the chunk of documents easily fit into memory. I've set chunksize = 2000, which is more than the amount of documents, so I process all the data in one go. Chunksize can however influence the quality of the model, as discussed in Hoffman and co-authors [2], but the difference was not substantial in this case.*

*`passes` controls how often we train the model on the entire corpus. Another word for passes might be "epochs". iterations is somewhat technical, but essentially it controls how often we repeat a particular loop over each document. It is important to set the number of "passes" and "iterations" high enough.*

*I suggest the following way to choose iterations and passes. First, enable logging (as described in many Gensim tutorials), and set eval_every = 1 in LdaModel. When training the model look for a line in the log that looks something like this:*

    `2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations`

*If you set `passes` = 20 you will see this line 20 times. Make sure that by the final passes, most of the documents have converged. So you want to choose both passes and iterations to be high enough for this to happen.*

*We set `alpha = 'auto'` and `eta = 'auto'`. Again this is somewhat technical, but essentially we are automatically learning two parameters in the model that we usually would have to specify explicitly.*

In [44]:
# Train one LDA topic model on the noun-only corpus of all dreams.

# Set training parameters.
num_topics = 20
# Documents handled per training chunk; smaller than the 26,000-dream corpus,
# so the corpus is not processed in a single chunk.
chunksize = 20000
passes = 20  # full sweeps over the corpus ("epochs")
iterations = 400  # cap on the per-document inference loop
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index-to-word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

# alpha and eta are set to 'auto', i.e. learned during training instead of
# being fixed up front. %time reports how long the training takes.
# NOTE(review): no random_state is passed, so the topics presumably depend on
# NumPy's global random state at the time this cell runs -- confirm.
%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

CPU times: user 19min 1s, sys: 6.97 s, total: 19min 8s
Wall time: 19min 21s


In [47]:
# Each entry of top_topics is (list of (weight, word) pairs, coherence score).
top_topics = model.top_topics(corpus, topn=10)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
# The divisor is taken from the result itself rather than from the global
# `num_topics`, which another cell reassigns; re-running this cell afterwards
# would otherwise report a wrong average.
avg_topic_coherence = sum(coherence for _terms, coherence in top_topics) / len(top_topics)
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -2.3095.
[([(0.055064558910330255, u'/td'),
   (0.044130003196017981, u'question'),
   (0.042390347816809594, u'answer'),
   (0.038506375150453784, u'table'),
   (0.034336076076833741, u'dream'),
   (0.033645111069543078, u'top'),
   (0.031849047812491094, u'interpretation'),
   (0.02890225959639153, u'border='),
   (0.028902259595960093, u'cellspacing='),
   (0.028902259592280564, u'valign=')],
  -0.71875881098714067),
 ([(0.067367510439023809, u'guy'),
   (0.038444835895236978, u'mom'),
   (0.032135719964610718, u'something'),
   (0.030012490809668772, u'people'),
   (0.022977141306901747, u'ezra'),
   (0.022451870968813958, u'thing'),
   (0.019974012538505639, u'someone'),
   (0.01913776440962903, u'girl'),
   (0.018948881330287531, u'dad'),
   (0.016942702767371878, u'car')],
  -1.4988369256457525),
 ([(0.097240511904362059, u'friend'),
   (0.044891180628992756, u'dream'),
   (0.034302357564756518, u'/li'),
   (0.027407449431211553, u'house'),
   (0.0237698

In [48]:
# Display every topic as a weighted sum of its highest-weighted words.
# The alternative below would return the raw topic-word matrix instead
# (shape: num_topics x vocabulary_size):
# print model.get_topics()

model.print_topics()

[(0,
  u'0.133*"water" + 0.043*"boat" + 0.036*"pool" + 0.024*"beach" + 0.024*"river" + 0.022*"lake" + 0.018*"rock" + 0.018*"wave" + 0.013*"sand" + 0.013*"ocean"'),
 (1,
  u'0.093*"room" + 0.048*"door" + 0.031*"house" + 0.022*"floor" + 0.018*"thing" + 0.016*"wall" + 0.015*"people" + 0.014*"bathroom" + 0.014*"window" + 0.012*"bed"'),
 (2,
  u'0.097*"friend" + 0.045*"dream" + 0.034*"/li" + 0.027*"house" + 0.024*"feeling" + 0.022*"character" + 0.020*"thought" + 0.020*"setting" + 0.019*"girl" + 0.017*"people"'),
 (3,
  u'0.044*"something" + 0.027*"people" + 0.026*"thing" + 0.020*"time" + 0.018*"way" + 0.018*"sort" + 0.014*"building" + 0.014*"kind" + 0.014*"place" + 0.012*"office"'),
 (4,
  u'0.029*"woman" + 0.025*"hair" + 0.020*"man" + 0.018*"doctor" + 0.011*"time" + 0.011*"hand" + 0.010*"face" + 0.010*"eye" + 0.010*"paper" + 0.010*"glass"'),
 (5,
  u'0.041*"table" + 0.034*"food" + 0.016*"jeremy" + 0.016*"boyfriend" + 0.016*"mother" + 0.015*"room" + 0.015*"something" + 0.012*"dinner" + 0.01

In [25]:
# df.groupby('W266ID').agg(sum)

In [72]:
### Run Topic Modeling on each dreamer separately
import time

# dict to store topic models for each dreamer
topic_models = dict()

for dreamer_id in range(1,41):
    loop_start_time = time.time()
    
    dreams = list(df[df['W266ID'] == dreamer_id]['Dream'])

    # Split the documents into tokens.
    for idx in range(len(dreams)):
        dreams[idx] = dreams[idx].lower()  # Convert to lowercase.
        dreams[idx] = nltk.tokenize.word_tokenize(dreams[idx])  # Split into words.
        dreams[idx] = nltk.pos_tag(dreams[idx])  # tag with PoS
        dreams[idx] = [token for token, tag in dreams[idx] if tag.startswith('N')]   # only keep nouns 

    # Remove numbers, but not words that contain numbers.
    dreams = [[token for token in dream if not token.isdigit()] for dream in dreams]

    # Remove words that are only one or two characters.
    dreams = [[token for token in dream if len(token) > 2] for dream in dreams]

    # Lemmatize the dreams.
    lmtzr = WordNetLemmatizer()
    dreams = [[lmtzr.lemmatize(token) for token in dream] for dream in dreams]


    ## Remove rare and common tokens.

    # Create a dictionary representation of the dreams.
    dictionary = Dictionary(dreams)

    # Filter out words that occur less than 4 dreams
    dictionary.filter_extremes(no_below=4, no_above=0.6)

    # Vectorize data.
    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(dream) for dream in dreams]

    # Set training parameters.
    num_topics = 10
    chunksize = 3000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make a index to word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token

    mode_start_time = time.time()
    model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                           alpha='auto', eta='auto', iterations=iterations, num_topics=num_topics, \
                           passes=passes, eval_every=eval_every)
    print 'DreamerID', str(dreamer_id) + ': Total cell time=' + str(time.time() - loop_start_time), \
            '\t Model build time=' + str(time.time() - mode_start_time)
    
    topic_models[dreamer_id] = (model, corpus, dictionary)
    


DreamerID 1: Total cell time=49.4911928177 	 Model build time=26.4035279751
DreamerID 2: Total cell time=2.82385802269 	 Model build time=1.57753205299
DreamerID 3: Total cell time=9.20365190506 	 Model build time=6.6192779541
DreamerID 4: Total cell time=332.334140062 	 Model build time=219.808418989
DreamerID 5: Total cell time=2.07742094994 	 Model build time=1.08613610268
DreamerID 6: Total cell time=4.37294483185 	 Model build time=2.71170115471
DreamerID 7: Total cell time=3.43011808395 	 Model build time=2.01723599434
DreamerID 8: Total cell time=0.709416866302 	 Model build time=0.404046058655
DreamerID 9: Total cell time=9.62782883644 	 Model build time=5.53350019455
DreamerID 10: Total cell time=32.1868970394 	 Model build time=24.0199761391
DreamerID 11: Total cell time=7.18511199951 	 Model build time=4.537555933
DreamerID 12: Total cell time=0.837099790573 	 Model build time=0.347249984741
DreamerID 13: Total cell time=146.429958105 	 Model build time=86.1768481731
Dreamer

In [73]:
for k, v in topic_models.iteritems():
    print "\nDreamer", k
    pprint(v[0].top_topics(corpus = v[1], topn=5))


Dreamer 1
[([(0.030314259605608344, u'room'),
   (0.027825182783776773, u'people'),
   (0.025724310700485625, u'sort'),
   (0.023223748246904451, u'thing'),
   (0.02014365185281666, u'place')],
  -0.44810313087107534),
 ([(0.026548064191804294, u'thing'),
   (0.022126899716823609, u'sort'),
   (0.020924236782516613, u'people'),
   (0.019618624797874882, u'place'),
   (0.01799114868350114, u'way')],
  -0.49147990728090274),
 ([(0.0259315376647101, u'thing'),
   (0.021668772940416782, u'place'),
   (0.018146416611411493, u'kind'),
   (0.017857193374309861, u'people'),
   (0.017691400050331645, u'sort')],
  -0.49752345564668599),
 ([(0.022528246637740133, u'house'),
   (0.020176136767386818, u'room'),
   (0.01873917030394576, u'door'),
   (0.018502851601308896, u'thing'),
   (0.017710093286734143, u'sort')],
  -0.51324467244643079),
 ([(0.024083958181051694, u'people'),
   (0.021367733053956874, u'room'),
   (0.021230932335568976, u'house'),
   (0.020343281587138357, u'thing'),
   (0.018

   (0.083734412080212456, u'woman'),
   (0.064084333567878862, u'mother'),
   (0.055471353243350993, u'friend'),
   (0.048440555186666033, u'place')],
  -1.1687883414115725),
 ([(0.097004239315851143, u'woman'),
   (0.090030818239785829, u'man'),
   (0.071759991173400525, u'world'),
   (0.047850520137092623, u'thing'),
   (0.044000191719375477, u'people')],
  -1.1894150095165386),
 ([(0.19598143823476574, u'brother'),
   (0.083590792562492813, u'woman'),
   (0.066986964480289513, u'friend'),
   (0.056256002821618648, u'room'),
   (0.052592584957775926, u'john')],
  -1.2546058701274014),
 ([(0.099213066372244724, u'place'),
   (0.073252570349716717, u'time'),
   (0.05837893701721894, u'people'),
   (0.057417620821218415, u'something'),
   (0.05675918797536729, u'night')],
  -1.3078266914160948),
 ([(0.13357949065148028, u'room'),
   (0.096691040945745946, u'woman'),
   (0.058787277485066118, u'friend'),
   (0.054295507224167767, u'dutch'),
   (0.053287771513142165, u'mine')],
  -1.34026

   (0.020214427967707387, u'people'),
   (0.019056542070773504, u'way')],
  -3.5846262529806667),
 ([(0.038996236038781676, u'laura'),
   (0.032471173787788341, u'time'),
   (0.027565728249975684, u'car'),
   (0.019351680011837687, u'spider'),
   (0.018281243408530774, u'school')],
  -6.8489198840698862)]

Dreamer 10
[([(0.030607087385606926, u'man'),
   (0.027309529810133227, u'room'),
   (0.022609272287971584, u'woman'),
   (0.022411500514282444, u'door'),
   (0.020172046897974322, u'child')],
  -1.5525858636242775),
 ([(0.034277777437187962, u'room'),
   (0.026582729309485763, u'mother'),
   (0.023949154834386515, u'time'),
   (0.021610380642292912, u'child'),
   (0.021116723677641565, u'bed')],
  -1.9569295277162511),
 ([(0.029536291396297737, u'mother'),
   (0.024517475883540172, u'room'),
   (0.017091780818104606, u'seat'),
   (0.01373422411776074, u'side'),
   (0.013630639288151879, u'boy')],
  -2.0050945837742415),
 ([(0.028354105658976402, u'word'),
   (0.026694884833370906, u

[([(0.35935817036481232, u'nbsp'),
   (0.013512803659332122, u'andrew'),
   (0.01280552391738623, u'father'),
   (0.011573581111993553, u'church'),
   (0.011528652470430851, u'frank')],
  -1.1952456096975237),
 ([(0.08778713252756512, u'father'),
   (0.081407694926144483, u'andrew'),
   (0.032958155661064116, u'church'),
   (0.030785551313388223, u'marissa'),
   (0.017017103182830326, u'frank')],
  -1.4463461115880816),
 ([(0.027772451143392871, u'center'),
   (0.02773923309855739, u'home'),
   (0.025324408761246652, u'frank'),
   (0.025209596778892021, u'retreat'),
   (0.021022212318749151, u'people')],
  -1.5272785836077216),
 ([(0.026896247808963873, u'people'),
   (0.017497115070452811, u'book'),
   (0.017435669676166225, u'frank'),
   (0.016188782592077171, u'church'),
   (0.01510340117884443, u'house')],
  -1.7661468257215873),
 ([(0.040410485654533264, u'house'),
   (0.028147392739446837, u'friend'),
   (0.017136935189584816, u'room'),
   (0.016072623578698875, u'shift'),
   (0.

   (0.021448664449974144, u'friend'),
   (0.015842172505275583, u'basketball')],
  -1.4145101318937392),
 ([(0.023601675912633211, u'friend'),
   (0.022405722948047772, u'water'),
   (0.018426189126162312, u'fish'),
   (0.012318434745624073, u'line'),
   (0.01070200306721023, u'people')],
  -1.6780009899989339),
 ([(0.028556846588807101, u'plane'),
   (0.018985296115709413, u'man'),
   (0.016076824809205935, u'people'),
   (0.014746580924042442, u'friend'),
   (0.01338941502194968, u'card')],
  -1.7879562960765081),
 ([(0.023353990525137695, u'friend'),
   (0.021568468631715583, u'room'),
   (0.016396441314594976, u'redding'),
   (0.015229596983730269, u'door'),
   (0.014253539103725068, u'deer')],
  -2.6745889211588323)]

Dreamer 22
[([(0.041243877641839212, u'friend'),
   (0.029737402941971816, u'jeremy'),
   (0.025466456541439073, u'room'),
   (0.021984511783757862, u'mother'),
   (0.019812698646473267, u'something')],
  -0.97742061304899386),
 ([(0.024307686336177636, u'jeremy'),
 

   (0.029932498650055626, u'thing'),
   (0.024529563440424184, u'people'),
   (0.022421879051500539, u'place'),
   (0.020133672677825995, u'anything')],
  -0.70799288952638861),
 ([(0.027996524149067361, u'brother'),
   (0.027698440225647835, u'husband'),
   (0.027652316132139292, u'mother'),
   (0.027390314564535476, u'people'),
   (0.022696401485786583, u'place')],
  -0.71259026666771053),
 ([(0.032601576306987172, u'people'),
   (0.029788271269787257, u'thing'),
   (0.024965220862025291, u'place'),
   (0.023952082699730381, u'husband'),
   (0.022301376286370987, u'man')],
  -0.72034281404641143),
 ([(0.037132028839297518, u'thing'),
   (0.032120148774266034, u'people'),
   (0.019862132388542182, u'woman'),
   (0.017385165457395796, u'way'),
   (0.017122706814051116, u'husband')],
  -0.72782022551120007),
 ([(0.037368396750332718, u'people'),
   (0.031427047295425795, u'thing'),
   (0.022589962491962264, u'time'),
   (0.019675673732832948, u'room'),
   (0.019496801636235732, u'husban

   (0.0324465663652036, u'time'),
   (0.023330716166070395, u'mother'),
   (0.022414774652555487, u'building')],
  -2.0422738580608222),
 ([(0.084758291110928877, u'mother'),
   (0.058605618229433089, u'press'),
   (0.054534983144937273, u'printing'),
   (0.040922293912060696, u'plant'),
   (0.037093581753845695, u'man')],
  -2.1101456325383645),
 ([(0.085653908194770575, u'woman'),
   (0.054569619768699063, u'room'),
   (0.05212817444490811, u'house'),
   (0.045175749178302742, u'boy'),
   (0.038507774025898209, u'sister')],
  -2.2680745586437867),
 ([(0.078460273958819965, u'girl'),
   (0.055727839279913163, u'woman'),
   (0.03904413240612066, u'man'),
   (0.022237426544921078, u'boy'),
   (0.018926393802553003, u'book')],
  -2.9670123291792603),
 ([(0.13360464828524993, u'building'),
   (0.045115672109870449, u'bus'),
   (0.041172015112823153, u'street'),
   (0.025776252541289901, u'wall'),
   (0.025314410992350003, u'man')],
  -4.2556690144537104),
 ([(0.067479343606285755, u'bed')

   (0.051232557677648845, u'people')],
  -1.0795970475275658),
 ([(0.13510759821085433, u'car'),
   (0.088152276296956766, u'girl'),
   (0.054997093592668592, u'party'),
   (0.048144712206132804, u'house'),
   (0.038918753703136376, u'beach')],
  -1.080513712902369),
 ([(0.10737384810799074, u'everyone'),
   (0.082746970549603491, u'girl'),
   (0.063241881666971075, u'time'),
   (0.062185609192493795, u'hey'),
   (0.045426259621299758, u'front')],
  -1.1813269408092701),
 ([(0.084083609204474052, u'look'),
   (0.082486266668200844, u'something'),
   (0.067513504814540687, u'house'),
   (0.067392216948150035, u'guy'),
   (0.064913522326518064, u'head')],
  -1.2992676437121058),
 ([(0.14530516589892717, u'house'),
   (0.10521437952975933, u'people'),
   (0.065773651487774032, u'everyone'),
   (0.053631443312612583, u'guy'),
   (0.051024528913397559, u'sort')],
  -1.4308659062465687),
 ([(0.10839435259449562, u'mall'),
   (0.071633754012129777, u'girl'),
   (0.057410093441228265, u'table'