In [149]:
import numpy as np
import pandas as pd
from __future__ import division
from random import shuffle
import xmltodict
import json
import collections
from collections import Counter

import nltk
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import *
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix

from gensim.models import LdaModel
from gensim.corpora import Dictionary

# view all columns of pandas df
pd.set_option('display.max_columns', None)

# nltk.download('wordnet')      # download wordnet if it's not already downloaded

with open ("../dreambank-public.xml") as f:
    doc = xmltodict.parse(f.read())

def convert(data):
    if isinstance(data, basestring):
        return str(data)
    elif isinstance(data, collections.Mapping):
        return dict(map(convert, data.iteritems()))
    elif isinstance(data, collections.Iterable):
        return type(data)(map(convert, data))
    else:
        return data

def left(s, amount):
    return s[:amount]

def right(s, amount):
    return s[-amount:]

def mid(s, offset, amount):
    return s[offset:offset+amount]

# Exploratory Analysis

First we want to print the data to see which fields we are given and how the data looks.

In [150]:
for dreamer in doc['dreambank']['collection']:
    print dreamer['name'] + ' (' + str(len(dreamer['dream'][:])) + ' dreams)' 
    print '\tID:\t' + dreamer['id']
    print '\ttype:\t' + dreamer['type']
    print '\tsex:\t' + dreamer['sex']
    print '\tage:\t' + dreamer['age']
    
    try:
        print '\ttime:\t' + dreamer['time']
    except:
        pass
    
    print '\tsample dream: ' 
    
    odict = dreamer['dream'][0]
    for key, value in odict.items():
        if convert(key) == 'report':
            print '\treport: ' + left(convert(value), 200) + '...'
        else:
            print '\t' + convert(key) + ':' + str(convert(value))
        
    print '\n'

Alta: a detailed dreamer (422 dreams)
	ID:	alta
	type:	series
	sex:	F
	age:	A
	time:	1985-1997
	sample dream: 
	number:1
	date:1957
	report: The one at the Meads's house, where it's bigger inside than out; there's a European village just inside, with a cobblestone street and a Pied-Piper sort of man with curly hair, he can do things like j...


Angie: age 18 & 20 (48 dreams)
	ID:	angie
	type:	series
	sex:	F
	age:	Y
	time:	1996
	sample dream: 
	number:1-01
	date:1996-04-03
	report: My memory of this dream is vague. I think the setting is on a college campus. I'm in a cafe and two elderly ladies walk in and start talking to me about a university that a guy I am dating got into fo...


Arlie: a middle-aged woman (212 dreams)
	ID:	arlie
	type:	series
	sex:	F
	age:	A
	time:	1992-1998
	sample dream: 
	number:1
	date:10/14/92
	report: I am in an office in the town next to the town I grew up in. Everyone is taking a rest. I have to go to the bathroom, but there is no toilet so I use an empty c

In [151]:
print '---Dream collections from individuals---' + '\n'
MultIDs = ['b', 'madeline1-hs', 'madeline2-dorms', 'madeline3-offcampus', 'phil1', 'phil2', 'vietnam_vet']
NumberOfSeries = 1

for dreamer in doc['dreambank']['collection']:
    if dreamer['type'] == 'series':
        print '{' + dreamer['id']  + '} ' + dreamer['name'] + ' (' + str(len(dreamer['dream'][:])) + ' dreams)' + ' [' + dreamer['sex'] + ']'    
        dreamer['w266ID'] = NumberOfSeries
        
        # Assign a dreamer ID that groups the same dreamers together 
        # and skips the dream collections of multiple dreamers
        print 'w266 ID: ' + str(dreamer['w266ID'])
        if dreamer['id'] not in MultIDs:
            print '\n'
            NumberOfSeries += 1
    else:
        dreamer['w266ID'] = 0
        
print "Total Number of individuals to test vs. 'others': " + str(NumberOfSeries - 1)

---Dream collections from individuals---

{alta} Alta: a detailed dreamer (422 dreams) [F]
w266 ID: 1


{angie} Angie: age 18 & 20 (48 dreams) [F]
w266 ID: 2


{arlie} Arlie: a middle-aged woman (212 dreams) [F]
w266 ID: 3


{b} Barb Sanders (3116 dreams) [F]
w266 ID: 4
{b2} Barb Sanders #2 (1138 dreams) [F]
w266 ID: 4


{bosnak} Robert Bosnak: A dream analyst (53 dreams) [M]
w266 ID: 5


{chris} Chris: a transvestite (100 dreams) [M]
w266 ID: 6


{chuck} Chuck: a physical scientist (75 dreams) [M]
w266 ID: 7


{dahlia} Dahlia: concerns with appearance (24 dreams) [F]
w266 ID: 8


{david} David: teenage dreams (166 dreams) [M]
w266 ID: 9


{dorothea} Dorothea: 53 years of dreams (900 dreams) [F]
w266 ID: 10


{ed} Ed: dreams of his late wife (143 dreams) [M]
w266 ID: 11


{edna} Edna: a blind woman (19 dreams) [F]
w266 ID: 12


{elizabeth} Elizabeth: a woman in her 40s (1707 dreams) [F]
w266 ID: 13


{emma} Emma: 48 years of dreams (1521 dreams) [F]
w266 ID: 14


{emmas_husband} Emma's

In [152]:
DreamNum = 0

for dreamer in doc['dreambank']['collection']:
    for odict in dreamer['dream']:
        for key, value in odict.items():            
            if convert(key) == 'report':
                DreamNum += 1

print "Total Dreams: " + str(DreamNum)

Total Dreams: 26000


# EDA - Reduce To Noun-Only And Lemmatize

For our topic modeling, we will want to lemmatize the corpus and reduce to nouns-only. However, before we get to topic modeling, it will be helpful to test out the lemmatization and noun-only reduction techniques.

In [153]:
# reduce to noun-only and lemmatize

lmtzr = WordNetLemmatizer()

for dreamer in doc['dreambank']['collection']:
    print dreamer['name'] + ' (' + str(len(dreamer['dream'][:])) + ' dreams)' + ' [' + dreamer['sex'] + ']'
    
    print '  noun-only sample dream: ' 
    
    odict = dreamer['dream'][0]
    
    # (1) tokenize dream text
    # (2) tag with PoS
    # (3) reduce to noun-only
    # (4) lemmatize nouns
    # (5) get frequency counts of the lemmatized nouns
    for key, value in odict.items():
        if convert(key) == 'report':
            text = nltk.tokenize.word_tokenize(value)
            pos = nltk.pos_tag(text)
            noun_only = [w[0] for w in pos if w[1].startswith('N')]
            lmtz_noun_only = [lmtzr.lemmatize(word) for word in noun_only]
            counts = Counter(lmtz_noun_only)
            print counts
        
    print '\n'

Alta: a detailed dreamer (422 dreams) [F]
  noun-only sample dream: 
Counter({u'house': 3, u'hair': 2, u'[': 2, u'room': 2, u'bridge': 1, u'set': 1, u'creek': 1, u'people': 1, u'stove': 1, u'one': 1, u'street': 1, u'village': 1, u'corner': 1, u'blonde': 1, u'string': 1, u'pageboy': 1, u'balloon': 1, u'juggle': 1, u'couple': 1, u'sort': 1, u'hallway': 1, u'woman': 1, u'Meads': 1, u'Inside': 1, u'cobblestone': 1, u']': 1, u'man': 1, u'drive': 1, u'round': 1, u'thing': 1, u'aunt': 1, u'side': 1})


Angie: age 18 & 20 (48 dreams) [F]
  noun-only sample dream: 
Counter({u'dream': 3, u'school': 2, u'guy': 2, u'lady': 2, u'hospital': 1, u'information': 1, u'feeling': 1, u'art': 1, u'orientation': 1, u'university': 1, u'cafe': 1, u'setting': 1, u'college': 1, u'memory': 1, u'law': 1, u'campus': 1})


Arlie: a middle-aged woman (212 dreams) [F]
  noun-only sample dream: 
Counter({u'town': 2, u'toilet': 2, u'bathroom': 1, u'Everyone': 1, u'office': 1, u'rest': 1})


Barb Sanders (3116 dreams) [F

# Create Dataframe

In [154]:
ID = 0
W266ID = 0
DreamBankID = ''
DreamNumber = ''
Name = ''
Sex = ''
Dream = ''
#HasDream = 0
HasDream = False

df = pd.DataFrame(columns=["ID", "W266ID", "DreamBankID", "DreamNumber", "Name", "Sex", "Dream"])

for dreamer in doc['dreambank']['collection']:
    Name = dreamer['name'] 
    DreamBankID = dreamer['id']
    W266ID = dreamer['w266ID']
    Sex = dreamer['sex']
    
    for odict in dreamer['dream']:
        #HasDream = 0
        HasDream = False
        for key, value in odict.items():
            if convert(key) == 'report':
                Dream = convert(value)
                #HasDream = 1
                HasDream = True
            if convert(key) == 'number':
                DreamNumber = convert(value)
        
        #if HasDream == 1:
        if HasDream:
            ID += 1
            df = df.append({
                "ID": ID,
                "W266ID": W266ID,
                "DreamBankID": DreamBankID,
                "DreamNumber": DreamNumber,
                "Name": Name,
                "Sex": Sex,
                "Dream": Dream
                }, ignore_index=True)        
#     print '\n'

# print df
print "Total Dreams: " + str(ID)
print "\n"

print df.head()
print df.shape


KeyboardInterrupt: 

In [None]:
# Some basic information on the DataFrame
print df.head()
print df.describe()
print df.dtypes
print df.shape
print 'Males: ' + str(len(df[df['Sex']=='M']))
print 'Females: ' + str(len(df[df['Sex']=='F']))
print 'Total: ' + str(len(df[df['Sex']=='M']) + len(df[df['Sex']=='F']))

# Modeling

We create 2 different types of models: the first creates a model for each deamer. The second uses all dream data for a corpus, then uses LDA so that all dreams (documents) share the same set of topics but each dream exhibits topics differently, in their probablity of a topic.

## Model 1: Individual Dreamer Model

We want to create a different model for each dreamer (i.e. a classifier identifying one vs all-others for each dreamer). This will allow us to identify the most predictive words in identifying each dreamer.

In [156]:
# randomly shuffle dataframe
# set seed for consistency while running
np.random.seed(0)
df = df.sample(frac=1).reset_index(drop=True)
    
# add 41 flag columns denoting dreamers (one col per dreamer) 
dreamer_flag = pd.get_dummies(df['W266ID'], prefix='Dreamer')
df = pd.concat([df, dreamer_flag], axis=1)

# create vocab from all dreams
dreams_flat = df['Dream'].values.flatten().tolist()
dreams_list = " ".join(dreams_flat)
vocab = list(set(nltk.tokenize.word_tokenize(dreams_list)))

def split_data(df, W266ID, train=0.6):        
    # column for "our" dreamer
    dreamer_label = 'Dreamer_' + str(W266ID)

    # make 60/40 split of train/test
    # test will be evenly split between dev and test in the next step
    num_train = int(len(df) * train)
    num_test = int(len(df) * (1-train)) 
    print "KLR_DEBUG", df[:2]
    
    train_data, train_labels = df['Dream'][:num_train], df[dreamer_label][:num_train]
    dev_data, dev_labels = df['Dream'][-num_test : -num_test//2], df[dreamer_label][-num_test : -num_test//2] 
    test_data, test_labels = df['Dream'][-num_test//2:], df[dreamer_label][-num_test//2:]

    return train_data, train_labels, dev_data, dev_labels, test_data, test_labels

In [None]:
# split data for dreamer1 - Alta
train_data, train_labels, dev_data, dev_labels, test_data, test_labels = split_data(df, W266ID=1.0)
    
# Create a Bag-of-Words Vectorizer
vec = CountVectorizer(vocabulary=vocab)
vec_bow_train_data = vec.fit_transform(train_data)
vec_bow_dev_data = vec.transform(dev_data)  

# Create a Tfidf Vectorizer
vec_tfidf = TfidfVectorizer(stop_words='english')
vec_tfidf_train_data = vec_tfidf.fit_transform(train_data)
vec_tfidf_dev_data   = vec_tfidf.transform(dev_data)

best_lr_score = 0

## Logistic reg
for c in (0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 500):
    bow_log = LogisticRegression(C = c)
    bow_log.fit(vec_bow_train_data, train_labels)
    
    tfidf_logit_model = LogisticRegression(C=c)
    tfidf_logit_model.fit(vec_tfidf_train_data, train_labels)
    
    f1_bow_lr_score = metrics.f1_score(dev_labels, bow_log.predict(vec_bow_dev_data), average='macro')    
    f1_tfidf_lr_score = metrics.f1_score(dev_labels, tfidf_logit_model.predict(vec_tfidf_dev_data), average='macro') 

    print 'Logistic Reg:\t C=%3.4f\t BOW: F1-score=%3.3f\t TFIDF: F1-score=%3.3f' % (c, f1_bow_lr_score, f1_tfidf_lr_score)

    if f1_bow_lr_score > best_lr_score:
        best_lr_score = f1_bow_lr_score
        best_C = c 
        vectorizer = 'BOW'
        
    if f1_tfidf_lr_score > best_lr_score:
        best_lr_score = f1_tfidf_lr_score
        best_C = c
        vectorizer = 'TFIDF'

print ''
print 'Best model:\t C=%3.4f\t vectorizer = %s\t F1-score=%3.3f' % (best_C, vectorizer, best_lr_score)

In [157]:
# split data for dreamer2 - Angie
train_data, train_labels, dev_data, dev_labels, test_data, test_labels = split_data(df, W266ID=2.0)
    
# Create a Bag-of-Words Vectorizer
vec = CountVectorizer(vocabulary=vocab)
vec_bow_train_data = vec.fit_transform(train_data)
vec_bow_dev_data = vec.transform(dev_data)  

# Create a Tfidf Vectorizer
vec_tfidf = TfidfVectorizer(stop_words='english')
vec_tfidf_train_data = vec_tfidf.fit_transform(train_data)
vec_tfidf_dev_data   = vec_tfidf.transform(dev_data)

best_lr_score = 0

## Logistic reg
for c in (0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 500):
    bow_log = LogisticRegression(C = c)
    bow_log.fit(vec_bow_train_data, train_labels)
    
    tfidf_logit_model = LogisticRegression(C=c)
    tfidf_logit_model.fit(vec_tfidf_train_data, train_labels)
    
    f1_bow_lr_score = metrics.f1_score(dev_labels, bow_log.predict(vec_bow_dev_data), average='macro')    
    f1_tfidf_lr_score = metrics.f1_score(dev_labels, tfidf_logit_model.predict(vec_tfidf_dev_data), average='macro') 

    print 'Logistic Reg:\t C=%3.4f\t BOW: F1-score=%3.3f\t TFIDF: F1-score=%3.3f' % (c, f1_bow_lr_score, f1_tfidf_lr_score)

    if f1_bow_lr_score > best_lr_score:
        best_lr_score = f1_bow_lr_score
        best_C = c 
        vectorizer = 'BOW'
        
    if f1_tfidf_lr_score > best_lr_score:
        best_lr_score = f1_tfidf_lr_score
        best_C = c
        vectorizer = 'TFIDF'

print ''
print 'Best model:\t C=%3.4f\t vectorizer = %s\t F1-score=%3.3f' % (best_C, vectorizer, best_lr_score)

KLR_DEBUG         ID  W266ID  DreamBankID DreamNumber                       Name Sex  \
0  10736.0     0.0  hall_female        0441  College women, late 1940s   F   
1   9159.0    14.0         emma    1965-037   Emma: 48 years of dreams   F   

                                               Dream  Dreamer_0.0  \
0  In my dream someone tells me that my brother, ...            1   
1  Big banquet at Shirley Roland's - right outsid...            0   

   Dreamer_1.0  Dreamer_2.0  Dreamer_3.0  Dreamer_4.0  Dreamer_5.0  \
0            0            0            0            0            0   
1            0            0            0            0            0   

   Dreamer_6.0  Dreamer_7.0  Dreamer_8.0  Dreamer_9.0  Dreamer_10.0  \
0            0            0            0            0             0   
1            0            0            0            0             0   

   Dreamer_11.0  Dreamer_12.0  Dreamer_13.0  Dreamer_14.0  Dreamer_15.0  \
0             0             0             0    

In [160]:
#print df['Dreamer_2.0'][:10]
#df['Dream'][:10]
print df[df['DreamBankID']== "hall_female"]

            ID  W266ID  DreamBankID DreamNumber                       Name  \
0      10736.0     0.0  hall_female        0441  College women, late 1940s   
14     10517.0     0.0  hall_female        0222  College women, late 1940s   
45     10734.0     0.0  hall_female        0439  College women, late 1940s   
74     10854.0     0.0  hall_female        0559  College women, late 1940s   
78     10728.0     0.0  hall_female        0433  College women, late 1940s   
89     10398.0     0.0  hall_female        0103  College women, late 1940s   
94     10413.0     0.0  hall_female        0118  College women, late 1940s   
103    10562.0     0.0  hall_female        0267  College women, late 1940s   
114    10969.0     0.0  hall_female        0674  College women, late 1940s   
188    10477.0     0.0  hall_female        0182  College women, late 1940s   
207    10654.0     0.0  hall_female        0359  College women, late 1940s   
287    10865.0     0.0  hall_female        0570  College women, 

In [162]:
print type(train_data)
print "training data"
print train_data[0:47]
print train_labels[0:47]
print "df = angie"
print df[df['DreamBankID']== "angie"]
print df[df['W266ID']== 2.0]
print len(df[df['DreamBankID']== "angie"])

<class 'pandas.core.series.Series'>
training data
0     In my dream someone tells me that my brother, ...
1     Big banquet at Shirley Roland's - right outsid...
2     <I>The Orange Suit</I> <BR><BR> I arrive at Wa...
3     I was back at this place that was kind of, it ...
4     I am in a closet when I hear that my Father is...
5     I'm on stage with the same group from the last...
6     I'm at work, which is like a set of bleachers....
7     There was a holocaust happening on the 21/3/05...
8     ["Long time wooing."] I am some jungle primiti...
9     ["Nude notes."] I am talking to a well develop...
10    Lots of dreams, but I only remember snatches. ...
11    I was at a hotel place. I don't remember why b...
12    Mom, Ezra and I went to this restaurant place....
13    Come home by bicycle: I ride up 2 stairs to ki...
14    I was out in a large field--the grass was up t...
15    I was riding this thing I called a bike but it...
16    There was a pool near the road. Some dude had ..

In [None]:
print df[df['ID']==450.0][0:100]

In [None]:
# split data for dreamer3 - Arlie
train_data, train_labels, dev_data, dev_labels, test_data, test_labels = split_data(df, W266ID=3)
    
# Create a Bag-of-Words Vectorizer
vec = CountVectorizer(vocabulary=vocab)
vec_bow_train_data = vec.fit_transform(train_data)
vec_bow_dev_data = vec.transform(dev_data)  

# Create a Tfidf Vectorizer
vec_tfidf = TfidfVectorizer(stop_words='english')
vec_tfidf_train_data = vec_tfidf.fit_transform(train_data)
vec_tfidf_dev_data   = vec_tfidf.transform(dev_data)

best_lr_score = 0

## Logistic reg
for c in (0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 500):
    bow_log = LogisticRegression(C = c)
    bow_log.fit(vec_bow_train_data, train_labels)
    
    tfidf_logit_model = LogisticRegression(C=c)
    tfidf_logit_model.fit(vec_tfidf_train_data, train_labels)
    
    f1_bow_lr_score = metrics.f1_score(dev_labels, bow_log.predict(vec_bow_dev_data), average='macro')    
    f1_tfidf_lr_score = metrics.f1_score(dev_labels, tfidf_logit_model.predict(vec_tfidf_dev_data), average='macro') 

    print 'Logistic Reg:\t C=%3.4f\t BOW: F1-score=%3.3f\t TFIDF: F1-score=%3.3f' % (c, f1_bow_lr_score, f1_tfidf_lr_score)

    if f1_bow_lr_score > best_lr_score:
        best_lr_score = f1_bow_lr_score
        best_C = c 
        vectorizer = 'BOW'
        
    if f1_tfidf_lr_score > best_lr_score:
        best_lr_score = f1_tfidf_lr_score
        best_C = c
        vectorizer = 'TFIDF'

print ''
print 'Best model:\t C=%3.4f\t vectorizer = %s\t F1-score=%3.3f' % (best_C, vectorizer, best_lr_score)

In [None]:
# # split data for dreamer1 - Alta
# train_data, train_labels, dev_data, dev_labels, test_data, test_labels = split_data(df, W266ID=1)
    
# # Create a Bag-of-Words Vectorizer
# vec = CountVectorizer(vocabulary=vocab)
# vec_train_data = vec.fit_transform(train_data)
# vec_dev_data = vec.transform(dev_data)  

# log = LogisticRegression(C = 100)
# log.fit(vec_train_data, train_labels)

# test_df = pd.DataFrame('Labels': [dev_labels], 'Prediction': log.predict(vec_dev_data), 'Correct_Pred': dev_labels==log.predict(vec_dev_data))
# data = {'Labels': dev_labels, 'Prediction': log.predict(vec_dev_data), 'Correct_Pred': dev_labels==log.predict(vec_dev_data)}
# test_df = pd.DataFrame(data)
# test_df.head(30)

# print confusion_matrix(dev_labels, log.predict(vec_dev_data))

## Run a model for each dreamer

We need to run a separate model for each dreamer. The models will predict if a dream comes from that dreamer or from "all-others". Since bag-of-words was working best in the above models, we will continue to use that for our baseline.

In [None]:
# Run Logisitic Regression for each dreamer

models = {}

for i in range(0,41):
    # split data
    train_data, train_labels, dev_data, dev_labels, test_data, test_labels = split_data(df, W266ID=i)

    # Create a Bag-of-Words Vectorizer
    vec = CountVectorizer(vocabulary=vocab)
    vec_train_data = vec.fit_transform(train_data)
    vec_dev_data = vec.transform(dev_data)  

    ## Logistic reg
    log = LogisticRegression(C = 100)
    log.fit(vec_bow_train_data, train_labels)

    # score model
    f1_score = metrics.f1_score(dev_labels, log.predict(vec_dev_data), average='macro')
    
    # find the most-predictive features
    ### we're not using the weights currently, but might be useful/interesting later ###    
    best_feature_positions = log.coef_.argsort()[0][-5::]
    best_feature_weights = log.coef_[0][best_feature_positions.astype(int)]

    # get word labels for our features
    words = []
    for ft in best_feature_positions.astype(int):
        words.append(vec.get_feature_names()[ft])
    
    models[i] = (log, f1_score, words)

## Most Predictive Words

Now that we have a separate model for each dreamer, we can pull out the most predictive words from each model. This shows, for a given dreamer, which words are most predictive of their dreams as opposed to someone else's dream.

In [None]:
for key, (model, score, predictive_words) in models.iteritems():
    print 'W266ID='+str(key), '\tMost Predictive', predictive_words

## Predict Each Dreamer

We can use our models to predict which person a given dream came from. If we take a dream from our test set and run all 41 models on that dream, we will get probabilities of the dream coming from that person. We can then take the highest probability and make that our prediction for who the dream came from. 

In [None]:
# predict dreamer for each test dream
vec_test_data = vec.transform(test_data)

preds = []
# for dream in range(len(test_labels)):
for dream in range(len(test_labels)):
    highest_prob = 0
    
    # predicted probability of the correct label for each model
    for key, (model, score, w) in models.iteritems():
        prob_correct =  model.predict_proba(vec_test_data[dream])[0][1]
        if prob_correct > highest_prob:
            highest_prob = prob_correct
            pred = key
    
    preds.append(pred)
    
print sum(preds == df['W266ID'][20800:]) / len(preds)

We got an 82% success rate on our test data.

In [None]:
df[df["W266ID"]!=0][['W266ID', 'Name']].sort_values(by= ["W266ID"]).drop_duplicates()

In [None]:
dreams = list(df['Dream'])

# Split the documents into tokens.
for idx in range(len(dreams)):
    dreams[idx] = dreams[idx].lower()  # Convert to lowercase.
    dreams[idx] = nltk.tokenize.word_tokenize(dreams[idx])  # Split into words.
    dreams[idx] = nltk.pos_tag(dreams[idx])  # tag with PoS
    dreams[idx] = [token for token, tag in dreams[idx] if tag.startswith('N')]   # only keep nouns 
    
# Remove numbers, but not words that contain numbers.
dreams = [[token for token in dream if not token.isdigit()] for dream in dreams]

# Remove words that are only one or two characters.
dreams = [[token for token in dream if len(token) > 2] for dream in dreams]

In [None]:
# Lemmatize the dreams.
lmtzr = WordNetLemmatizer()
dreams = [[lmtzr.lemmatize(token) for token in dream] for dream in dreams]

In [None]:
# Remove rare and common tokens.

# Create a dictionary representation of the dreams.
dictionary = Dictionary(dreams)

# Filter out words that occur less than 10 dreams, or more than 60% of the dreams.
dictionary.filter_extremes(no_below=10, no_above=0.6)

# Vectorize data.

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(dream) for dream in dreams]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of dreams: %d' % len(corpus))

__BELOW IS FROM: https://markroxor.github.io/gensim/static/notebooks/lda_training_tips.html__

__HELPS EXPLAIN THE PARAMETERS__

## Training

*We are ready to train the LDA model. We will first discuss how to set some of the training parameters.*

*First of all, the elephant in the room: how many topics do I need? There is really no easy answer for this, it will depend on both your data and your application. I have used 10 topics here because I wanted to have a few topics that I could interpret and "label", and because that turned out to give me reasonably good results. You might not need to interpret all your topics, so you could use a large number of topics, for example 100.*

*The `chunksize` controls how many documents are processed at a time in the training algorithm. Increasing chunksize will speed up training, at least as long as the chunk of documents easily fit into memory. I've set chunksize = 2000, which is more than the amount of documents, so I process all the data in one go. Chunksize can however influence the quality of the model, as discussed in Hoffman and co-authors [2], but the difference was not substantial in this case.*

*`passes` controls how often we train the model on the entire corpus. Another word for passes might be "epochs". iterations is somewhat technical, but essentially it controls how often we repeat a particular loop over each document. It is important to set the number of "passes" and "iterations" high enough.*

*I suggest the following way to choose iterations and passes. First, enable logging (as described in many Gensim tutorials), and set eval_every = 1 in LdaModel. When training the model look for a line in the log that looks something like this: *

    `2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations`

*If you set `passes` = 20 you will see this line 20 times. Make sure that by the final passes, most of the documents have converged. So you want to choose both passes and iterations to be high enough for this to happen.*

*We set `alpha = 'auto'` and `eta = 'auto'`. Again this is somewhat technical, but essentially we are automatically learning two parameters in the model that we usually would have to specify explicitly. *

In [None]:
# Train LDA model.

# Set training parameters.
num_topics = 20
chunksize = 20000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

In [None]:
top_topics = model.top_topics(corpus, topn=10)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

In [None]:
# shape: num_topics x vocabulary_size
# print model.get_topics()

model.print_topics()

In [None]:
# df.groupby('W266ID').agg(sum)

In [None]:
### Run Topic Modeling on each dreamer separately
import time

# dict to store topic models for each dreamer
topic_models = dict()

for dreamer_id in range(1,41):
    loop_start_time = time.time()
    
    dreams = list(df[df['W266ID'] == dreamer_id]['Dream'])

    # Split the documents into tokens.
    for idx in range(len(dreams)):
        dreams[idx] = dreams[idx].lower()  # Convert to lowercase.
        dreams[idx] = nltk.tokenize.word_tokenize(dreams[idx])  # Split into words.
        dreams[idx] = nltk.pos_tag(dreams[idx])  # tag with PoS
        dreams[idx] = [token for token, tag in dreams[idx] if tag.startswith('N')]   # only keep nouns 

    # Remove numbers, but not words that contain numbers.
    dreams = [[token for token in dream if not token.isdigit()] for dream in dreams]

    # Remove words that are only one or two characters.
    dreams = [[token for token in dream if len(token) > 2] for dream in dreams]

    # Lemmatize the dreams.
    lmtzr = WordNetLemmatizer()
    dreams = [[lmtzr.lemmatize(token) for token in dream] for dream in dreams]


    ## Remove rare and common tokens.

    # Create a dictionary representation of the dreams.
    dictionary = Dictionary(dreams)

    # Filter out words that occur less than 4 dreams
    dictionary.filter_extremes(no_below=4, no_above=0.6)

    # Vectorize data.
    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(dream) for dream in dreams]

    # Set training parameters.
    num_topics = 10
    chunksize = 3000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make a index to word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token

    mode_start_time = time.time()
    model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                           alpha='auto', eta='auto', iterations=iterations, num_topics=num_topics, \
                           passes=passes, eval_every=eval_every)
    print 'DreamerID', str(dreamer_id) + ': Total cell time=' + str(time.time() - loop_start_time), \
            '\t Model build time=' + str(time.time() - mode_start_time)
    
    topic_models[dreamer_id] = (model, corpus, dictionary)
    


In [None]:
for k, v in topic_models.iteritems():
    print "\nDreamer", k
    pprint(v[0].top_topics(corpus = v[1], topn=5))

## Model 2: Full dreambank as the corpus
First attempt: not doing any normalization to deal with unbalanced dreamer data
